├── src
    ├── .gitignore
    ├── Makefile.am
    ├── tounicode.hh
    ├── regex_dispatcher_m.hh
    ├── pdf-handler.hh
    ├── build_table.py
    ├── main.cc
    ├── pdf-handler.cc
    ├── tounicode_table.cc
    ├── tounicode.cc
    └── cmdlineparse.hh
├── test
    ├── bfchar-expected.txt
    ├── bfchar-garbled.txt
    ├── bfrange-expected.txt
    ├── bfrange-garbled.txt
    ├── bfrange2-expected.txt
    ├── bfrange2-garbled.txt
    ├── bfchar.tex
    ├── test-pdftotext.sh
    ├── bfchar-qdfcontents.txt
    ├── bfrange-qdfcontents.txt
    ├── test-qdfcontents.sh
    ├── bfrange2-qdfcontents.txt
    ├── SourceHanSerif-Regular-UTF16
    ├── build_bfrange2.py
    ├── build_bfrange.py
    └── Makefile.am
├── autogen.sh
├── Makefile.am
├── .gitignore
├── .dir-locals.el
├── LICENSE
├── Readme.md
├── configure.ac
├── Readme.ja.md
└── m4
    └── ax_cxx_compile_stdcxx.m4


/src/.gitignore:
--------------------------------------------------------------------------------
1 | aj17-kanji.txt
2 | 


--------------------------------------------------------------------------------
/test/bfchar-expected.txt:
--------------------------------------------------------------------------------
1 | 初見はハ長調で高音の白玉があった
2 | 


--------------------------------------------------------------------------------
/test/bfchar-garbled.txt:
--------------------------------------------------------------------------------
1 | 初⾒はハ⻑調で⾼⾳の⽩⽟があった
2 | 


--------------------------------------------------------------------------------
/test/bfrange-expected.txt:
--------------------------------------------------------------------------------
1 | 初見はハ長調で高音の白玉があった
2 | 


--------------------------------------------------------------------------------
/test/bfrange-garbled.txt:
--------------------------------------------------------------------------------
1 | 初⾒はハ⻑調で⾼⾳の⽩⽟があった
2 | 


--------------------------------------------------------------------------------
/test/bfrange2-expected.txt:
--------------------------------------------------------------------------------
1 | 初見はハ長調で高音の白玉があった
2 | 


--------------------------------------------------------------------------------
/test/bfrange2-garbled.txt:
--------------------------------------------------------------------------------
1 | 初⾒はハ⻑調で⾼⾳の⽩⽟があった
2 | 


--------------------------------------------------------------------------------
/autogen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | autoreconf -v -i
4 | 


--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
1 | SUBDIRS = src test
2 | 
3 | dist_doc_DATA = Readme.md Readme.ja.md LICENSE
4 | EXTRA_DIST = autogen.sh
5 | 
6 | ACLOCAL_AMFLAGS = -I m4
7 | 


--------------------------------------------------------------------------------
/test/bfchar.tex:
--------------------------------------------------------------------------------
1 | % plain upTeX
2 | \special{dvipdfmx:config z 0}
3 | \special{pdf:mapline uprml-h unicode !SourceHanSerif-Regular.otf}
4 | \nopagenumbers
5 | 初見はハ長調で高音の白玉があった
6 | \bye
7 | 


--------------------------------------------------------------------------------
/test/test-pdftotext.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | TESTFILE="$1"
 4 | BASENAME=`basename ${TESTFILE} | sed -e "s/-testtext\.pdftotext$//"`
 5 | EXPECTED=${BASENAME}-expected.txt
 6 | 
 7 | if [ ! -e "${EXPECTED}" ]; then
 8 |     EXPECTED=${srcdir}/${BASENAME}-expected.txt
 9 | fi   
10 | 
11 | ${DIFF} -ubwBE ${EXPECTED} ${TESTFILE}
12 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *~
 2 | .\#*
 3 | \#*
 4 | \#.*
 5 | *.o
 6 | *.a
 7 | *.exe
 8 | pdf-fix-tuc
 9 | build/
10 | .deps/
11 | Makefile
12 | Makefile.in
13 | aclocal.m4
14 | autom4te.cache/
15 | config.h
16 | config.h.in
17 | config.log
18 | config.status
19 | configure
20 | depcomp
21 | install-sh
22 | missing
23 | stamp-h1
24 | test-driver
25 | 


--------------------------------------------------------------------------------
/test/bfchar-qdfcontents.txt:
--------------------------------------------------------------------------------
 1 | 16 beginbfchar
 2 | <05B5> <3042>
 3 | <05BF> <304C>
 4 | <05D2> <305F>
 5 | <05D6> <3063>
 6 | <05DA> <3067>
 7 | <05E1> <306E>
 8 | <05E2> <306F>
 9 | <0640> <30CF>
10 | <2BB4> <521D>
11 | <691D> <7389>
12 | <6EAE> <767D>
13 | <9536> <898B>
14 | <97AA> <8ABF>
15 | <AA7D> <9577>
16 | <AF1E> <97F3>
17 | <B45A> <9AD8>
18 | endbfchar
19 | 


--------------------------------------------------------------------------------
/.dir-locals.el:
--------------------------------------------------------------------------------
 1 | ;;; Directory Local Variables
 2 | ;;; See Info node `(emacs) Directory Variables' for more information.
 3 | 
 4 | ((c++-mode .
 5 |   ((c-file-style . "gnu")
 6 |   (indent-tabs-mode . nil)))
 7 |  (autoconf-mode .
 8 |   ((indent-tabs-mode . nil)))
 9 |  (python-mode .
10 |   ((indent-tabs-mode . nil)))
11 |  (sh-mode .            ; Emacs' major mode for shell scripts is `sh-mode'
12 |   ((indent-tabs-mode . nil))))
13 | 


--------------------------------------------------------------------------------
/test/bfrange-qdfcontents.txt:
--------------------------------------------------------------------------------
 1 | 16 beginbfrange
 2 | <05B5> <05B5> <3042>
 3 | <05BF> <05BF> <304C>
 4 | <05D2> <05D2> <305F>
 5 | <05D6> <05D6> <3063>
 6 | <05DA> <05DA> <3067>
 7 | <05E1> <05E1> <306E>
 8 | <05E2> <05E2> <306F>
 9 | <0640> <0640> <30CF>
10 | <2BB4> <2BB4> <521D>
11 | <691D> <691D> <7389>
12 | <6EAE> <6EAE> <767D>
13 | <9536> <9536> <898B>
14 | <97AA> <97AA> <8ABF>
15 | <AA7D> <AA7D> <9577>
16 | <AF1E> <AF1E> <97F3>
17 | <B45A> <B45A> <9AD8>
18 | endbfrange
19 | 


--------------------------------------------------------------------------------
/src/Makefile.am:
--------------------------------------------------------------------------------
 1 | bin_PROGRAMS = pdf-fix-tuc
 2 | pdf_fix_tuc_SOURCES = main.cc \
 3 | 	pdf-handler.cc pdf-handler.hh \
 4 | 	tounicode.cc tounicode.hh tounicode_table.cc \
 5 | 	cmdlineparse.hh regex_dispatcher_m.hh
 6 | pdf_fix_tuc_LDADD = $(LIBQPDF_LIBS)
 7 | 
 8 | EXTRA_DIST = build_table.py
 9 | 
10 | if ENABLE_BUILD_TABLE
11 | tounicode_table.cc: aj17-kanji.txt
12 | 	$(srcdir)/build_table.py $< > $@
13 | endif
14 | 
15 | aj17-kanji.txt:
16 | 	$(WGET) https://github.com/adobe-type-tools/Adobe-Japan1/raw/master/aj17-kanji.txt
17 | 


--------------------------------------------------------------------------------
/test/test-qdfcontents.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Log compiler for the *.qdfcontents tests.
 4 | # Checks that every line of <basename>-qdfcontents.txt is found by ${GREP}
 5 | # in the file given as $1; exits 1 if any line is missing, 0 otherwise.
 6 | 
 7 | TESTFILE="$1"
 8 | BASENAME=`basename ${TESTFILE} | sed -e "s/-testqdf\.qdfcontents$//"`
 9 | EXPECTED=${BASENAME}-qdfcontents.txt
10 | 
11 | # Fall back to ${srcdir} when the expected file is not in the current directory.
12 | if [ ! -e "${EXPECTED}" ]; then
13 |     EXPECTED=${srcdir}/${BASENAME}-qdfcontents.txt
14 | fi
15 | 
16 | NOTFOUND=no
17 | 
18 | # check PATTERN
19 | #   Report whether PATTERN is found in ${TESTFILE}; set NOTFOUND=yes if not.
20 | #   Defined with the POSIX `name ()' form: the `function' keyword is a
21 | #   bash/ksh extension that a plain /bin/sh such as dash does not accept.
22 | check () {
23 |     if "${GREP}" -q "$1" "${TESTFILE}"; then
24 |         echo \"$1\" is found
25 |     else
26 |         echo \"$1\" is not found
27 |         NOTFOUND=yes
28 |     fi
29 | }
30 | 
31 | # Feed the loop by redirection instead of `cat ... | while'.  In most shells
32 | # every part of a pipeline runs in a subshell, so NOTFOUND=yes set inside the
33 | # loop would be lost and the test could never fail.
34 | while read -r line; do
35 |     check "${line}"
36 | done < "${EXPECTED}"
37 | 
38 | if [ "x${NOTFOUND}" = "xyes" ]; then
39 |     exit 1
40 | fi
41 | 
42 | exit 0
43 | 


--------------------------------------------------------------------------------
/test/bfrange2-qdfcontents.txt:
--------------------------------------------------------------------------------
 1 | 29 beginbfrange
 2 | <0001> <0002> <2E81>
 3 | <0003> <0003> <4E5A>
 4 | <0004> <0004> <6535>
 5 | <0005> <0005> <2E9A>
 6 | <0006> <0006> <65E1>
 7 | <0007> <0007> <4E80>
 8 | <0008> <0009> <2EF3>
 9 | <000A> <000A> <2EFF>
10 | <000B> <000B> <4E00>
11 | <000C> <000C> <4E28>
12 | <000D> <000D> <9F9C>
13 | <000E> <000E> <9FA0>
14 | <000F> <000F> <2FD6>
15 | <05B5> <05B5> <3042>
16 | <05BF> <05BF> <304C>
17 | <05D2> <05D2> <305F>
18 | <05D6> <05D6> <3063>
19 | <05DA> <05DA> <3067>
20 | <05E1> <05E1> <306E>
21 | <05E2> <05E2> <306F>
22 | <0640> <0640> <30CF>
23 | <2BB4> <2BB4> <521D>
24 | <691D> <691D> <7389>
25 | <6EAE> <6EAE> <767D>
26 | <9536> <9536> <898B>
27 | <97AA> <97AA> <8ABF>
28 | <AA7D> <AA7D> <9577>
29 | <AF1E> <AF1E> <97F3>
30 | <B45A> <B45A> <9AD8>
31 | endbfrange
32 | 


--------------------------------------------------------------------------------
/test/SourceHanSerif-Regular-UTF16:
--------------------------------------------------------------------------------
 1 | %!PS-Adobe-3.0 Resource-CMap
 2 | %
 3 | % *** Caution ***
 4 | % For test only.
 5 | % This file contains garbled Unicode codepoints.
 6 | % *** Caution ***
 7 | %
 8 | /CIDInit /ProcSet findresource begin
 9 | 12 dict begin
10 | begincmap
11 | /CMapName /SourceHanSerif-Regular-UTF16 def
12 | /CMapType 2 def
13 | /CIDSystemInfo <<
14 |   /Registry (Adobe)
15 |   /Ordering (UCS)
16 |   /Supplement 0
17 | >> def
18 | 1 begincodespacerange
19 | <0000> <FFFF>
20 | endcodespacerange
21 | 16 beginbfchar
22 | <05B5> <3042>
23 | <05BF> <304C>
24 | <05D2> <305F>
25 | <05D6> <3063>
26 | <05DA> <3067>
27 | <05E1> <306E>
28 | <05E2> <306F>
29 | <0640> <30CF>
30 | <2BB4> <521D>
31 | % <691D> <7389>
32 | <691D> <2F5F>
33 | % <6EAE> <767D>
34 | <6EAE> <2F69>
35 | % <9536> <898B>
36 | <9536> <2F92>
37 | <97AA> <8ABF>
38 | % <AA7D> <9577>
39 | <AA7D> <2ED1>
40 | % <AF1E> <97F3>
41 | <AF1E> <2FB3>
42 | % <B45A> <9AD8>
43 | <B45A> <2FBC>
44 | endbfchar
45 | endcmap
46 | CMapName currentdict /CMap defineresource pop
47 | end
48 | end
49 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 2-Clause License
 2 | 
 3 | Copyright (C) 2021 Masamichi Hosoda.
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 


--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
 1 | <!-- -*- coding: utf-8 -*- -->
 2 | # A tool to fix ToUnicode CMap in PDF to prevent extracted text from being garbled
 3 | 
 4 | [ English / [日本語 (Japanese)](Readme.ja.md) ]
 5 | 
 6 | When copying and pasting text from a PDF file, depending on the PDF, kanji characters such as "見" and "高" are often garbled into similar but different characters (e.g. special characters such as Kangxi Radical and CJK Radicals Supplement).
 7 | This tool fixes such PDFs, whose extracted text comes out garbled, and generates a PDF from which the text is extracted without being garbled.
 8 | 
 9 | ## Caution
10 | 
11 | It is not possible to reverse the conversion from the PDF generated by this tool to the original PDF (it is irreversible).
12 | It is also possible that the generated PDF may be corrupted without you noticing it.
13 | Be sure to keep a backup of the original PDF.
14 | 
15 | ## Usage
16 | 
17 | ```
18 | $ pdf-fix-tuc
19 | Fix ToUnicode CMap in PDF 0.0.0
20 | Copyright (C) 2021 Masamichi Hosoda. All rights reserved.
21 | License: BSD-2-Clause
22 | 
23 | https://github.com/trueroad/pdf-fix-tuc
24 | 
25 | Usage: pdf-fix-tuc [options] [INPUT.pdf OUTPUT.pdf] ...
26 | 
27 |   -h, --help
28 |     Print help and exit
29 |   -V, --version
30 |     Print version and exit
31 |   --verbose
32 |     Verbose
33 | 
34 | Output PDF settings (QPDF):
35 |   --linearize
36 |     Output linearized (web-optimized) PDF
37 |   --object-streams=[preserve|disable|generate]   (default=preserve)
38 |     Settings for object streams
39 |   --newline-before-endstream
40 |     Output newline before endstream
41 |   --qdf
42 |     Output QDF
43 | 
44 | $
45 | ```
46 | 
47 | ## Install
48 | 
49 | ### Required
50 | 
51 | * C++11 compiler (g++ 4.9+ etc.)
52 | * libqpdf
53 | * pkg-config etc.
54 | * Autoconf 2.69+
55 | * Automake
56 | 
57 | If you would like to install the required library from distribution packages, the following may be convenient.
58 | 
59 | * Debian / Ubuntu
60 |     + libqpdf-dev
61 | * Fedora
62 |     + qpdf-devel
63 | * Cygwin
64 |     + libqpdf-devel
65 | 
66 | ### Build & install
67 | 
68 | ```
69 | $ git clone https://github.com/trueroad/pdf-fix-tuc.git
70 | $ cd pdf-fix-tuc
71 | $ ./autogen.sh
72 | $ mkdir build
73 | $ cd build
74 | $ ../configure
75 | $ make
76 | $ make install
77 | ```
78 | 
79 | ## License
80 | 
81 | Copyright (C) 2021 Masamichi Hosoda. All rights reserved.
82 | 
83 | License: BSD-2-Clause
84 | 
85 | See [LICENSE](./LICENSE).
86 | 


--------------------------------------------------------------------------------
/test/build_bfrange2.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | #
 4 | # Fix ToUnicode CMap in PDF
 5 | # https://github.com/trueroad/pdf-fix-tuc
 6 | #
 7 | # build_bfrange2.py:
 8 | #   Build bfrange2 PDF.
 9 | #
10 | # Copyright (C) 2021 Masamichi Hosoda.
11 | # All rights reserved.
12 | #
13 | # Redistribution and use in source and binary forms, with or without
14 | # modification, are permitted provided that the following conditions
15 | # are met:
16 | #
17 | # * Redistributions of source code must retain the above copyright notice,
18 | #   this list of conditions and the following disclaimer.
19 | #
20 | # * Redistributions in binary form must reproduce the above copyright notice,
21 | #   this list of conditions and the following disclaimer in the documentation
22 | #   and/or other materials provided with the distribution.
23 | #
24 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 | # ARE DISCLAIMED.
28 | # IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
29 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 | # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 | # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 | # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 | # SUCH DAMAGE.
36 | #
37 | 
38 | import re
39 | import sys
40 | 
41 | def main ():
42 |     re_begin = re.compile (r"(\d+)\s+beginbfrange\r?\n?")
43 | 
44 |     for line in sys.stdin.buffer:
45 |         try:
46 |             line = line.decode ("utf-8")
47 |         except UnicodeDecodeError:
48 |             sys.stdout.buffer.write (line)
49 |             continue
50 | 
51 |         m = re_begin.match (line)
52 |         if m:
53 |             print ("{} beginbfrange".format (int (m.group (1)) + 5))
54 |             print ("""\
55 | <0001> <0003> <2E81>
56 | <0004> <0006> <2E99>
57 | <0007> <0009> <2EF2>
58 | <000A> <000C> <2EFF>
59 | <000D> <000F> <2FD4>\
60 | """)
61 |             continue
62 | 
63 |         print (line, end = "", flush = True)
64 | 
65 | if __name__ == "__main__":
66 |     main ()
67 | 


--------------------------------------------------------------------------------
/test/build_bfrange.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | #
 4 | # Fix ToUnicode CMap in PDF
 5 | # https://github.com/trueroad/pdf-fix-tuc
 6 | #
 7 | # build_bfrange.py:
 8 | #   Build bfrange PDF.
 9 | #
10 | # Copyright (C) 2021 Masamichi Hosoda.
11 | # All rights reserved.
12 | #
13 | # Redistribution and use in source and binary forms, with or without
14 | # modification, are permitted provided that the following conditions
15 | # are met:
16 | #
17 | # * Redistributions of source code must retain the above copyright notice,
18 | #   this list of conditions and the following disclaimer.
19 | #
20 | # * Redistributions in binary form must reproduce the above copyright notice,
21 | #   this list of conditions and the following disclaimer in the documentation
22 | #   and/or other materials provided with the distribution.
23 | #
24 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 | # ARE DISCLAIMED.
28 | # IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
29 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 | # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 | # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 | # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 | # SUCH DAMAGE.
36 | #
37 | 
38 | import re
39 | import sys
40 | 
41 | def main ():
42 |     b_bfchar = False
43 |     re_begin = re.compile (r"(\d+)\s+beginbfchar\r?\n?")
44 |     re_bfchar = re.compile (r"<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\r?\n?")
45 |     re_end = re.compile (r"endbfchar\r?\n?")
46 | 
47 |     for line in sys.stdin.buffer:
48 |         try:
49 |             line = line.decode ("utf-8")
50 |         except UnicodeDecodeError:
51 |             sys.stdout.buffer.write (line)
52 |             continue
53 | 
54 |         if b_bfchar:
55 |             if re_end.match (line):
56 |                 line = "endbfrange\n"
57 |                 b_bfchar = False
58 |             else:
59 |                 m = re_bfchar.match (line)
60 |                 if m:
61 |                     line = "<{}> <{}> <{}>\n".\
62 |                         format (m.group (1), m.group (1), m.group (2))
63 |         else:
64 |             m = re_begin.match (line)
65 |             if m:
66 |                 line = "{} beginbfrange\n".format (m.group (1))
67 |                 b_bfchar = True
68 | 
69 |         print (line, end = "", flush = True)
70 | 
71 | if __name__ == "__main__":
72 |     main ()
73 | 


--------------------------------------------------------------------------------
/test/Makefile.am:
--------------------------------------------------------------------------------
 1 | TESTS = $(TESTPDFTOTEXT) $(TESTQDFCONTENTS)
 2 | 
 3 | TEST_EXTENSIONS = .pdftotext .qdfcontents
 4 | 
 5 | PDFTOTEXT_LOG_COMPILER = $(srcdir)/test-pdftotext.sh
 6 | QDFCONTENTS_LOG_COMPILER = $(srcdir)/test-qdfcontents.sh
 7 | 
 8 | TESTS_ENVIRONMENT = top_builddir=$(top_builddir) \
 9 | 	srcdir=$(srcdir) \
10 | 	DIFF=$(DIFF) \
11 | 	PDFTOTEXT=$(PDFTOTEXT) \
12 | 	GREP=$(GREP)
13 | 
14 | # Source files for test PDFs
15 | TESTTEXS = bfchar.tex
16 | # Test PDFs
17 | TESTPDFS = $(TESTTEXS:.tex=.pdf) bfrange.pdf bfrange2.pdf
18 | # Garbled text
19 | TESTGARBLED = $(TESTPDFS:.pdf=-garbled.txt)
20 | # Expected text
21 | TESTEXPECTED = $(TESTPDFS:.pdf=-expected.txt)
22 | # Expected QDF contents
23 | TESTCONTENTS = $(TESTPDFS:.pdf=-qdfcontents.txt)
24 | 
25 | # Tests by pdftotext
26 | TESTPDFTOTEXT = $(TESTPDFS:.pdf=-testtext.pdftotext)
27 | 
28 | # Tests by QDF contents
29 | TESTQDFCONTENTS = $(TESTPDFS:.pdf=-testqdf.qdfcontents)
30 | 
31 | # Test scripts and test PDFs
32 | EXTRA_DIST = test-pdftotext.sh test-qdfcontents.sh \
33 | 	$(TESTTEXS) $(TESTPDFS) \
34 | 	$(TESTGARBLED) $(TESTEXPECTED) $(TESTCONTENTS) \
35 | 	SourceHanSerif-Regular-UTF16 \
36 | 	build_bfrange.py build_bfrange2.py
37 | 
38 | # For `make clean'
39 | CLEANFILES = $(TESTTEXS:.tex=.log) \
40 | 	$(TESTPDFTOTEXT) \
41 | 	$(TESTQDFCONTENTS) \
42 | 	$(TESTPDFS:.pdf=-test.pdf)
43 | 
44 | if GENERATE_TEST_PDF
45 | # Generate garbled PDFs
46 | # Tarball contains them for convenience.
47 | %.dvi: %.tex
48 | 	$(UPTEX) -halt-on-error -interaction=nonstopmode -file-line-error $<
49 | 
50 | %.pdf: %.dvi %-garbled.txt SourceHanSerif-Regular-UTF16
51 | 	if [ ! -e SourceHanSerif-Regular-UTF16 ]; then \
52 | 		ln -s $(srcdir)/SourceHanSerif-Regular-UTF16 ; \
53 | 	fi
54 | 	$(DVIPDFMX) $< -o $@
55 | 	$(PDFTOTEXT) $@ $(@:.pdf=-extracted.txt)
56 | 	$(DIFF) -ubwBE $(word 2,$^) $(@:.pdf=-extracted.txt)
57 | 
58 | # Generate bfrange PDF
59 | # Tarball contains them for convenience.
60 | bfchar.qdf: bfchar.pdf
61 | 	$(QPDF) --qdf $< $@
62 | 
63 | bfrange.qdf: bfchar.qdf
64 | 	$(srcdir)/build_bfrange.py < $< > $@
65 | 
66 | bfrange.pdf: bfrange.qdf
67 | 	$(FIX_QDF) < $< > $@
68 | 
69 | # Generate bfrange2 PDF
70 | # Tarball contains them for convenience.
71 | bfrange2.qdf: bfrange.qdf
72 | 	$(srcdir)/build_bfrange2.py < $< > $@
73 | 
74 | bfrange2.pdf: bfrange2.qdf
75 | 	$(FIX_QDF) < $< > $@
76 | endif
77 | 
78 | # Test PDFs
79 | %-test.pdf: %.pdf
80 | 	${top_builddir}/src/pdf-fix-tuc $< $@
81 | 
82 | # Test pdftotext
83 | %-testtext.pdftotext: %-test.pdf %-expected.txt
84 | 	$(PDFTOTEXT) $< $@
85 | 
86 | # Test QDFs
87 | %-test.qdf: %.pdf
88 | 	${top_builddir}/src/pdf-fix-tuc --qdf $< $@
89 | 
90 | # Test qdfcontents (-f: replace an existing link when the rule is re-run)
91 | %-testqdf.qdfcontents: %-test.qdf %-qdfcontents.txt
92 | 	ln -sf $< $@
93 | 
94 | .PRECIOUS: %.dvi %.pdf %-test.pdf %-testtext.pdftotext \
95 | 		%-test.qdf %-testqdf.qdfcontents
96 | 


--------------------------------------------------------------------------------
/src/tounicode.hh:
--------------------------------------------------------------------------------
 1 | //
 2 | // Fix ToUnicode CMap in PDF
 3 | // https://github.com/trueroad/pdf-fix-tuc
 4 | //
 5 | // tounicode.hh: ToUnicode CMap handler class
 6 | //
 7 | // Copyright (C) 2021 Masamichi Hosoda.
 8 | // All rights reserved.
 9 | //
10 | // Redistribution and use in source and binary forms, with or without
11 | // modification, are permitted provided that the following conditions
12 | // are met:
13 | //
14 | // * Redistributions of source code must retain the above copyright notice,
15 | //   this list of conditions and the following disclaimer.
16 | //
17 | // * Redistributions in binary form must reproduce the above copyright notice,
18 | //   this list of conditions and the following disclaimer in the documentation
19 | //   and/or other materials provided with the distribution.
20 | //
21 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 | // ARE DISCLAIMED.
25 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
26 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 | // SUCH DAMAGE.
33 | //
34 | 
35 | #ifndef INCLUDE_GUARD_TOUNICODE_HH
36 | #define INCLUDE_GUARD_TOUNICODE_HH
37 | 
38 | #include <regex>
39 | #include <sstream>
40 | #include <string>
41 | #include <unordered_map>
42 | 
43 | #include "regex_dispatcher_m.hh"
44 | 
45 | // ToUnicode CMap handler.  The bool-returning private members have the
46 | // callback signature that regex_dispatcher::member_table dispatches to
47 | // (bool (Derived::*) (const std::smatch &)).
48 | class tounicode: public regex_dispatcher::member_table<tounicode>
49 | {
50 | public:
51 |   tounicode ();
52 | 
53 |   std::string get (void);
54 | 
55 | private:
56 |   bool bfchar_hex (const std::smatch &);
57 |   void bfrange_output (const int, const int, const int,
58 |                        const std::string &, const std::string &,
59 |                        const std::string &);
60 |   bool bfrange_hex (const std::smatch &);
61 |   bool beginbfchar (const std::smatch &);
62 |   bool endbfchar (const std::smatch &);
63 |   bool beginbfrange (const std::smatch &);
64 |   bool endbfrange (const std::smatch &);
65 |   bool other (const std::smatch &);
66 | 
67 |   std::stringstream ss_;
68 |   std::stringstream ss_bfrange_;
69 |   // Initialized in-class, like the flags below, so that the counter never
70 |   // holds an indeterminate value.
71 |   int bfranges_ = 0;
72 |   std::string beginbfrange_space_;
73 |   std::string beginbfrange_cr_;
74 |   bool is_bfchar_ = false;
75 |   bool is_bfrange_ = false;
76 | 
77 |   const static std::unordered_map<int, int> table_;
78 | };
79 | 
80 | #endif // INCLUDE_GUARD_TOUNICODE_HH
81 | 


--------------------------------------------------------------------------------
/src/regex_dispatcher_m.hh:
--------------------------------------------------------------------------------
 1 | //
 2 | // Regex dispatcher table for C++11
 3 | // https://gist.github.com/trueroad/bdb81622c083ee544698c4bc55cda6a9
 4 | //
 5 | // regex_dispatcher_m.hh: Regex dispatcher table class by member function
 6 | //
 7 | // Copyright (C) 2018 Masamichi Hosoda.
 8 | // All rights reserved.
 9 | //
10 | // Redistribution and use in source and binary forms, with or without
11 | // modification, are permitted provided that the following conditions
12 | // are met:
13 | //
14 | // * Redistributions of source code must retain the above copyright notice,
15 | //   this list of conditions and the following disclaimer.
16 | //
17 | // * Redistributions in binary form must reproduce the above copyright notice,
18 | //   this list of conditions and the following disclaimer in the documentation
19 | //   and/or other materials provided with the distribution.
20 | //
21 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 | // ARE DISCLAIMED.
25 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
26 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 | // SUCH DAMAGE.
33 | //
34 | 
35 | #ifndef INCLUDE_GUARD_REGEX_DISPATCHER_M_HH_
36 | #define INCLUDE_GUARD_REGEX_DISPATCHER_M_HH_
37 | 
38 | #include <istream>
39 | #include <regex>
40 | #include <string>
41 | #include <utility>
42 | #include <vector>
43 | 
44 | namespace regex_dispatcher
45 | {
46 |   // CRTP base class: dispatches a line to member functions of Derived
47 |   // according to an ordered table of (regex, callback) pairs.
48 |   template <class Derived>
49 |   class member_table
50 |   {
51 |     using callback_type = bool (Derived::*) (const std::smatch &);
52 |     using table_type =
53 |       std::vector<std::pair<std::regex, callback_type>>;
54 | 
55 |   public:
56 |     // Takes the table by value and moves it into place, so the
57 |     // std::regex objects are not copied a second time.
58 |     member_table (table_type t):
59 |       table_ (std::move (t))
60 |     {
61 |     }
62 | 
63 |     // Tries the entries in table order.  When a regex matches the whole
64 |     // of S, its callback is called with the match; a true result stops
65 |     // the search, a false result lets the later entries be tried.
66 |     void process_line (const std::string &s)
67 |     {
68 |       std::smatch sm;
69 |       for (const auto &t: table_)
70 |         {
71 |           if (std::regex_match (s, sm, t.first))
72 |             {
73 |               if ((static_cast<Derived*> (this)->*(t.second)) (sm))
74 |                 break;
75 |             }
76 |         }
77 |     }
78 | 
79 |     // Calls process_line for every line that std::getline reads from IS.
80 |     void process_stream (std::istream &is)
81 |     {
82 |       std::string line;
83 |       while (std::getline (is, line))
84 |         {
85 |           process_line (line);
86 |         }
87 |     }
88 | 
89 |   private:
90 |     table_type table_;
91 |   };
92 | }
93 | 
94 | #endif // INCLUDE_GUARD_REGEX_DISPATCHER_M_HH_
95 | 


--------------------------------------------------------------------------------
/src/pdf-handler.hh:
--------------------------------------------------------------------------------
  1 | //
  2 | // Fix ToUnicode CMap in PDF
  3 | // https://github.com/trueroad/pdf-fix-tuc
  4 | //
  5 | // pdf-handler.hh: PDF handler class
  6 | //
  7 | // Copyright (C) 2021 Masamichi Hosoda.
  8 | // All rights reserved.
  9 | //
 10 | // Redistribution and use in source and binary forms, with or without
 11 | // modification, are permitted provided that the following conditions
 12 | // are met:
 13 | //
 14 | // * Redistributions of source code must retain the above copyright notice,
 15 | //   this list of conditions and the following disclaimer.
 16 | //
 17 | // * Redistributions in binary form must reproduce the above copyright notice,
 18 | //   this list of conditions and the following disclaimer in the documentation
 19 | //   and/or other materials provided with the distribution.
 20 | //
 21 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 22 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 23 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 24 | // ARE DISCLAIMED.
 25 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 26 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 27 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 28 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 29 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 30 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 31 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 32 | // SUCH DAMAGE.
 33 | //
 34 | 
 35 | #ifndef INCLUDE_GUARD_PDF_HANDLER_HH
 36 | #define INCLUDE_GUARD_PDF_HANDLER_HH
 37 | 
 38 | #include <string>
 39 | 
 40 | #include <qpdf/QPDF.hh>
 41 | #include <qpdf/QPDFObjectHandle.hh>
 42 | 
 43 | // PDF handler: keeps the input/output file names, the QPDF object and the
 44 | // output options stored by the setters, and declares load/process/save.
 45 | class pdf_handler
 46 | {
 47 | public:
 48 |   pdf_handler () = default;
 49 | 
 50 |   // Setters: each one only stores its argument in the matching member.
 51 |   void set_verbose (bool enable)
 52 |   {
 53 |     verbose_ = enable;
 54 |   }
 55 |   void set_filename_in (const std::string &path)
 56 |   {
 57 |     filename_in_ = path;
 58 |   }
 59 |   void set_filename_out (const std::string &path)
 60 |   {
 61 |     filename_out_ = path;
 62 |   }
 63 |   void set_linearize (bool enable)
 64 |   {
 65 |     linearize_ = enable;
 66 |   }
 67 |   void set_object_streams (qpdf_object_stream_e mode)
 68 |   {
 69 |     object_streams_ = mode;
 70 |   }
 71 |   void set_newline_before_endstream (bool enable)
 72 |   {
 73 |     newline_before_endstream_ = enable;
 74 |   }
 75 |   void set_qdf (bool enable)
 76 |   {
 77 |     qdf_ = enable;
 78 |   }
 79 | 
 80 |   // Declared here; the definitions are not part of this header.
 81 |   void load ();
 82 |   void process ();
 83 |   void save ();
 84 | 
 85 | private:
 86 |   void process_obj (QPDFObjectHandle oh);
 87 |   void process_font (QPDFObjectHandle oh);
 88 |   void process_tounicode (QPDFObjectHandle oh);
 89 | 
 90 |   std::string filename_in_;
 91 |   std::string filename_out_;
 92 | 
 93 |   QPDF qpdf_;
 94 | 
 95 |   bool verbose_ = false;
 96 | 
 97 |   // Output options; every flag is off and object streams are preserved
 98 |   // until a setter changes them.
 99 |   bool linearize_ = false;
100 |   qpdf_object_stream_e object_streams_ = qpdf_o_preserve;
101 |   bool newline_before_endstream_ = false;
102 |   bool qdf_ = false;
103 | };
104 | 
105 | #endif // INCLUDE_GUARD_PDF_HANDLER_HH
106 | 


--------------------------------------------------------------------------------
/src/build_table.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | #
  4 | # Fix ToUnicode CMap in PDF
  5 | # https://github.com/trueroad/pdf-fix-tuc
  6 | #
  7 | # build_table.py:
  8 | #   Build radical to ideograph table.
  9 | #
 10 | # Copyright (C) 2021 Masamichi Hosoda.
 11 | # All rights reserved.
 12 | #
 13 | # Redistribution and use in source and binary forms, with or without
 14 | # modification, are permitted provided that the following conditions
 15 | # are met:
 16 | #
 17 | # * Redistributions of source code must retain the above copyright notice,
 18 | #   this list of conditions and the following disclaimer.
 19 | #
 20 | # * Redistributions in binary form must reproduce the above copyright notice,
 21 | #   this list of conditions and the following disclaimer in the documentation
 22 | #   and/or other materials provided with the distribution.
 23 | #
 24 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 25 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 26 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 27 | # ARE DISCLAIMED.
 28 | # IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 29 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 30 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 31 | # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 32 | # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 33 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 34 | # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 35 | # SUCH DAMAGE.
 36 | #
 37 | 
 38 | import sys
 39 | 
 40 | # See tt_cmap.c in dvipdfmx source.
 41 | def is_PUA_or_presentation (unicode):
 42 |     uni = int (unicode.lstrip ('U+'), base = 16)
 43 | 
 44 |     if uni >= 0x2E80 and uni <= 0x2EF3:
 45 |         return True
 46 |     if uni >= 0x2F00 and uni <= 0x2FD5:
 47 |         return True
 48 |     if uni >= 0xE000 and uni <= 0xF8FF:
 49 |         return True
 50 |     if uni >= 0xFB00 and uni <= 0xFB4F:
 51 |         return True
 52 |     if uni >= 0xF900 and uni <= 0xFAFF:
 53 |         return True
 54 |     if uni >= 0x2F800 and uni <= 0x2FA1F:
 55 |         return True
 56 |     if uni >= 0xF0000 and uni <= 0xFFFFD:
 57 |         return True
 58 |     if uni >= 0x100000 and uni <= 0x10FFFD:
 59 |         return True
 60 |     if uni == 0x00AD:
 61 |         return True
 62 |     return False
 63 | 
 64 | def process (file):
 65 |     with open (file, "r") as f:
 66 |         for line in f:
 67 |             if line.startswith ("#"):
 68 |                 continue
 69 |             items = line.split ()
 70 |             unijis2004 = items[18]
 71 |             if ':' in unijis2004:
 72 |                 unis = unijis2004.split (':')
 73 |                 lows = []
 74 |                 highs = []
 75 |                 for u in unis:
 76 |                     if is_PUA_or_presentation (u):
 77 |                         lows.append (u)
 78 |                     else:
 79 |                         highs.append (u)
 80 |                 if len (lows) > 0 and len (highs) > 0:
 81 |                     for low in lows:
 82 |                         key = int (low.lstrip ('U+'), base = 16)
 83 |                         value = int (highs[0].lstrip ('U+'), base = 16)
 84 |                         if key < 0x10000 and value >= 0x10000:
 85 |                             print ("    // { " + \
 86 |                                    "0x{:x}, 0x{:x}".format (key, value) + \
 87 |                                    " },")
 88 |                         else:
 89 |                             print ("    { " + \
 90 |                                    "0x{:x}, 0x{:x}".format (key, value) + \
 91 |                                    " },")
 92 | 
 93 | def main ():
 94 |     input_filename = "aj17-kanji.txt"
 95 | 
 96 |     if len (sys.argv) > 1:
 97 |         input_filename = sys.argv[1]
 98 | 
 99 |     print ('''\
100 | // Generated by build_table.py
101 | // Do not edit this file.
102 | 
103 | #include <unordered_map>
104 | 
105 | #include "tounicode.hh"
106 | 
107 | const std::unordered_map<int, int> tounicode::table_ =
108 |   {\
109 | ''')
110 | 
111 |     process (input_filename)
112 | 
113 |     print ('''\
114 |   };\
115 | ''')
116 | 
117 | if __name__ == "__main__":
118 |     main ()
119 | 


--------------------------------------------------------------------------------
/src/main.cc:
--------------------------------------------------------------------------------
  1 | //
  2 | // Fix ToUnicode CMap in PDF
  3 | // https://github.com/trueroad/pdf-fix-tuc
  4 | //
  5 | // Copyright (C) 2021 Masamichi Hosoda.
  6 | // All rights reserved.
  7 | //
  8 | // Redistribution and use in source and binary forms, with or without
  9 | // modification, are permitted provided that the following conditions
 10 | // are met:
 11 | //
 12 | // * Redistributions of source code must retain the above copyright notice,
 13 | //   this list of conditions and the following disclaimer.
 14 | //
 15 | // * Redistributions in binary form must reproduce the above copyright notice,
 16 | //   this list of conditions and the following disclaimer in the documentation
 17 | //   and/or other materials provided with the distribution.
 18 | //
 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 20 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 21 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 22 | // ARE DISCLAIMED.
 23 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 24 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 25 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 26 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 27 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 28 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 29 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 30 | // SUCH DAMAGE.
 31 | //
 32 | 
 33 | #include "config.h"
 34 | 
 35 | #include <exception>
 36 | #include <iostream>
 37 | #include <string>
 38 | 
 39 | #include "cmdlineparse.hh"
 40 | #include "pdf-handler.hh"
 41 | 
 42 | bool parse_options (int argc, char *argv[], pdf_handler *pph)
 43 | {
 44 |   cmdlineparse::parser cmd;
 45 | 
 46 |   cmd.set_usage_unamed_opts ("INPUT.pdf OUTPUT.pdf");
 47 |   cmd.add_default ();
 48 | 
 49 |   bool verbose;
 50 |   cmd.add_flag (0, "verbose", &verbose, "    Verbose");
 51 | 
 52 |   bool linearize;
 53 |   std::string object_streams;
 54 |   bool newline_before_endstream;
 55 |   bool qdf;
 56 |   cmd.add_flag (0, "linearize", &linearize,
 57 |                 "    Output linearized (web-optimized) PDF",
 58 |                 "Output PDF settings (QPDF)");
 59 |   cmd.add_string (0, "object-streams", &object_streams, "preserve",
 60 |                   "    Settings for object streams",
 61 |                   "[preserve|disable|generate]",
 62 |                   "Output PDF settings (QPDF)");
 63 |   cmd.add_flag (0, "newline-before-endstream", &newline_before_endstream,
 64 |                 "    Output newline before endstream",
 65 |                 "Output PDF settings (QPDF)");
 66 |   cmd.add_flag (0, "qdf", &qdf,
 67 |                 "    Output QDF",
 68 |                 "Output PDF settings (QPDF)");
 69 | 
 70 |   if (!cmd.parse (argc, argv))
 71 |     return false;
 72 | 
 73 |   auto uargs {cmd.get_unamed_args ()};
 74 |   if (uargs.size () != 2)
 75 |     {
 76 |       std::cout << cmd.build_help ();
 77 |       return false;
 78 |     }
 79 | 
 80 |   std::cout << cmd.get_version_string () << std::endl;
 81 | 
 82 |   pph->set_verbose (verbose);
 83 | 
 84 |   pph->set_linearize (linearize);
 85 |   if (object_streams == "preserve")
 86 |     pph->set_object_streams (qpdf_o_preserve);
 87 |   else if (object_streams == "generate")
 88 |     pph->set_object_streams (qpdf_o_generate);
 89 |   else if (object_streams == "disable")
 90 |     pph->set_object_streams (qpdf_o_disable);
 91 |   else
 92 |     {
 93 |       std::cerr << "unknwon --object-streams mode" << std::endl;
 94 |       return false;
 95 |     }
 96 | 
 97 |   pph->set_newline_before_endstream (newline_before_endstream);
 98 |   pph->set_qdf (qdf);
 99 | 
100 |   pph->set_filename_in (uargs[0]);
101 |   pph->set_filename_out (uargs[1]);
102 | 
103 |   return true;
104 | }
105 | 
106 | int main (int argc, char *argv[])
107 | {
108 |   try
109 |     {
110 |       pdf_handler ph;
111 |       if (!parse_options (argc, argv, &ph))
112 |         return false;
113 | 
114 |       ph.load ();
115 |       ph.process ();
116 |       ph.save ();
117 | 
118 |       std::cout << "Done." << std::endl;
119 |     }
120 |   catch (std::exception &e)
121 |     {
122 |       std::cerr << e.what () << std::endl;
123 |       return 1;
124 |     }
125 | 
126 |   return 0;
127 | }
128 | 


--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
  1 | #                                               -*- Autoconf -*-
  2 | # Process this file with autoconf to produce a configure script.
  3 | 
  4 | AC_PREREQ([2.69])
  5 | AC_INIT([Fix ToUnicode CMap in PDF], [0.0.0], ,
  6 |     [pdf-fix-tuc], [https://github.com/trueroad/pdf-fix-tuc])
  7 | AM_INIT_AUTOMAKE([foreign])
  8 | AC_CONFIG_SRCDIR([src/main.cc])
  9 | AC_CONFIG_HEADERS([config.h])
 10 | 
 11 | PACKAGE_COPYRIGHT="Copyright (C) 2021 Masamichi Hosoda. All rights reserved."
 12 | PACKAGE_LICENSE="License: BSD-2-Clause"
 13 | 
 14 | AC_SUBST([PACKAGE_COPYRIGHT])
 15 | AC_SUBST([PACKAGE_LICENSE])
 16 | AC_DEFINE_UNQUOTED([PACKAGE_COPYRIGHT], ["$PACKAGE_COPYRIGHT"],
 17 |     [Define to the copyright of this package.])
 18 | AC_DEFINE_UNQUOTED([PACKAGE_LICENSE], ["$PACKAGE_LICENSE"],
 19 |     [Define to the license of this package.])
 20 | 
 21 | # Checks for programs.
 22 | AC_PROG_CXX
 23 | 
 24 | AC_PATH_PROG([DIFF], [diff])
 25 | AS_IF([test "x$DIFF" = x],
 26 |     [AC_MSG_WARN([diff isn't found. `make check' will fail tests.])])
 27 | AC_SUBST([DIFF])
 28 | 
 29 | AC_PATH_PROG([PDFTOTEXT], [pdftotext])
 30 | AS_IF([test "x$PDFTOTEXT" = x],
 31 |     [AC_MSG_WARN([pdftotext isn't found. `make check' will fail tests.])])
 32 | AC_SUBST([PDFTOTEXT])
 33 | 
 34 | AC_PATH_PROG([GREP], [grep])
 35 | AS_IF([test "x$GREP" = x],
 36 |     [AC_MSG_WARN([grep isn't found. `make check' will fail tests.])])
 37 | AC_SUBST([GREP])
 38 | 
 39 | AC_MSG_CHECKING([for pre-generated test PDFs])
 40 | AS_IF([test ! -e ${srcdir}/test/bfchar.pdf],
 41 |     [AC_MSG_RESULT([no])
 42 |      AC_PATH_PROG([UPTEX], [uptex])
 43 |      AS_IF([test "x$UPTEX" = x],
 44 |         [AC_MSG_WARN(
 45 |             [uptex isn't found. Test PDFs cannot be generated.])
 46 |          AC_MSG_WARN(
 47 |             [You cannot use `make check'.])])
 48 |      AC_PATH_PROG([DVIPDFMX], [dvipdfmx])
 49 |      AS_IF([test "x$DVIPDFMX" = x],
 50 |         [AC_MSG_WARN(
 51 |             [dvipdfmx isn't found. Test PDFs cannot be generated.])
 52 |          AC_MSG_WARN(
 53 |             [You cannot use `make check'.])])
 54 |      AC_PATH_PROG([QPDF], [qpdf])
 55 |      AS_IF([test "x$QPDF" = x],
 56 |         [AC_MSG_WARN(
 57 |             [qpdf isn't found. Test PDFs cannot be generated.])
 58 |          AC_MSG_WARN(
 59 |             [You cannot use `make check'.])])
 60 |      AC_PATH_PROG([FIX_QDF], [fix-qdf])
 61 |      AS_IF([test "x$FIX_QDF" = x],
 62 |         [AC_MSG_WARN(
 63 |             [fix-qdf isn't found. Test PDFs cannot be generated.])
 64 |          AC_MSG_WARN(
 65 |             [You cannot use `make check'.])])
 66 |      AC_PATH_PROG([PYTHON3], [python3])
 67 |      AS_IF([test "x$PYTHON3" = x],
 68 |         [AC_MSG_WARN(
 69 |             [python3 isn't found. Test PDFs cannot be generated.])
 70 |          AC_MSG_WARN(
 71 |             [You cannot use `make check'.])])],
 72 |     [AC_MSG_RESULT([yes])])
 73 | AM_CONDITIONAL([GENERATE_TEST_PDF],
 74 |     [test ! -e ${srcdir}/test/bfchar.pdf])
 75 | AC_SUBST([UPTEX])
 76 | AC_SUBST([DVIPDFMX])
 77 | AC_SUBST([QPDF])
 78 | AC_SUBST([FIX_QDF])
 79 | 
 80 | AC_ARG_ENABLE([build-table],
 81 |     [AS_HELP_STRING([--enable-build-table],
 82 |         [Enable building table (default is disable)])],
 83 |     [enable_build_table=yes
 84 |      AC_MSG_CHECKING([for build-table source aj17-kanji.txt])
 85 |      AS_IF([test ! -e ${srcdir}/src/aj17-kanji.txt],
 86 |         [AC_MSG_RESULT([no])
 87 |          AC_PATH_PROG([WGET], [wget])
 88 |          AS_IF([test "x$WGET" = x],
 89 |             [AC_MSG_ERROR(
 90 |                 [wget isn't found. aj17-kanji.txt cannot be retrieved.])],
 91 |             [AC_MSG_NOTICE(
 92 |                 [wget is found. aj17-kanji.txt will be retrieved.])])],
 93 |         [AC_MSG_RESULT([yes])])
 94 |      AC_PATH_PROG([PYTHON3], [python3])
 95 |      AS_IF([test "x$PYTHON3" = x],
 96 |         [AC_MSG_ERROR([python3 isn't found. You cannot enable build-table.])],
 97 |         [AC_MSG_NOTICE([python3 is found. The table will build.])])],
 98 |     [enable_build_table=no])
 99 | AM_CONDITIONAL([ENABLE_BUILD_TABLE], [test "x$enable_build_table" = xyes])
100 | AC_SUBST([WGET])
101 | 
102 | # Checks for libraries.
103 | PKG_PROG_PKG_CONFIG
104 | PKG_CHECK_MODULES([LIBQPDF], [libqpdf])
105 | 
106 | # Checks for header files.
107 | 
108 | # Checks for typedefs, structures, and compiler characteristics.
109 | AX_CXX_COMPILE_STDCXX(11, [noext], [mandatory])
110 | 
111 | # Checks for library functions.
112 | 
113 | AC_CONFIG_FILES([Makefile
114 |     src/Makefile
115 |     test/Makefile])
116 | AC_OUTPUT
117 | 


--------------------------------------------------------------------------------
/src/pdf-handler.cc:
--------------------------------------------------------------------------------
  1 | //
  2 | // Fix ToUnicode CMap in PDF
  3 | // https://github.com/trueroad/pdf-fix-tuc
  4 | //
  5 | // pdf-handler.cc: PDF handler class
  6 | //
  7 | // Copyright (C) 2021 Masamichi Hosoda.
  8 | // All rights reserved.
  9 | //
 10 | // Redistribution and use in source and binary forms, with or without
 11 | // modification, are permitted provided that the following conditions
 12 | // are met:
 13 | //
 14 | // * Redistributions of source code must retain the above copyright notice,
 15 | //   this list of conditions and the following disclaimer.
 16 | //
 17 | // * Redistributions in binary form must reproduce the above copyright notice,
 18 | //   this list of conditions and the following disclaimer in the documentation
 19 | //   and/or other materials provided with the distribution.
 20 | //
 21 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 22 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 23 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 24 | // ARE DISCLAIMED.
 25 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 26 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 27 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 28 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 29 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 30 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 31 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 32 | // SUCH DAMAGE.
 33 | //
 34 | 
 35 | #include "config.h"
 36 | 
 37 | #include "pdf-handler.hh"
 38 | 
 39 | #include <iostream>
 40 | #include <sstream>
 41 | #include <string>
 42 | 
 43 | #include <qpdf/QPDF.hh>
 44 | #include <qpdf/QPDFObjectHandle.hh>
 45 | #include <qpdf/QPDFWriter.hh>
 46 | 
 47 | #include "tounicode.hh"
 48 | 
 49 | void pdf_handler::load ()
 50 | {
 51 |   if (verbose_)
 52 |     std::cout << "Loading \"" << filename_in_ << "\"" << std::endl;
 53 | 
 54 |   qpdf_.processFile (filename_in_.c_str ());
 55 | }
 56 | 
 57 | void pdf_handler::process ()
 58 | {
 59 |   auto objs {qpdf_.getAllObjects ()};
 60 |   for (auto o: objs)
 61 |     process_obj (o);
 62 | }
 63 | 
 64 | void pdf_handler::save ()
 65 | {
 66 |   if (verbose_)
 67 |     std::cout << "Writing \"" << filename_out_ << "\"" << std::endl;
 68 | 
 69 |   QPDFWriter w (qpdf_, filename_out_.c_str ());
 70 |   w.setLinearization (linearize_);
 71 |   w.setObjectStreamMode (object_streams_);
 72 |   w.setNewlineBeforeEndstream (newline_before_endstream_);
 73 |   w.setQDFMode (qdf_);
 74 |   w.write ();
 75 | }
 76 | 
 77 | void pdf_handler::process_obj (QPDFObjectHandle oh)
 78 | {
 79 |   if (verbose_)
 80 |     {
 81 |       std::cout << "Object: ID " << oh.getObjectID ()
 82 |                 << ", Generation " << oh.getGeneration ()
 83 |                 << ", Type " << oh.getTypeName ()
 84 |                 << std::endl;
 85 |     }
 86 | 
 87 |   if (oh.isDictionary () && oh.hasKey ("/Type"))
 88 |     {
 89 |       if (verbose_)
 90 |         std::cout << "  /Type ";
 91 | 
 92 |       auto type {oh.getKey ("/Type")};
 93 |       if (type.isName ())
 94 |         {
 95 |           if (verbose_)
 96 |             std::cout << type.getName () << std::endl;
 97 |           if (std::string ("/Font") == type.getName ())
 98 |             process_font (oh);
 99 |         }
100 |       else
101 |         {
102 |           if(verbose_)
103 |             std::cout << "does not contain a name" << std::endl;
104 |         }
105 |     }
106 | }
107 | 
108 | void pdf_handler::process_font (QPDFObjectHandle oh)
109 | {
110 |   if (oh.hasKey ("/ToUnicode"))
111 |     {
112 |       if (verbose_)
113 |         std::cout << "  /ToUnicode ";
114 | 
115 |       auto tounicode {oh.getKey ("/ToUnicode")};
116 |       if (tounicode.isStream ())
117 |         {
118 |           if (verbose_)
119 |             std::cout << "stream" << std::endl;
120 |           process_tounicode (tounicode);
121 |         }
122 |       else
123 |         {
124 |           if(verbose_)
125 |             std::cout << "does not contain a stream" << std::endl;
126 |         }
127 |     }
128 | }
129 | 
130 | void pdf_handler::process_tounicode (QPDFObjectHandle oh)
131 | {
132 |   std::stringstream ss_in;
133 |   {
134 |     auto pb {oh.getStreamData ()};
135 |     std::string buff (reinterpret_cast<const char*> (pb->getBuffer ()),
136 |                       pb->getSize ());
137 |     ss_in.str (buff + "\n");
138 |   }
139 | 
140 |   tounicode tu;
141 |   tu.process_stream (ss_in);
142 | 
143 |   auto in {ss_in.str ()};
144 |   in.pop_back ();
145 |   auto out {tu.get ()};
146 |   out.pop_back ();
147 | 
148 |   if (in != out)
149 |     {
150 |       if (verbose_)
151 |         std::cout << "  replacing..." << std::endl;
152 | 
153 |       oh.replaceStreamData (out,
154 |                             QPDFObjectHandle::newNull(),
155 |                             QPDFObjectHandle::newNull());
156 |     }
157 | }
158 | 


--------------------------------------------------------------------------------
/src/tounicode_table.cc:
--------------------------------------------------------------------------------
  1 | // Generated by build_table.py
  2 | // Do not edit this file.
  3 | 
  4 | #include <unordered_map>
  5 | 
  6 | #include "tounicode.hh"
  7 | 
  8 | const std::unordered_map<int, int> tounicode::table_ =
  9 |   {
 10 |     { 0x2f90, 0x8863 },
 11 |     { 0x2f00, 0x4e00 },
 12 |     { 0x2f7b, 0x7fbd },
 13 |     { 0x2fac, 0x96e8 },
 14 |     { 0x2f85, 0x81fc },
 15 |     { 0x2f60, 0x74dc },
 16 |     { 0x2ee9, 0x9ec4 },
 17 |     { 0x2f04, 0x4e59 },
 18 |     { 0x2fb3, 0x97f3 },
 19 |     { 0x2f55, 0x706b },
 20 |     { 0x2f72, 0x79be },
 21 |     { 0x2f99, 0x8c9d },
 22 |     { 0x2f93, 0x89d2 },
 23 |     { 0x2fb0, 0x9769 },
 24 |     { 0x2f61, 0x74e6 },
 25 |     { 0x2f32, 0x5e72 },
 26 |     { 0x2f62, 0x7518 },
 27 |     { 0x2f78, 0x7f36 },
 28 |     { 0x2ee4, 0x9b3c },
 29 |     { 0x2fc1, 0x9b3c },
 30 |     { 0x2ef2, 0x4e80 },
 31 |     { 0x2fc9, 0x9ecd },
 32 |     { 0x2f38, 0x5f13 },
 33 |     { 0x2f5c, 0x725b },
 34 |     { 0x2fc2, 0x9b5a },
 35 |     { 0x2f5f, 0x7389 },
 36 |     { 0x2f31, 0x5dfe },
 37 |     { 0x2f44, 0x65a4 },
 38 |     { 0x2fa6, 0x91d1 },
 39 |     { 0x2f4b, 0x6b20 },
 40 |     { 0x2f73, 0x7a74 },
 41 |     { 0x2f8e, 0x8840 },
 42 |     { 0x2f49, 0x6708 },
 43 |     { 0x2f5d, 0x72ac },
 44 |     { 0x2f92, 0x898b },
 45 |     { 0x2f5e, 0x7384 },
 46 |     { 0x2f94, 0x8a00 },
 47 |     { 0x2f30, 0x5df1 },
 48 |     { 0x2f3e, 0x6238 },
 49 |     { 0x2ec1, 0x864e },
 50 |     { 0x2fce, 0x9f13 },
 51 |     { 0x2f1d, 0x53e3 },
 52 |     { 0x2f2f, 0x5de5 },
 53 |     { 0x2f8f, 0x884c },
 54 |     { 0x2fb9, 0x9999 },
 55 |     { 0x2fbc, 0x9ad8 },
 56 |     { 0x2fca, 0x9ed2 },
 57 |     { 0x2fbb, 0x9aa8 },
 58 |     { 0x2f89, 0x826e },
 59 |     { 0x2f6b, 0x76bf },
 60 |     { 0x2f2d, 0x5c71 },
 61 |     { 0x2f20, 0x58eb },
 62 |     { 0x2f26, 0x5b50 },
 63 |     { 0x2f40, 0x652f },
 64 |     { 0x2f4c, 0x6b62 },
 65 |     { 0x2f52, 0x6c0f },
 66 |     { 0x2f77, 0x7cf8 },
 67 |     { 0x2f84, 0x81f3 },
 68 |     { 0x2eed, 0x6b6f },
 69 |     { 0x2f70, 0x793a },
 70 |     { 0x2f7d, 0x800c },
 71 |     { 0x2f7f, 0x8033 },
 72 |     { 0x2f83, 0x81ea },
 73 |     { 0x2fc5, 0x9e7f },
 74 |     { 0x2f9e, 0x8eca },
 75 |     { 0x2f3f, 0x624b },
 76 |     { 0x2fb8, 0x9996 },
 77 |     { 0x2f88, 0x821f },
 78 |     { 0x2f17, 0x5341 },
 79 |     { 0x2f25, 0x5973 },
 80 |     { 0x2f29, 0x5c0f },
 81 |     { 0x2f8a, 0x8272 },
 82 |     { 0x2fb7, 0x98df },
 83 |     { 0x2f3c, 0x5fc3 },
 84 |     { 0x2f82, 0x81e3 },
 85 |     { 0x2f9d, 0x8eab },
 86 |     { 0x2f9f, 0x8f9b },
 87 |     { 0x2f08, 0x4eba },
 88 |     { 0x2f54, 0x6c34 },
 89 |     { 0x2f28, 0x5bf8 },
 90 |     { 0x2f63, 0x751f },
 91 |     { 0x2ec4, 0x897f },
 92 |     { 0x2ed8, 0x9752 },
 93 |     { 0x2eeb, 0x6589 },
 94 |     { 0x2f6f, 0x77f3 },
 95 |     { 0x2f9a, 0x8d64 },
 96 |     { 0x2f86, 0x820c },
 97 |     { 0x2f87, 0x821b },
 98 |     { 0x2fcf, 0x9f20 },
 99 |     { 0x2f9b, 0x8d70 },
100 |     { 0x2f9c, 0x8db3 },
101 |     { 0x2f24, 0x5927 },
102 |     { 0x2fa0, 0x8fb0 },
103 |     { 0x2f95, 0x8c37 },
104 |     { 0x2f75, 0x7af9 },
105 |     { 0x2f8d, 0x866b },
106 |     { 0x2ed1, 0x9577 },
107 |     { 0x2fa7, 0x9577 },
108 |     { 0x2fc3, 0x9ce5 },
109 |     { 0x2f56, 0x722a },
110 |     { 0x2fcd, 0x9f0e },
111 |     { 0x2f65, 0x7530 },
112 |     { 0x2f43, 0x6597 },
113 |     { 0x2f1f, 0x571f },
114 |     { 0x2f11, 0x5200 },
115 |     { 0x2f96, 0x8c46 },
116 |     { 0x2fa3, 0x9149 },
117 |     { 0x2f06, 0x4e8c },
118 |     { 0x2f81, 0x8089 },
119 |     { 0x2f47, 0x65e5 },
120 |     { 0x2f0a, 0x5165 },
121 |     { 0x2fba, 0x99ac },
122 |     { 0x2f69, 0x767d },
123 |     { 0x2ee8, 0x9ea6 },
124 |     { 0x2f0b, 0x516b },
125 |     { 0x2fa4, 0x91c6 },
126 |     { 0x2f50, 0x6bd4 },
127 |     { 0x2f6a, 0x76ae },
128 |     { 0x2fae, 0x975e },
129 |     { 0x2fb6, 0x98db },
130 |     { 0x2fd0, 0x9f3b },
131 |     { 0x2f66, 0x758b },
132 |     { 0x2f57, 0x7236 },
133 |     { 0x2fa9, 0x961c },
134 |     { 0x2fb5, 0x98a8 },
135 |     { 0x2f42, 0x6587 },
136 |     { 0x2f76, 0x7c73 },
137 |     { 0x2fb4, 0x9801 },
138 |     { 0x2f5a, 0x7247 },
139 |     { 0x2e9f, 0x6bcd },
140 |     { 0x2f45, 0x65b9 },
141 |     { 0x2f18, 0x535c },
142 |     { 0x2fc7, 0x9ebb },
143 |     { 0x2f1c, 0x53c8 },
144 |     { 0x2e92, 0x5df3 },
145 |     { 0x2ea0, 0x6c11 },
146 |     { 0x2f6d, 0x77db },
147 |     { 0x2faf, 0x9762 },
148 |     { 0x2f51, 0x6bdb },
149 |     { 0x2f4a, 0x6728 },
150 |     { 0x2f6c, 0x76ee },
151 |     { 0x2fa8, 0x9580 },
152 |     { 0x2f6e, 0x77e2 },
153 |     { 0x2fa2, 0x9091 },
154 |     { 0x2f23, 0x5915 },
155 |     { 0x2f64, 0x7528 },
156 |     { 0x2f7a, 0x7f8a },
157 |     { 0x2fa5, 0x91cc },
158 |     { 0x2f74, 0x7acb },
159 |     { 0x2eef, 0x7adc },
160 |     { 0x2fd3, 0x9f8d },
161 |     { 0x2f12, 0x529b },
162 |     { 0x2f7c, 0x8001 },
163 |     { 0x2f02, 0x4e36 },
164 |     { 0x2f03, 0x4e3f },
165 |     { 0x2f05, 0x4e85 },
166 |     { 0x2f07, 0x4ea0 },
167 |     { 0x2f09, 0x513f },
168 |     { 0x2e8e, 0x5140 },
169 |     { 0x2f0c, 0x5182 },
170 |     { 0x2f0d, 0x5196 },
171 |     { 0x2f0e, 0x51ab },
172 |     { 0x2f0f, 0x51e0 },
173 |     { 0x2f10, 0x51f5 },
174 |     { 0x2f13, 0x52f9 },
175 |     { 0x2f14, 0x5315 },
176 |     { 0x2f15, 0x531a },
177 |     { 0x2f16, 0x5338 },
178 |     { 0x2f19, 0x5369 },
179 |     { 0x2f1a, 0x5382 },
180 |     { 0x2f1b, 0x53b6 },
181 |     { 0x2f1e, 0x56d7 },
182 |     { 0x2f21, 0x5902 },
183 |     { 0x2f22, 0x590a },
184 |     { 0x2f27, 0x5b80 },
185 |     { 0x2e90, 0x5c22 },
186 |     { 0x2f2a, 0x5c22 },
187 |     { 0x2f2b, 0x5c38 },
188 |     { 0x2f2c, 0x5c6e },
189 |     { 0x2f2e, 0x5ddb },
190 |     { 0x2e93, 0x5e7a },
191 |     { 0x2f33, 0x5e7a },
192 |     { 0x2f34, 0x5e7f },
193 |     { 0x2f35, 0x5ef4 },
194 |     { 0x2f36, 0x5efe },
195 |     { 0x2f37, 0x5f0b },
196 |     { 0x2e94, 0x5f51 },
197 |     { 0x2f3a, 0x5f61 },
198 |     { 0x2f3b, 0x5f73 },
199 |     { 0x2f3d, 0x6208 },
200 |     { 0x2f41, 0x6534 },
201 |     { 0x2e99, 0x6535 },
202 |     { 0x2f46, 0x65e0 },
203 |     { 0x2e9b, 0x65e1 },
204 |     { 0x2f48, 0x66f0 },
205 |     { 0x2f4d, 0x6b79 },
206 |     { 0x2f4e, 0x6bb3 },
207 |     { 0x2f4f, 0x6bcb },
208 |     { 0x2f53, 0x6c14 },
209 |     { 0x2f58, 0x723b },
210 |     { 0x2f59, 0x723f },
211 |     { 0x2f68, 0x7676 },
212 |     { 0x2f79, 0x7f51 },
213 |     { 0x2f7e, 0x8012 },
214 |     { 0x2f80, 0x807f },
215 |     { 0x2f8b, 0x8278 },
216 |     { 0x2f8c, 0x864d },
217 |     { 0x2f91, 0x897e },
218 |     { 0x2f97, 0x8c55 },
219 |     { 0x2f98, 0x8c78 },
220 |     { 0x2faa, 0x96b6 },
221 |     { 0x2fab, 0x96b9 },
222 |     { 0x2fb1, 0x97cb },
223 |     { 0x2fb2, 0x97ed },
224 |     { 0x2fbd, 0x9adf },
225 |     { 0x2fbe, 0x9b25 },
226 |     { 0x2fbf, 0x9b2f },
227 |     { 0x2fc0, 0x9b32 },
228 |     { 0x2fc4, 0x9e75 },
229 |     { 0x2fc6, 0x9ea5 },
230 |     { 0x2fcb, 0x9ef9 },
231 |     { 0x2fcc, 0x9efd },
232 |     { 0x2fd1, 0x9f4a },
233 |     { 0x2fd2, 0x9f52 },
234 |     { 0x2fd4, 0x9f9c },
235 |     { 0x2fd5, 0x9fa0 },
236 |     { 0x2f8ed, 0x6adb },
237 |     { 0x2f8dc, 0x6753 },
238 |     { 0xf9ec, 0x6eba },
239 |     { 0x2f877, 0x5c60 },
240 |     { 0xf992, 0x6f23 },
241 |     { 0xf993, 0x7149 },
242 |     { 0x2f818, 0x51a4 },
243 |     { 0x2f5b, 0x7259 },
244 |     { 0x2f01, 0x4e28 },
245 |     { 0x2fad, 0x9751 },
246 |     { 0x2fc8, 0x9ec3 },
247 |     // { 0x2ebd, 0x26951 },
248 |     // { 0x2e8d, 0x2d544 },
249 |     { 0x2edf, 0x98e0 },
250 |     // { 0x2ede, 0x2967f },
251 |     { 0x2e97, 0x38fa },
252 |     { 0x2e85, 0x4ebb },
253 |     { 0x2ec3, 0x8980 },
254 |     // { 0x2eca, 0x27fb7 },
255 |     // { 0x2eae, 0x25ad7 },
256 |     // { 0x2eaa, 0x24d14 },
257 |     { 0x2eb9, 0x8002 },
258 |     // { 0x2e87, 0x20628 },
259 |     { 0x2f82c, 0x20984 },
260 |     { 0x2e8b, 0x353e },
261 |     { 0x2f8db, 0x233cc },
262 |     { 0x2ea6, 0x4e2c },
263 |     // { 0x2eb3, 0x2626a },
264 |     { 0x2ebe, 0x8279 },
265 |     { 0x2e83, 0x4e5a },
266 |     { 0x2e89, 0x5202 },
267 |     { 0x2e8f, 0x5c23 },
268 |     { 0x2f39, 0x5f50 },
269 |     { 0x2e96, 0x5fc4 },
270 |     { 0x2e98, 0x624c },
271 |     { 0x2ea1, 0x6c35 },
272 |     { 0x2ea3, 0x706c },
273 |     { 0x2ea8, 0x72ad },
274 |     { 0x2f67, 0x7592 },
275 |     { 0x2ead, 0x793b },
276 |     { 0x2f71, 0x79b8 },
277 |     { 0x2eab, 0x7f52 },
278 |     { 0x2eb2, 0x7f52 },
279 |     { 0x2eb1, 0x7f53 },
280 |     { 0x2ec2, 0x8864 },
281 |     { 0x2fa1, 0x8fb5 },
282 |     { 0x2ecd, 0x8fb6 },
283 |     { 0x2ed2, 0x9578 },
284 |     { 0x2ecf, 0x961d },
285 |     { 0x2ed6, 0x961d },
286 |     { 0x2e9e, 0x6b7a },
287 |   };
288 | 


--------------------------------------------------------------------------------
/Readme.ja.md:
--------------------------------------------------------------------------------
  1 | <!-- -*- coding: utf-8 -*- -->
  2 | # PDF 内の ToUnicode CMap を修正してコピペの文字化けを防ぐツール
  3 | 
  4 | [ [English](Readme.md) / 日本語 (Japanese) ]
  5 | 
  6 | PDF から文字をコピペするとき、
  7 | PDF によっては「見」とか「高」などの漢字が
  8 | よく似ているけど違う文字（康熙部首や CJK 部首補助などの特殊な文字）
  9 | に化けてしまう、なんてことが結構あります。
 10 | このツールはそうした文字化けする PDF を修正し、
 11 | 文字化けしない PDF を生成します。
 12 | 
 13 | ## 注意
 14 | 
 15 | このツールで生成した PDF から元の
 16 | PDF へ逆変換することはできません（不可逆です）。
 17 | また、気が付かないところで PDF が壊れている可能性も否定できません。
 18 | 元の PDF は必ずバックアップを残しておいてください。
 19 | 
 20 | ## 使い方
 21 | 
 22 | ```
 23 | $ pdf-fix-tuc
 24 | Fix ToUnicode CMap in PDF 0.0.0
 25 | Copyright (C) 2021 Masamichi Hosoda. All rights reserved.
 26 | License: BSD-2-Clause
 27 | 
 28 | https://github.com/trueroad/pdf-fix-tuc
 29 | 
 30 | Usage: pdf-fix-tuc [options] [INPUT.pdf OUTPUT.pdf] ...
 31 | 
 32 |   -h, --help
 33 |     Print help and exit
 34 |   -V, --version
 35 |     Print version and exit
 36 |   --verbose
 37 |     Verbose
 38 | 
 39 | Output PDF settings (QPDF):
 40 |   --linearize
 41 |     Output linearized (web-optimized) PDF
 42 |   --object-streams=[preserve|disable|generate]   (default=preserve)
 43 |     Settings for object streams
 44 |   --newline-before-endstream
 45 |     Output newline before endstream
 46 |   --qdf
 47 |     Output QDF
 48 | 
 49 | $
 50 | ```
 51 | 
 52 | ## インストール
 53 | 
 54 | ### 必要なもの
 55 | 
 56 | * C++11 コンパイラ（g++ 4.9 以降等）
 57 | * [libqpdf](http://qpdf.sourceforge.net/)
 58 | * pkg-config 等
 59 | * Autoconf 2.69 以降
 60 | * Automake
 61 | 
 62 | パッケージを使って準備したい場合には、以下が便利でしょう。
 63 | 
 64 | * Debian / Ubuntu
 65 |     + libqpdf-dev
 66 | * Fedora
 67 |     + qpdf-devel
 68 | * Cygwin
 69 |     + libqpdf-devel
 70 | 
 71 | ### ビルド・インストール方法
 72 | 
 73 | ```
 74 | $ git clone https://github.com/trueroad/pdf-fix-tuc.git
 75 | $ cd pdf-fix-tuc
 76 | $ ./autogen.sh
 77 | $ mkdir build
 78 | $ cd build
 79 | $ ../configure
 80 | $ make
 81 | $ make install
 82 | ```
 83 | 
 84 | ## しくみ
 85 | 
 86 | ### PDF のテキスト抽出（コピペ）
 87 | 
 88 | 多くの PDF は文字を表現する際に、
 89 | Unicode のような文字コードではなく
 90 | CID (Character ID) や GID (Glyph ID) と呼ばれる
 91 | フォント内のグリフ（字形）を識別するコードが用いられています。
 92 | 
 93 | PDF を作るときには、使用するフォントの cmap テーブルを使って
 94 | Unicode のコードポイントを CID/GID へ変換し、それを PDF 内に記述します。
 95 | 例えば「見」という漢字は Unicode では U+898B と表現されます。
 96 | この CID/GID が何になるかはフォントによって違うのですが、
 97 | [原ノ味フォント](https://github.com/trueroad/HaranoAjiFonts)
 98 | のような
 99 | [Adobe-Japan1](https://github.com/adobe-type-tools/Adobe-Japan1)
100 | に従っているフォントの場合、対応する CID は CID+1887 になります。
101 | つまり、PDF にする前は U+898B と表現されていた「見」を PDF にすると、
102 | PDF 内部では CID+1887 として記録されます。
103 | この PDF を表示や印刷する際には、CID+1887 を元に
104 | フォントから必要なグリフを取り出して表示や印刷をします。
105 | 
106 | 一方で、PDF からテキスト抽出（文字をコピペしたり検索したり等）する際には、
107 | CID+1887 だけでは元の文字コードがわかりません。
108 | そこで CID/GID から Unicode コードポイントへ変換できる
109 | ToUnicode CMap という変換テーブルが用いられます。
110 | 多くの PDF では PDF 生成時に使用するフォント毎に ToUnicode CMap が生成され
111 | PDF 内部に埋め込まれるため、これによってテキスト抽出が可能になっています。
112 | 
113 | ### 文字化けの原因
114 | 
115 | 「見」などをコピペすると文字化けする PDF は、
116 | この ToUnicode CMap の生成方法に問題がある、
117 | つまり PDF を作るのに使ったツールに問題があります。
118 | ですが、メジャーどころも含めて多くのツールが同じ問題を抱えており、
119 | こうしたツールで生成した PDF は同じ文字化けを起こしてしまいます
120 | （こうした問題に配慮してコピペが文字化けしない PDF 生成ツールもあります）。
121 | 
122 | 問題の一端は cmap テーブルの構造にあります。
123 | cmap テーブルは Unicode コードポイントから CID/GID
124 | へ変換するためのテーブルです。
125 | これを逆変換するテーブルを作れば ToUnicode CMap にすることができます。
126 | しかし、cmap テーブルは 1 対 1 対応になっていないため、
127 | 簡単に逆変換することができません。
128 | 「見」の場合、U+898B → CID+1887 というマッピングの他に、
129 | U+2F92 → CID+1887 というマッピングが書かれています。
130 | CID+1887 から逆変換しようとすると U+898B にすればよいのか
131 | U+2F92 にすればよいのか、わからなくて困ってしまいます。
132 | 「見」の他にも、複数の Unicode コードポイントが一つの CID/GID
133 | にマッピングされているものはたくさんあり、n 対 1 対応になっています。
134 | なぜ 1 対 1 対応になっていないのかというと、Unicode
135 | は見た目が同じでも由来が異なれば異なるコードポイントを割り当てるのに対して、
136 | CID/GID はフォントの仕組みなので見た目（字形）が同じであれば
137 | 同じ ID を割り当てる、という方針の違いがあるからと言えると思います。
138 | 
139 | こうした n 対 1 対応がある、ということを考慮せず逆変換しようとして
140 | CID+1887 に対応する Unicode コードポイントを
141 | cmap テーブルから探すとどうなるでしょうか。
142 | 恐らく cmap テーブルを上から順番に検索して最初に見つかった CID+1887
143 | に対応する Unicode コードポイントを返すでしょう。
144 | cmap テーブルは Unicode コードポイントの昇順に並んでいますから、
145 | 最初に見つかるのは番号の小さい U+2F92 の方ということになります。
146 | というわけで、元々は cmap テーブルで U+898B → CID+1887 を使ったのに
147 | ToUnicode CMap には CID+1887 → U+2F92 が書かれてしまい、
148 | 元々 U+898B だった「見」が違う文字である U+2F92 に化けてしまう、
149 | ということになります。
150 | 
151 | この U+2F92 というのは何かというと、
152 | Unicode では 'KANGXI RADICAL SEE' という名前がついていて
153 | 'Kangxi Radicals' というブロックに入っています。
154 | Kangxi Radicals というのは康熙部首（康煕部首）と呼ばれるもので、
155 | 普通の漢字ではなく部首を表す特殊な文字が収められているブロックになります。
156 | 特殊な文字ですから普通は使いませんし、
157 | これらが収録されていないフォントもあります。
158 | 収録されていないフォントで表示しようとすると他のフォントで代替されたりして、
159 | そこだけ変な表示になってしまいます。
160 | また、普通の「見」で検索をかけると U+898B を探すことになり、
161 | U+2F92 は違う文字なのでヒットしなくなってしまいます。
162 | 
163 | 一方の U+898B は 'CJK Unified Ideographs' つまり
164 | CJK 統合漢字のブロックに入っている普通の漢字です。
165 | この問題のもう一つの側面は、
166 | 見た目が同じ文字に複数の Unicode
167 | コードポイントが割り当てられているのは仕方ないにしても、
168 | 普通は使わない文字の方が小さい番号、
169 | 普通に使う漢字の方が大きい番号になっており、
170 | 逆変換で単純に小さい番号の方を採用すると化けてしまう、
171 | というところにあります。
172 | 
173 | ### 文字化けしない PDF 生成ツール
174 | 
175 | cmap テーブルの逆変換に頼らずに
176 | 元の Unicode コードポイントと CID/GID の関係を覚えておいて
177 | ToUnicode CMap を生成するような PDF 生成ツールであれば、
178 | コピペで文字化けしない PDF ができます
179 | （ただし、一つの PDF で同じフォントに対して
180 | U+2F92 と U+898B の両方が使われていたら区別できませんので、
181 | どちらかに寄せなければならなくなります）。
182 | 
183 | また、cmap テーブルの逆変換で ToUnicode CMap を生成するような
184 | PDF 生成ツールでも、n 対 1 対応を考慮して、
185 | 闇雲に最初に見つかった番号の小さい Unicode コードポイントを返すのではなく、
186 | 複数の候補の中から何らかの基準で優先順位をつけて、
187 | よりふさわしい Unicode コードポイントを返すようにすれば、
188 | ほとんどの場合でコピペが文字化けしなくなるハズです。
189 | 
190 | 例えば XeTeX で使われる xdvipdfmx が PDF を生成する場合は、
191 | xdvipdfmx の入力ファイル (.xdv) は既に CID/GID しか記載されておらず
192 | 元の Unicode コードポイントが失われた状態になっているため、
193 | 後者の方法でコピペが文字化けしない PDF を生成しています
194 | （他にもフォントが Adobe-Japan1 の場合は ToUnicode CMap を埋め込まない
195 | という動作でもコピペの文字化けを防いでおり、これと同様になるよう
196 | PDF から Adobe-Japan1 フォントの ToUnicode CMap を削除するツール[
197 | pdf-rm-tuc
198 | ](https://github.com/trueroad/pdf-rm-tuc)
199 | もあります）。
200 | 優先度の決定は Unicode のコードポイントの範囲を区切って行っています。
201 | ソースコードに入っている[
202 | tt_cmap.c
203 | ](https://github.com/texjporg/tex-jp-build/blob/master/source/texk/dvipdfm-x/tt_cmap.c)
204 | にある `is_PUA_or_presentation` 関数を見ると
205 | 以下のようになっていてその範囲がわかります。
206 | 
207 | ```c
208 | /* Soft-hyphen (U+00AD) to lower its priority... added here for convenience */
209 | static int is_PUA_or_presentation (unsigned int uni)
210 | {
211 |   /* Some of CJK Radicals Supplement and Kangxi Radicals
212 |    * are commonly double encoded, lower the priority.
213 |    * CJK Compatibility Ideographs & Supplement added.
214 |    */
215 |   return  ((uni >= 0x2E80 && uni <= 0x2EF3) || (uni >= 0x2F00 && uni <= 0x2FD5) ||
216 |            (uni >= 0xE000 && uni <= 0xF8FF) || (uni >= 0xFB00 && uni <= 0xFB4F) ||
217 |            (uni >= 0xF900 && uni <= 0xFAFF) || (uni >= 0x2F800 && uni <= 0x2FA1F) ||
218 |            (uni >= 0xF0000 && uni <= 0xFFFFD) || (uni >= 0x100000 && uni <= 0x10FFFD) ||
219 |            (uni == 0x00AD));
220 | }
221 | ```
222 | 
223 | 「見」が化ける先の U+2F92 は康熙部首のブロックに入っており、
224 | これを判定する `(uni >= 0x2F00 && uni <= 0x2FD5)`
225 | に含まれているので優先度が低くなり採用されません。
226 | その結果、ToUnicode CMap には正しい方の
227 | CID+1887 → U+898B
228 | が記載されるためコピペが文字化けしなくなるというわけです。
229 | 他にも康熙部首と同様の部首が収められている
230 | 'CJK Radicals Supplement' CJK 部首補助のブロックや、
231 | 'CJK Compatibility Ideographs' CJK 互換漢字、
232 | 'CJK Compatibility Ideographs Supplement' CJK 互換漢字補助、
233 | 'Private Use Area' 私用領域、
234 | などが除外されるようになっています。
235 | 
236 | ### 本ツールの動作
237 | 
238 | 本ツールは、これまでに説明したようなコピペで文字化けする PDF の
239 | ToUnicode CMap に手を入れてコピペが文字化けしないようにします。
240 | 具体的には ToUnicode CMap に 
241 | CID+1887 → U+2F92
242 | のような変換先が優先度の低い Unicode になっているマッピングを見つけたら、
243 | 変換先を優先度が高い Unicode に書き換え
244 | CID+1887 → U+898B
245 | のようにした PDF を出力します。
246 | 
247 | この書き換えを行うテーブルは自動生成したものをソースの[
248 | tounicode_table.cc
249 | ](https://github.com/trueroad/pdf-fix-tuc/blob/master/src/tounicode_table.cc)
250 | に入れてあります。これを自動生成するスクリプトもソースの[
251 | build_table.py
252 | ](https://github.com/trueroad/pdf-fix-tuc/blob/master/src/build_table.py)
253 | に同梱してあります（ただし、普通にビルドしても自動生成は発動しません）。
254 | 自動生成の方法は、[
255 | Adobe-Japan1
256 | ](https://github.com/adobe-type-tools/Adobe-Japan1)
257 | で配布されている漢字グリフの一覧表[
258 | aj17-kanji.txt
259 | ](https://github.com/adobe-type-tools/Adobe-Japan1/blob/master/aj17-kanji.txt)
260 | を使い、
261 | 同じ CID に複数の Unicode コードポイントが割り当てられているものについて、
262 | xdvipdfmx と同じ方法で優先度の低い Unicode コードポイントを判定し、
263 | 同じグリフ内で優先度が低い Unicode コードポイントを優先度が低くない
264 | Unicode コードポイントへ書き換える、
265 | というテーブルを生成しています。
266 | ただし、今のところ BMP から
267 | BMP 外へ書き換えるようなテーブルは生成しないようにしています。
268 | 
269 | Adobe-Japan1 を元にしてテーブルを生成しているため
270 | 非 Adobe-Japan1 フォントではうまくいかなくなる可能性もありますが、
271 | ほとんどの日本語フォントは非 Adobe-Japan1 であっても
272 | 漢字グリフは Adobe-Japan1 のグリフセットをベースにしていると思いますので、
273 | 大抵は大丈夫だろうと思っています。
274 | 
275 | ## ライセンス
276 | 
277 | Copyright (C) 2021 Masamichi Hosoda. All rights reserved.
278 | 
279 | License: BSD-2-Clause
280 | 
281 | [LICENSE](./LICENSE) をご覧ください。
282 | 


--------------------------------------------------------------------------------
/src/tounicode.cc:
--------------------------------------------------------------------------------
  1 | //
  2 | // Fix ToUnicode CMap in PDF
  3 | // https://github.com/trueroad/pdf-fix-tuc
  4 | //
  5 | // tounicode.cc: ToUnicode CMap handler class
  6 | //
  7 | // Copyright (C) 2021 Masamichi Hosoda.
  8 | // All rights reserved.
  9 | //
 10 | // Redistribution and use in source and binary forms, with or without
 11 | // modification, are permitted provided that the following conditions
 12 | // are met:
 13 | //
 14 | // * Redistributions of source code must retain the above copyright notice,
 15 | //   this list of conditions and the following disclaimer.
 16 | //
 17 | // * Redistributions in binary form must reproduce the above copyright notice,
 18 | //   this list of conditions and the following disclaimer in the documentation
 19 | //   and/or other materials provided with the distribution.
 20 | //
 21 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 22 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 23 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 24 | // ARE DISCLAIMED.
 25 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 26 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 27 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 28 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 29 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 30 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 31 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 32 | // SUCH DAMAGE.
 33 | //
 34 | 
 35 | #include "tounicode.hh"
 36 | 
 37 | #include <iomanip>
 38 | #include <iostream>
 39 | #include <regex>
 40 | #include <sstream>
 41 | #include <string>
 42 | #include <unordered_map>
 43 | 
 44 | #include "regex_dispatcher_m.hh"
 45 | 
 46 | // Builds the dispatch table: each entry pairs a line pattern of a
 47 | // ToUnicode CMap with the member function that handles such a line.
 48 | //
 49 | // NOTE(review): how entries are matched (order, whole line or not) is
 50 | // decided by regex_dispatcher::member_table, which is not defined in this
 51 | // file -- presumably they are tried top to bottom against the whole line,
 52 | // which is why the catch-all pattern comes last; confirm.
 53 | tounicode::tounicode ()
 54 |   : regex_dispatcher::member_table<tounicode>
 55 |     ({
 56 |       // Two hex strings: "<src> <dst>".
 57 |       {std::regex (R"((<[\da-fA-F]+>\s*)<([\da-fA-F]+)>(\r?))"),
 58 |        &tounicode::bfchar_hex},
 59 | 
 60 |       // Three hex strings: "<first> <last> <dst>".
 61 |       {std::regex (R"(<([\da-fA-F]+)>(\s*)<([\da-fA-F]+)>(\s*)<([\da-fA-F]+)>(\r?))"),
 62 |        &tounicode::bfrange_hex},
 63 | 
 64 |       // Section openers: "N beginbfchar" / "N beginbfrange".
 65 |       {std::regex (R"((\d+)(\s*)beginbfchar(\r?))"),
 66 |        &tounicode::beginbfchar},
 67 |       {std::regex (R"((\d+)(\s*)beginbfrange(\r?))"),
 68 |        &tounicode::beginbfrange},
 69 | 
 70 |       // Section closers.
 71 |       {std::regex (R"(endbfchar\r?)"),
 72 |        &tounicode::endbfchar},
 73 |       {std::regex (R"(endbfrange\r?)"),
 74 |        &tounicode::endbfrange},
 75 | 
 76 |       // Anything else.
 77 |       {std::regex (R"(.*\r?)"),
 78 |        &tounicode::other},
 79 |     })
 80 | {
 81 | }
 66 | 
 67 | // Returns a copy of everything the line handlers have written to ss_.
 68 | std::string tounicode::get ()
 69 | {
 70 |   auto text = ss_.str ();
 71 |   return text;
 72 | }
 71 | 
 72 | // Handles a line made of two hex strings, "<src> <dst>".
 73 | //
 74 | // sm[1] is "<src>" plus the whitespace that follows it, sm[2] the hex
 75 | // digits of <dst>, and sm[3] an optional trailing CR.
 76 | //
 77 | // Inside a bfchar section, a <dst> found in table_ is replaced by the
 78 | // value it maps to, written as uppercase hex padded to four digits.
 79 | // Every other line is copied to ss_ as it is.  Always returns true.
 80 | //
 81 | // <dst> used to be converted with std::stoi unconditionally, which throws
 82 | // std::out_of_range above INT_MAX -- e.g. for a surrogate pair such as
 83 | // <D840DC0B>, a longer multi-character destination, or any other
 84 | // two-operand line with a four-byte second operand.  Lines outside a
 85 | // bfchar section are no longer parsed at all, and a <dst> that is too
 86 | // large for an int is passed through untouched.
 87 | bool tounicode::bfchar_hex (const std::smatch &sm)
 88 | {
 89 |   const std::string dst_hex = sm[2].str ();
 90 | 
 91 |   // True when std::stoi can convert HEX (base 16) into a 32-bit int:
 92 |   // at most eight significant digits, the leading one below 8.
 93 |   const auto fits_in_int = [] (const std::string &hex) -> bool
 94 |   {
 95 |     const auto first = hex.find_first_not_of ('0');
 96 |     if (first == std::string::npos)
 97 |       return true;
 98 |     const auto significant = hex.size () - first;
 99 |     return significant < 8 || (significant == 8 && hex[first] < '8');
100 |   };
101 | 
102 |   if (is_bfchar_ && fits_in_int (dst_hex))
103 |     {
104 |       // One lookup instead of find () followed by at ().
105 |       const auto hit = table_.find (std::stoi (dst_hex, nullptr, 16));
106 |       if (hit != table_.end ())
107 |         {
108 |           const int replacement = hit->second;
109 | 
110 |           ss_ << sm[1].str ()
111 |               << "<"
112 |               << std::hex << std::uppercase
113 |               << std::setw (4) << std::setfill ('0')
114 |               << replacement
115 |               << std::dec << std::nouppercase
116 |               << ">"
117 |               << sm[3].str ()
118 |               << '\n';
119 |           return true;
120 |         }
121 |     }
122 | 
123 |   ss_ << sm[0].str () << '\n';
124 |   return true;
125 | }
 99 | 
100 | // Appends one "<first> <last> <dst>" entry to the pending bfrange buffer
101 | // (ss_bfrange_) and counts it in bfranges_.
102 | //
103 | // cid_start, cid_end: first and last source code of the range.
104 | // uni_start:          destination of cid_start.
105 | // space1, space2:     separators written after the first / second operand.
106 | // cr:                 text written just before the line feed.
107 | //
108 | // All three numbers are written as uppercase hex, zero-padded to at least
109 | // four digits, whatever width they had in the input.
110 | void tounicode::bfrange_output (const int cid_start,
111 |                                 const int cid_end,
112 |                                 const int uni_start,
113 |                                 const std::string &space1,
114 |                                 const std::string &space2,
115 |                                 const std::string &cr)
116 | {
117 |   auto &out = ss_bfrange_;
118 | 
119 |   // Writes "<XXXX>"; relies on hex/uppercase being active on the stream.
120 |   const auto put_code = [&out] (const int value)
121 |   {
122 |     out << "<" << std::setw (4) << std::setfill ('0') << value << ">";
123 |   };
124 | 
125 |   out << std::hex << std::uppercase;
126 |   put_code (cid_start);
127 |   out << space1;
128 |   put_code (cid_end);
129 |   out << space2;
130 |   put_code (uni_start);
131 |   out << std::dec << std::nouppercase << cr << '\n';
132 | 
133 |   ++bfranges_;
134 | }
127 | 
128 | // Handles a line made of three hex strings, "<first> <last> <dst>".
129 | //
130 | // sm[1], sm[3] and sm[5] are the hex digits of the three operands, sm[2]
131 | // and sm[4] the whitespace between them, and sm[6] an optional trailing CR.
132 | //
133 | // Outside a bfrange section the line is copied to ss_ unchanged.  Inside
134 | // one, the range is walked code by code: every destination found in
135 | // table_ becomes a one-code entry with the mapped value, and the untouched
136 | // stretches before, between and after such codes are written as
137 | // sub-ranges.  A range without any hit is buffered exactly as it was
138 | // read.  Entries go to ss_bfrange_ and are counted in bfranges_, either
139 | // here or through bfrange_output ().  Always returns true.
140 | //
141 | // The operands used to be converted with std::stoi unconditionally, which
142 | // throws std::out_of_range above INT_MAX -- e.g. when <dst> is a surrogate
143 | // pair such as <D840DC00>.  Such an entry is now buffered unchanged.
144 | bool tounicode::bfrange_hex (const std::smatch &sm)
145 | {
146 |   const std::string line = sm[0].str ();
147 | 
148 |   if (!is_bfrange_)
149 |     {
150 |       ss_ << line << '\n';
151 |       return true;
152 |     }
153 | 
154 |   // True when std::stoi can convert HEX (base 16) into a 32-bit int:
155 |   // at most eight significant digits, the leading one below 8.
156 |   const auto fits_in_int = [] (const std::string &hex) -> bool
157 |   {
158 |     const auto first = hex.find_first_not_of ('0');
159 |     if (first == std::string::npos)
160 |       return true;
161 |     const auto significant = hex.size () - first;
162 |     return significant < 8 || (significant == 8 && hex[first] < '8');
163 |   };
164 | 
165 |   const std::string first_hex = sm[1].str ();
166 |   const std::string last_hex = sm[3].str ();
167 |   const std::string dst_hex = sm[5].str ();
168 | 
169 |   if (!fits_in_int (first_hex) || !fits_in_int (last_hex)
170 |       || !fits_in_int (dst_hex))
171 |     {
172 |       // Cannot be examined as integers: keep the entry as it is.
173 |       ss_bfrange_ << line << '\n';
174 |       ++bfranges_;
175 |       return true;
176 |     }
177 | 
178 |   const int cid_start = std::stoi (first_hex, nullptr, 16);
179 |   const int cid_end = std::stoi (last_hex, nullptr, 16);
180 |   const int uni_start = std::stoi (dst_hex, nullptr, 16);
181 |   const std::string space1 = sm[2].str ();
182 |   const std::string space2 = sm[4].str ();
183 |   const std::string cr = sm[6].str ();
184 | 
185 |   // Start of the stretch that has not been written out yet.
186 |   int pending_cid = cid_start;
187 |   int pending_uni = uni_start;
188 | 
189 |   int cid = cid_start;
190 |   int uni = uni_start;
191 |   for (; cid <= cid_end; ++cid, ++uni)
192 |     {
193 |       // One lookup instead of find () followed by at ().
194 |       const auto hit = table_.find (uni);
195 |       if (hit == table_.end ())
196 |         continue;
197 | 
198 |       // Flush the untouched codes that precede this one ...
199 |       if (pending_cid < cid)
200 |         bfrange_output (pending_cid, cid - 1, pending_uni,
201 |                         space1, space2, cr);
202 | 
203 |       // ... then the rewritten code as a range of its own.
204 |       bfrange_output (cid, cid, hit->second, space1, space2, cr);
205 | 
206 |       pending_cid = cid + 1;
207 |       pending_uni = uni + 1;
208 |     }
209 | 
210 |   if (pending_cid == cid_start)
211 |     {
212 |       // Nothing was rewritten: keep the original text of the entry.
213 |       ss_bfrange_ << line << '\n';
214 |       ++bfranges_;
215 |     }
216 |   else if (pending_cid < cid)
217 |     {
218 |       // Untouched tail after the last rewritten code.
219 |       bfrange_output (pending_cid, cid - 1, pending_uni,
220 |                       space1, space2, cr);
221 |     }
222 | 
223 |   return true;
224 | }
176 | 
177 | // Handles "N beginbfchar": opens a bfchar section.
178 | //
179 | // sm[1] is the decimal count, sm[2] the whitespace after it, and sm[3] an
180 | // optional trailing CR.  The line is written to ss_ again with the count
181 | // re-formatted from its parsed value.  A section that is still open is
182 | // reported on std::cerr; an open bfrange section is marked closed.
183 | // Always returns true.
184 | bool tounicode::beginbfchar (const std::smatch &sm)
185 | {
186 |   // Converted before anything else, so a count std::stoi rejects leaves
187 |   // both the flags and the output untouched.
188 |   const int entry_count = std::stoi (sm[1].str ());
189 | 
190 |   if (is_bfrange_)
191 |     std::cerr << "warning: invalid ToUnicode: no endbfrange" << std::endl;
192 |   if (is_bfchar_)
193 |     std::cerr << "warning: invalid ToUnicode: no endbfchar" << std::endl;
194 | 
195 |   is_bfrange_ = false;
196 |   is_bfchar_ = true;
197 | 
198 |   ss_ << entry_count << sm[2].str () << "beginbfchar" << sm[3].str ()
199 |       << '\n';
200 | 
201 |   return true;
202 | }
203 | 
204 | // Handles "endbfchar": closes the bfchar section and copies the line to
205 | // ss_.  Warns on std::cerr when a bfrange section is still open (it is
206 | // then marked closed) or when no bfchar section was open.
207 | // Always returns true.
208 | bool tounicode::endbfchar (const std::smatch &sm)
209 | {
210 |   if (is_bfrange_)
211 |     std::cerr << "warning: invalid ToUnicode: no endbfrange" << std::endl;
212 |   if (!is_bfchar_)
213 |     std::cerr << "warning: invalid ToUnicode: no beginbfchar" << std::endl;
214 | 
215 |   // Neither kind of section is open after this line.
216 |   is_bfrange_ = false;
217 |   is_bfchar_ = false;
218 | 
219 |   ss_ << sm[0].str () << '\n';
220 | 
221 |   return true;
222 | }
225 | 
226 | // Handles "N beginbfrange": opens a bfrange section.
227 | //
228 | // Nothing is written to ss_ here.  The whitespace (sm[2]) and the
229 | // optional CR (sm[3]) of the line are remembered so that endbfrange () can
230 | // emit the header together with the number of entries buffered in between;
231 | // the count found in the input (sm[1]) is not used.
232 | // Warns on std::cerr when a section is still open.  Always returns true.
233 | bool tounicode::beginbfrange (const std::smatch &sm)
234 | {
235 |   beginbfrange_space_ = sm[2].str ();
236 |   beginbfrange_cr_ = sm[3].str ();
237 | 
238 |   if (is_bfchar_)
239 |     std::cerr << "warning: invalid ToUnicode: no endbfchar" << std::endl;
240 |   if (is_bfrange_)
241 |     std::cerr << "warning: invalid ToUnicode: no endbfrange" << std::endl;
242 | 
243 |   is_bfchar_ = false;
244 |   is_bfrange_ = true;
245 | 
246 |   // Start from an empty entry buffer and a zero entry count.
247 |   bfranges_ = 0;
248 |   ss_bfrange_.str ("");
249 |   ss_bfrange_.clear ();
250 | 
251 |   return true;
252 | }
248 | 
249 | // Handles "endbfrange": closes the bfrange section and writes it to ss_.
250 | //
251 | // The output is the header "<count><space>beginbfrange<cr>", built from
252 | // bfranges_ and the values remembered by beginbfrange (), followed by the
253 | // buffered entries and the endbfrange line itself.  The buffer and the
254 | // counter are reset afterwards.
255 | // Warns on std::cerr when a bfchar section is still open (it is then
256 | // marked closed) or when no bfrange section was open; the header is
257 | // written in either case.  Always returns true.
258 | bool tounicode::endbfrange (const std::smatch &sm)
259 | {
260 |   if (is_bfchar_)
261 |     std::cerr << "warning: invalid ToUnicode: no endbfchar" << std::endl;
262 |   if (!is_bfrange_)
263 |     std::cerr << "warning: invalid ToUnicode: no beginbfrange" << std::endl;
264 | 
265 |   is_bfchar_ = false;
266 |   is_bfrange_ = false;
267 | 
268 |   // Header with the recomputed entry count.
269 |   ss_ << bfranges_ << beginbfrange_space_ << "beginbfrange"
270 |       << beginbfrange_cr_ << '\n';
271 |   // Buffered entries, then the closing line.
272 |   ss_ << ss_bfrange_.str ();
273 |   ss_ << sm[0].str () << '\n';
274 | 
275 |   bfranges_ = 0;
276 |   ss_bfrange_.str ("");
277 |   ss_bfrange_.clear ();
278 | 
279 |   return true;
280 | }
279 | 
280 | // Handles every line no other handler takes: the line is kept as it is.
281 | // Always returns true.
282 | //
283 | // Outside a bfrange section the line goes straight to ss_.
284 | //
285 | // Inside one it is appended to ss_bfrange_ instead.  The section header
286 | // is only written by endbfrange (), so a line sent to ss_ at this point
287 | // would end up in front of "N beginbfrange", i.e. outside the section it
288 | // belongs to.  This matters for entries the three-hex-string pattern
289 | // does not cover, such as the array form "<first> <last> [<dst> ...]".
290 | //
291 | // A buffered line whose first non-blank character is '<' is counted as
292 | // one entry, because endbfrange () writes bfranges_ as the entry count.
293 | // NOTE(review): this assumes one entry per line; an entry spread over
294 | // several lines, or several entries on one line, would be miscounted.
295 | bool tounicode::other (const std::smatch &sm)
296 | {
297 |   const std::string line = sm[0].str ();
298 | 
299 |   if (!is_bfrange_)
300 |     {
301 |       ss_ << line << '\n';
302 |       return true;
303 |     }
304 | 
305 |   ss_bfrange_ << line << '\n';
306 | 
307 |   const auto first = line.find_first_not_of (" \t");
308 |   if (first != std::string::npos && line[first] == '<')
309 |     ++bfranges_;
310 | 
311 |   return true;
312 | }
287 | 


--------------------------------------------------------------------------------
/m4/ax_cxx_compile_stdcxx.m4:
--------------------------------------------------------------------------------
  1 | # ===========================================================================
  2 | #  https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx.html
  3 | # ===========================================================================
  4 | #
  5 | # SYNOPSIS
  6 | #
  7 | #   AX_CXX_COMPILE_STDCXX(VERSION, [ext|noext], [mandatory|optional])
  8 | #
  9 | # DESCRIPTION
 10 | #
 11 | #   Check for baseline language coverage in the compiler for the specified
 12 | #   version of the C++ standard.  If necessary, add switches to CXX and
 13 | #   CXXCPP to enable support.  VERSION may be '11' (for the C++11 standard),
 14 | #   '14' (for the C++14 standard) or '17' (for the C++17 standard).
 15 | #
 16 | #   The second argument, if specified, indicates whether you insist on an
 17 | #   extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g.
 18 | #   -std=c++11).  If neither is specified, you get whatever works, with
 19 | #   preference for an extended mode.
 20 | #
 21 | #   The third argument, if specified 'mandatory' or if left unspecified,
 22 | #   indicates that baseline support for the specified C++ standard is
 23 | #   required and that the macro should error out if no mode with that
 24 | #   support is found.  If specified 'optional', then configuration proceeds
 25 | #   regardless, after defining HAVE_CXX${VERSION} if and only if a
 26 | #   supporting mode is found.
 27 | #
 28 | # LICENSE
 29 | #
 30 | #   Copyright (c) 2008 Benjamin Kosnik <bkoz@redhat.com>
 31 | #   Copyright (c) 2012 Zack Weinberg <zackw@panix.com>
 32 | #   Copyright (c) 2013 Roy Stogner <roystgnr@ices.utexas.edu>
 33 | #   Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov <sokolov@google.com>
 34 | #   Copyright (c) 2015 Paul Norman <penorman@mac.com>
 35 | #   Copyright (c) 2015 Moritz Klammler <moritz@klammler.eu>
 36 | #   Copyright (c) 2016, 2018 Krzesimir Nowak <qdlacz@gmail.com>
 37 | #   Copyright (c) 2019 Enji Cooper <yaneurabeya@gmail.com>
 38 | #
 39 | #   Copying and distribution of this file, with or without modification, are
 40 | #   permitted in any medium without royalty provided the copyright notice
 41 | #   and this notice are preserved.  This file is offered as-is, without any
 42 | #   warranty.
 43 | 
 44 | #serial 11
 45 | 
 46 | dnl  This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro
 47 | dnl  (serial version number 13).  Unless 'noext' is given it tries -std=gnu++NN
 48 | dnl  first; unless 'ext' is given it then tries -std=c++NN, +std=c++NN and "-h std=c++NN".
 49 | AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl
 50 |   m4_if([$1], [11], [ax_cxx_compile_alternatives="11 0x"],
 51 |         [$1], [14], [ax_cxx_compile_alternatives="14 1y"],
 52 |         [$1], [17], [ax_cxx_compile_alternatives="17 1z"],
 53 |         [m4_fatal([invalid first argument `$1' to AX_CXX_COMPILE_STDCXX])])dnl
 54 |   m4_if([$2], [], [],
 55 |         [$2], [ext], [],
 56 |         [$2], [noext], [],
 57 |         [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX])])dnl
 58 |   m4_if([$3], [], [ax_cxx_compile_cxx$1_required=true],
 59 |         [$3], [mandatory], [ax_cxx_compile_cxx$1_required=true],
 60 |         [$3], [optional], [ax_cxx_compile_cxx$1_required=false],
 61 |         [m4_fatal([invalid third argument `$3' to AX_CXX_COMPILE_STDCXX])])
 62 |   AC_LANG_PUSH([C++])dnl
 63 |   ac_success=no
 64 | 
 65 |   m4_if([$2], [noext], [], [dnl
 66 |   if test x$ac_success = xno; then
 67 |     for alternative in ${ax_cxx_compile_alternatives}; do
 68 |       switch="-std=gnu++${alternative}"
 69 |       cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch])
 70 |       AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch,
 71 |                      $cachevar,
 72 |         [ac_save_CXX="$CXX"
 73 |          CXX="$CXX $switch"
 74 |          AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
 75 |           [eval $cachevar=yes],
 76 |           [eval $cachevar=no])
 77 |          CXX="$ac_save_CXX"])
 78 |       if eval test x\$$cachevar = xyes; then
 79 |         CXX="$CXX $switch"
 80 |         if test -n "$CXXCPP" ; then
 81 |           CXXCPP="$CXXCPP $switch"
 82 |         fi
 83 |         ac_success=yes
 84 |         break
 85 |       fi
 86 |     done
 87 |   fi])
 88 | 
 89 |   m4_if([$2], [ext], [], [dnl
 90 |   if test x$ac_success = xno; then
 91 |     dnl HP's aCC needs +std=c++11 according to:
 92 |     dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf
 93 |     dnl Cray's crayCC needs "-h std=c++11"
 94 |     for alternative in ${ax_cxx_compile_alternatives}; do
 95 |       for switch in -std=c++${alternative} +std=c++${alternative} "-h std=c++${alternative}"; do
 96 |         cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch])
 97 |         AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch,
 98 |                        $cachevar,
 99 |           [ac_save_CXX="$CXX"
100 |            CXX="$CXX $switch"
101 |            AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
102 |             [eval $cachevar=yes],
103 |             [eval $cachevar=no])
104 |            CXX="$ac_save_CXX"])
105 |         if eval test x\$$cachevar = xyes; then
106 |           CXX="$CXX $switch"
107 |           if test -n "$CXXCPP" ; then
108 |             CXXCPP="$CXXCPP $switch"
109 |           fi
110 |           ac_success=yes
111 |           break
112 |         fi
113 |       done
114 |       if test x$ac_success = xyes; then
115 |         break
116 |       fi
117 |     done
118 |   fi])
119 |   AC_LANG_POP([C++])
120 |   if test x$ax_cxx_compile_cxx$1_required = xtrue; then
121 |     if test x$ac_success = xno; then
122 |       AC_MSG_ERROR([*** A compiler with support for C++$1 language features is required.])
123 |     fi
124 |   fi
125 |   if test x$ac_success = xno; then
126 |     HAVE_CXX$1=0
127 |     AC_MSG_NOTICE([No compiler with C++$1 support was found])
128 |   else
129 |     HAVE_CXX$1=1
130 |     AC_DEFINE(HAVE_CXX$1,1,
131 |               [define if the compiler supports basic C++$1 syntax])
132 |   fi
133 |   AC_SUBST(HAVE_CXX$1)
134 | ])
135 | 
136 | 
137 | dnl  Test body for checking C++11 support: the C++11 feature tests only
138 | 
139 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_11],
140 |   _AX_CXX_COMPILE_STDCXX_testbody_new_in_11
141 | )
142 | 
143 | 
144 | dnl  Test body for checking C++14 support: the C++11 tests followed by the C++14 ones
145 | 
146 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14],
147 |   _AX_CXX_COMPILE_STDCXX_testbody_new_in_11
148 |   _AX_CXX_COMPILE_STDCXX_testbody_new_in_14
149 | )
150 | dnl  Test body for checking C++17 support: the C++11, C++14 and C++17 tests, in that order
151 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_17],
152 |   _AX_CXX_COMPILE_STDCXX_testbody_new_in_11
153 |   _AX_CXX_COMPILE_STDCXX_testbody_new_in_14
154 |   _AX_CXX_COMPILE_STDCXX_testbody_new_in_17
155 | )
156 | 
157 | dnl  Tests for new features in C++11: one nested namespace per feature, checked by compilation only
158 | 
159 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[
160 | 
161 | // If the compiler admits that it is not ready for C++11, why torture it?
162 | // Hopefully, this will speed up the test.
163 | 
164 | #ifndef __cplusplus
165 | 
166 | #error "This is not a C++ compiler"
167 | 
168 | #elif __cplusplus < 201103L
169 | 
170 | #error "This is not a C++11 compiler"
171 | 
172 | #else
173 | 
174 | namespace cxx11
175 | {
176 | 
177 |   namespace test_static_assert
178 |   {
179 | 
180 |     template <typename T>
181 |     struct check
182 |     {
183 |       static_assert(sizeof(int) <= sizeof(T), "not big enough");
184 |     };
185 | 
186 |   }
187 | 
188 |   namespace test_final_override
189 |   {
190 | 
191 |     struct Base
192 |     {
193 |       virtual ~Base() {}
194 |       virtual void f() {}
195 |     };
196 | 
197 |     struct Derived : public Base
198 |     {
199 |       virtual ~Derived() override {}
200 |       virtual void f() override {}
201 |     };
202 | 
203 |   }
204 | 
205 |   namespace test_double_right_angle_brackets
206 |   {
207 | 
208 |     template < typename T >
209 |     struct check {};
210 | 
211 |     typedef check<void> single_type;
212 |     typedef check<check<void>> double_type;
213 |     typedef check<check<check<void>>> triple_type;
214 |     typedef check<check<check<check<void>>>> quadruple_type;
215 | 
216 |   }
217 | 
218 |   namespace test_decltype
219 |   {
220 | 
221 |     int
222 |     f()
223 |     {
224 |       int a = 1;
225 |       decltype(a) b = 2;
226 |       return a + b;
227 |     }
228 | 
229 |   }
230 | 
231 |   namespace test_type_deduction
232 |   {
233 | 
234 |     template < typename T1, typename T2 >
235 |     struct is_same
236 |     {
237 |       static const bool value = false;
238 |     };
239 | 
240 |     template < typename T >
241 |     struct is_same<T, T>
242 |     {
243 |       static const bool value = true;
244 |     };
245 | 
246 |     template < typename T1, typename T2 >
247 |     auto
248 |     add(T1 a1, T2 a2) -> decltype(a1 + a2)
249 |     {
250 |       return a1 + a2;
251 |     }
252 | 
253 |     int
254 |     test(const int c, volatile int v)
255 |     {
256 |       static_assert(is_same<int, decltype(0)>::value == true, "");
257 |       static_assert(is_same<int, decltype(c)>::value == false, "");
258 |       static_assert(is_same<int, decltype(v)>::value == false, "");
259 |       auto ac = c;
260 |       auto av = v;
261 |       auto sumi = ac + av + 'x';
262 |       auto sumf = ac + av + 1.0;
263 |       static_assert(is_same<int, decltype(ac)>::value == true, "");
264 |       static_assert(is_same<int, decltype(av)>::value == true, "");
265 |       static_assert(is_same<int, decltype(sumi)>::value == true, "");
266 |       static_assert(is_same<int, decltype(sumf)>::value == false, "");
267 |       static_assert(is_same<int, decltype(add(c, v))>::value == true, "");
268 |       return (sumf > 0.0) ? sumi : add(c, v);
269 |     }
270 | 
271 |   }
272 | 
273 |   namespace test_noexcept
274 |   {
275 | 
276 |     int f() { return 0; }
277 |     int g() noexcept { return 0; }
278 | 
279 |     static_assert(noexcept(f()) == false, "");
280 |     static_assert(noexcept(g()) == true, "");
281 | 
282 |   }
283 | 
284 |   namespace test_constexpr
285 |   {
286 | 
287 |     template < typename CharT >
288 |     unsigned long constexpr
289 |     strlen_c_r(const CharT *const s, const unsigned long acc) noexcept
290 |     {
291 |       return *s ? strlen_c_r(s + 1, acc + 1) : acc;
292 |     }
293 | 
294 |     template < typename CharT >
295 |     unsigned long constexpr
296 |     strlen_c(const CharT *const s) noexcept
297 |     {
298 |       return strlen_c_r(s, 0UL);
299 |     }
300 | 
301 |     static_assert(strlen_c("") == 0UL, "");
302 |     static_assert(strlen_c("1") == 1UL, "");
303 |     static_assert(strlen_c("example") == 7UL, "");
304 |     static_assert(strlen_c("another\0example") == 7UL, "");
305 | 
306 |   }
307 | 
308 |   namespace test_rvalue_references
309 |   {
310 | 
311 |     template < int N >
312 |     struct answer
313 |     {
314 |       static constexpr int value = N;
315 |     };
316 | 
317 |     answer<1> f(int&)       { return answer<1>(); }
318 |     answer<2> f(const int&) { return answer<2>(); }
319 |     answer<3> f(int&&)      { return answer<3>(); }
320 | 
321 |     void
322 |     test()
323 |     {
324 |       int i = 0;
325 |       const int c = 0;
326 |       static_assert(decltype(f(i))::value == 1, "");
327 |       static_assert(decltype(f(c))::value == 2, "");
328 |       static_assert(decltype(f(0))::value == 3, "");
329 |     }
330 | 
331 |   }
332 | 
333 |   namespace test_uniform_initialization
334 |   {
335 | 
336 |     struct test
337 |     {
338 |       static const int zero {};
339 |       static const int one {1};
340 |     };
341 | 
342 |     static_assert(test::zero == 0, "");
343 |     static_assert(test::one == 1, "");
344 | 
345 |   }
346 | 
347 |   namespace test_lambdas
348 |   {
349 | 
350 |     void
351 |     test1()
352 |     {
353 |       auto lambda1 = [](){};
354 |       auto lambda2 = lambda1;
355 |       lambda1();
356 |       lambda2();
357 |     }
358 | 
359 |     int
360 |     test2()
361 |     {
362 |       auto a = [](int i, int j){ return i + j; }(1, 2);
363 |       auto b = []() -> int { return '0'; }();
364 |       auto c = [=](){ return a + b; }();
365 |       auto d = [&](){ return c; }();
366 |       auto e = [a, &b](int x) mutable {
367 |         const auto identity = [](int y){ return y; };
368 |         for (auto i = 0; i < a; ++i)
369 |           a += b--;
370 |         return x + identity(a + b);
371 |       }(0);
372 |       return a + b + c + d + e;
373 |     }
374 | 
375 |     int
376 |     test3()
377 |     {
378 |       const auto nullary = [](){ return 0; };
379 |       const auto unary = [](int x){ return x; };
380 |       using nullary_t = decltype(nullary);
381 |       using unary_t = decltype(unary);
382 |       const auto higher1st = [](nullary_t f){ return f(); };
383 |       const auto higher2nd = [unary](nullary_t f1){
384 |         return [unary, f1](unary_t f2){ return f2(unary(f1())); };
385 |       };
386 |       return higher1st(nullary) + higher2nd(nullary)(unary);
387 |     }
388 | 
389 |   }
390 | 
391 |   namespace test_variadic_templates
392 |   {
393 | 
394 |     template <int...>
395 |     struct sum;
396 | 
397 |     template <int N0, int... N1toN>
398 |     struct sum<N0, N1toN...>
399 |     {
400 |       static constexpr auto value = N0 + sum<N1toN...>::value;
401 |     };
402 | 
403 |     template <>
404 |     struct sum<>
405 |     {
406 |       static constexpr auto value = 0;
407 |     };
408 | 
409 |     static_assert(sum<>::value == 0, "");
410 |     static_assert(sum<1>::value == 1, "");
411 |     static_assert(sum<23>::value == 23, "");
412 |     static_assert(sum<1, 2>::value == 3, "");
413 |     static_assert(sum<5, 5, 11>::value == 21, "");
414 |     static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, "");
415 | 
416 |   }
417 | 
418 |   // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
419 |   // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function
420 |   // because of this.
421 |   namespace test_template_alias_sfinae
422 |   {
423 | 
424 |     struct foo {};
425 | 
426 |     template<typename T>
427 |     using member = typename T::member_type;
428 | 
429 |     template<typename T>
430 |     void func(...) {}
431 | 
432 |     template<typename T>
433 |     void func(member<T>*) {}
434 | 
435 |     void test();
436 | 
437 |     void test() { func<foo>(0); }
438 | 
439 |   }
440 | 
441 | }  // namespace cxx11
442 | 
443 | #endif  // __cplusplus >= 201103L
444 | 
445 | ]])
446 | 
447 | 
448 | dnl  Tests for new features in C++14: one nested namespace per feature, checked by compilation only
449 | 
450 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[
451 | 
452 | // If the compiler admits that it is not ready for C++14, why torture it?
453 | // Hopefully, this will speed up the test.
454 | 
455 | #ifndef __cplusplus
456 | 
457 | #error "This is not a C++ compiler"
458 | 
459 | #elif __cplusplus < 201402L
460 | 
461 | #error "This is not a C++14 compiler"
462 | 
463 | #else
464 | 
465 | namespace cxx14
466 | {
467 | 
468 |   namespace test_polymorphic_lambdas
469 |   {
470 | 
471 |     int
472 |     test()
473 |     {
474 |       const auto lambda = [](auto&&... args){
475 |         const auto istiny = [](auto x){
476 |           return (sizeof(x) == 1UL) ? 1 : 0;
477 |         };
478 |         const int aretiny[] = { istiny(args)... };
479 |         return aretiny[0];
480 |       };
481 |       return lambda(1, 1L, 1.0f, '1');
482 |     }
483 | 
484 |   }
485 | 
486 |   namespace test_binary_literals
487 |   {
488 | 
489 |     constexpr auto ivii = 0b0000000000101010;
490 |     static_assert(ivii == 42, "wrong value");
491 | 
492 |   }
493 | 
494 |   namespace test_generalized_constexpr
495 |   {
496 | 
497 |     template < typename CharT >
498 |     constexpr unsigned long
499 |     strlen_c(const CharT *const s) noexcept
500 |     {
501 |       auto length = 0UL;
502 |       for (auto p = s; *p; ++p)
503 |         ++length;
504 |       return length;
505 |     }
506 | 
507 |     static_assert(strlen_c("") == 0UL, "");
508 |     static_assert(strlen_c("x") == 1UL, "");
509 |     static_assert(strlen_c("test") == 4UL, "");
510 |     static_assert(strlen_c("another\0test") == 7UL, "");
511 | 
512 |   }
513 | 
514 |   namespace test_lambda_init_capture
515 |   {
516 | 
517 |     int
518 |     test()
519 |     {
520 |       auto x = 0;
521 |       const auto lambda1 = [a = x](int b){ return a + b; };
522 |       const auto lambda2 = [a = lambda1(x)](){ return a; };
523 |       return lambda2();
524 |     }
525 | 
526 |   }
527 | 
528 |   namespace test_digit_separators
529 |   {
530 | 
531 |     constexpr auto ten_million = 100'000'000;
532 |     static_assert(ten_million == 100000000, "");
533 | 
534 |   }
535 | 
536 |   namespace test_return_type_deduction
537 |   {
538 | 
539 |     auto f(int& x) { return x; }
540 |     decltype(auto) g(int& x) { return x; }
541 | 
542 |     template < typename T1, typename T2 >
543 |     struct is_same
544 |     {
545 |       static constexpr auto value = false;
546 |     };
547 | 
548 |     template < typename T >
549 |     struct is_same<T, T>
550 |     {
551 |       static constexpr auto value = true;
552 |     };
553 | 
554 |     int
555 |     test()
556 |     {
557 |       auto x = 0;
558 |       static_assert(is_same<int, decltype(f(x))>::value, "");
559 |       static_assert(is_same<int&, decltype(g(x))>::value, "");
560 |       return x;
561 |     }
562 | 
563 |   }
564 | 
565 | }  // namespace cxx14
566 | 
567 | #endif  // __cplusplus >= 201402L
568 | 
569 | ]])
570 | 
571 | 
572 | dnl  Tests for new features in C++17
573 | 
574 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_17], [[
575 | 
576 | // If the compiler admits that it is not ready for C++17, why torture it?
577 | // Hopefully, this will speed up the test.
578 | 
579 | #ifndef __cplusplus
580 | 
581 | #error "This is not a C++ compiler"
582 | 
583 | #elif __cplusplus < 201703L
584 | 
585 | #error "This is not a C++17 compiler"
586 | 
587 | #else
588 | 
589 | #include <initializer_list>
590 | #include <utility>
591 | #include <type_traits>
592 | 
593 | namespace cxx17
594 | {
595 | 
596 |   namespace test_constexpr_lambdas
597 |   {
598 | 
599 |     constexpr int foo = [](){return 42;}();
600 | 
601 |   }
602 | 
603 |   namespace test::nested_namespace::definitions
604 |   {
605 | 
606 |   }
607 | 
608 |   namespace test_fold_expression
609 |   {
610 | 
611 |     template<typename... Args>
612 |     int multiply(Args... args)
613 |     {
614 |       return (args * ... * 1);
615 |     }
616 | 
617 |     template<typename... Args>
618 |     bool all(Args... args)
619 |     {
620 |       return (args && ...);
621 |     }
622 | 
623 |   }
624 | 
625 |   namespace test_extended_static_assert
626 |   {
627 | 
628 |     static_assert (true);
629 | 
630 |   }
631 | 
632 |   namespace test_auto_brace_init_list
633 |   {
634 | 
635 |     auto foo = {5};
636 |     auto bar {5};
637 | 
638 |     static_assert(std::is_same<std::initializer_list<int>, decltype(foo)>::value);
639 |     static_assert(std::is_same<int, decltype(bar)>::value);
640 |   }
641 | 
642 |   namespace test_typename_in_template_template_parameter
643 |   {
644 | 
645 |     template<template<typename> typename X> struct D;
646 | 
647 |   }
648 | 
649 |   namespace test_fallthrough_nodiscard_maybe_unused_attributes
650 |   {
651 | 
652 |     int f1()
653 |     {
654 |       return 42;
655 |     }
656 | 
657 |     [[nodiscard]] int f2()
658 |     {
659 |       [[maybe_unused]] auto unused = f1();
660 | 
661 |       switch (f1())
662 |       {
663 |       case 17:
664 |         f1();
665 |         [[fallthrough]];
666 |       case 42:
667 |         f1();
668 |       }
669 |       return f1();
670 |     }
671 | 
672 |   }
673 | 
674 |   namespace test_extended_aggregate_initialization
675 |   {
676 | 
677 |     struct base1
678 |     {
679 |       int b1, b2 = 42;
680 |     };
681 | 
682 |     struct base2
683 |     {
684 |       base2() {
685 |         b3 = 42;
686 |       }
687 |       int b3;
688 |     };
689 | 
690 |     struct derived : base1, base2
691 |     {
692 |         int d;
693 |     };
694 | 
695 |     derived d1 {{1, 2}, {}, 4};  // full initialization
696 |     derived d2 {{}, {}, 4};      // value-initialized bases
697 | 
698 |   }
699 | 
700 |   namespace test_general_range_based_for_loop
701 |   {
702 | 
703 |     struct iter
704 |     {
705 |       int i;
706 | 
707 |       int& operator* ()
708 |       {
709 |         return i;
710 |       }
711 | 
712 |       const int& operator* () const
713 |       {
714 |         return i;
715 |       }
716 | 
717 |       iter& operator++()
718 |       {
719 |         ++i;
720 |         return *this;
721 |       }
722 |     };
723 | 
724 |     struct sentinel
725 |     {
726 |       int i;
727 |     };
728 | 
729 |     bool operator== (const iter& i, const sentinel& s)
730 |     {
731 |       return i.i == s.i;
732 |     }
733 | 
734 |     bool operator!= (const iter& i, const sentinel& s)
735 |     {
736 |       return !(i == s);
737 |     }
738 | 
739 |     struct range
740 |     {
741 |       iter begin() const
742 |       {
743 |         return {0};
744 |       }
745 | 
746 |       sentinel end() const
747 |       {
748 |         return {5};
749 |       }
750 |     };
751 | 
752 |     void f()
753 |     {
754 |       range r {};
755 | 
756 |       for (auto i : r)
757 |       {
758 |         [[maybe_unused]] auto v = i;
759 |       }
760 |     }
761 | 
762 |   }
763 | 
764 |   namespace test_lambda_capture_asterisk_this_by_value
765 |   {
766 | 
767 |     struct t
768 |     {
769 |       int i;
770 |       int foo()
771 |       {
772 |         return [*this]()
773 |         {
774 |           return i;
775 |         }();
776 |       }
777 |     };
778 | 
779 |   }
780 | 
781 |   namespace test_enum_class_construction
782 |   {
783 | 
784 |     enum class byte : unsigned char
785 |     {};
786 | 
787 |     byte foo {42};
788 | 
789 |   }
790 | 
791 |   namespace test_constexpr_if
792 |   {
793 | 
794 |     template <bool cond>
795 |     int f ()
796 |     {
797 |       if constexpr(cond)
798 |       {
799 |         return 13;
800 |       }
801 |       else
802 |       {
803 |         return 42;
804 |       }
805 |     }
806 | 
807 |   }
808 | 
809 |   namespace test_selection_statement_with_initializer
810 |   {
811 | 
812 |     int f()
813 |     {
814 |       return 13;
815 |     }
816 | 
817 |     int f2()
818 |     {
819 |       if (auto i = f(); i > 0)
820 |       {
821 |         return 3;
822 |       }
823 | 
824 |       switch (auto i = f(); i + 4)
825 |       {
826 |       case 17:
827 |         return 2;
828 | 
829 |       default:
830 |         return 1;
831 |       }
832 |     }
833 | 
834 |   }
835 | 
836 |   namespace test_template_argument_deduction_for_class_templates
837 |   {
838 | 
839 |     template <typename T1, typename T2>
840 |     struct pair
841 |     {
842 |       pair (T1 p1, T2 p2)
843 |         : m1 {p1},
844 |           m2 {p2}
845 |       {}
846 | 
847 |       T1 m1;
848 |       T2 m2;
849 |     };
850 | 
851 |     void f()
852 |     {
853 |       [[maybe_unused]] auto p = pair{13, 42u};
854 |     }
855 | 
856 |   }
857 | 
858 |   namespace test_non_type_auto_template_parameters
859 |   {
860 | 
861 |     template <auto n>
862 |     struct B
863 |     {};
864 | 
865 |     B<5> b1;
866 |     B<'a'> b2;
867 | 
868 |   }
869 | 
870 |   namespace test_structured_bindings
871 |   {
872 | 
873 |     int arr[2] = { 1, 2 };
874 |     std::pair<int, int> pr = { 1, 2 };
875 | 
876 |     auto f1() -> int(&)[2]
877 |     {
878 |       return arr;
879 |     }
880 | 
881 |     auto f2() -> std::pair<int, int>&
882 |     {
883 |       return pr;
884 |     }
885 | 
886 |     struct S
887 |     {
888 |       int x1 : 2;
889 |       volatile double y1;
890 |     };
891 | 
892 |     S f3()
893 |     {
894 |       return {};
895 |     }
896 | 
897 |     auto [ x1, y1 ] = f1();
898 |     auto& [ xr1, yr1 ] = f1();
899 |     auto [ x2, y2 ] = f2();
900 |     auto& [ xr2, yr2 ] = f2();
901 |     const auto [ x3, y3 ] = f3();
902 | 
903 |   }
904 | 
905 |   namespace test_exception_spec_type_system
906 |   {
907 | 
908 |     struct Good {};
909 |     struct Bad {};
910 | 
911 |     void g1() noexcept;
912 |     void g2();
913 | 
914 |     template<typename T>
915 |     Bad
916 |     f(T*, T*);
917 | 
918 |     template<typename T1, typename T2>
919 |     Good
920 |     f(T1*, T2*);
921 | 
922 |     static_assert (std::is_same_v<Good, decltype(f(g1, g2))>);
923 | 
924 |   }
925 | 
926 |   namespace test_inline_variables
927 |   {
928 | 
929 |     template<class T> void f(T)
930 |     {}
931 | 
932 |     template<class T> inline T g(T)
933 |     {
934 |       return T{};
935 |     }
936 | 
937 |     template<> inline void f<>(int)
938 |     {}
939 | 
940 |     template<> int g<>(int)
941 |     {
942 |       return 5;
943 |     }
944 | 
945 |   }
946 | 
947 | }  // namespace cxx17
948 | 
949 | #endif  // __cplusplus >= 201703L
950 | 
951 | ]])
952 | 


--------------------------------------------------------------------------------
/src/cmdlineparse.hh:
--------------------------------------------------------------------------------
  1 | //
  2 | // One header file Commandline Parse for C++11 2019-11-17.07
  3 | // https://github.com/trueroad/cmdlineparse
  4 | //
  5 | // Copyright (C) 2016, 2017, 2019 Masamichi Hosoda. All rights reserved.
  6 | //
  7 | // Redistribution and use in source and binary forms, with or without
  8 | // modification, are permitted provided that the following conditions
  9 | // are met:
 10 | //
 11 | // * Redistributions of source code must retain the above copyright notice,
 12 | //   this list of conditions and the following disclaimer.
 13 | //
 14 | // * Redistributions in binary form must reproduce the above copyright notice,
 15 | //   this list of conditions and the following disclaimer in the documentation
 16 | //   and/or other materials provided with the distribution.
 17 | //
 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 19 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 20 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 21 | // ARE DISCLAIMED.
 22 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 23 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 24 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 25 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 26 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 27 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 28 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 29 | // SUCH DAMAGE.
 30 | //
 31 | 
 32 | #ifndef INCLUDE_GUARD_CMDLINEPARSE_HH
 33 | #define INCLUDE_GUARD_CMDLINEPARSE_HH
 34 | 
 35 | #include <vector>
 36 | #include <map>
 37 | #include <string>
 38 | #include <iostream>
 39 | #include <sstream>
 40 | #include <functional>
 41 | 
 42 | // gettext macros
 43 | #ifndef _
 44 | 
 45 | #define _(DUMMY) DUMMY
 46 | #define CMDLINEPARSE_HH_DEFINE_DUMMY_GETTEXT
 47 | 
 48 | #endif
 49 | 
 50 | namespace cmdlineparse
 51 | {
 52 | 
 53 |   enum class arg_mode
 54 |     {
 55 |       no_argument = 0,
 56 |       required_argument = 1,
 57 |       optional_argument = 2
 58 |     };
 59 | 
 60 |   enum class abort_reason
 61 |     {
 62 |       option_handler = -1,
 63 |       no_abort = 0,
 64 |       error_extra_arg,
 65 |       error_no_arg,
 66 |       error_ambiguous_option,
 67 |       error_unknown_option,
 68 |       error_no_arg_short,
 69 |       error_unknown_option_short,
 70 |     };
 71 | 
 72 |   class parser
 73 |   {
 74 |   public:
 75 |     // Add an option handler
 76 |     bool add_handler (char short_name, const std::string &long_name,
 77 |                       arg_mode has_arg,
 78 |                       std::function<bool(const std::string&)> option_handler,
 79 |                       const std::string &description = "",
 80 |                       const std::string &typestr = "",
 81 |                       const std::string &header = "",
 82 |                       const std::string &group = "");
 83 | 
 84 |     // Add a flag option
 85 |     bool add_flag (char short_name, const std::string &long_name,
 86 |                    bool *flag,
 87 |                    const std::string &description = "",
 88 |                    const std::string &group = "");
 89 | 
 90 |     // Add a string option
 91 |     bool add_string (char short_name, const std::string &long_name,
 92 |                      std::string *var, const std::string &defval,
 93 |                      const std::string &description = "",
 94 |                      const std::string &typestr = "STRING",
 95 |                      const std::string &group = "");
 96 | 
 97 |     // Add help description
 98 |     void add_description (char short_name, const std::string &long_name,
 99 |                           arg_mode has_arg,
100 |                           const std::string &description,
101 |                           const std::string &typestr = "",
102 |                           const std::string &header = "",
103 |                           const std::string &group = "");
104 | 
105 |     // Add default handler
106 |     bool add_default_help (void);
107 |     bool add_default_version (void);
108 |     bool add_default (void)
109 |     {
110 |       if (!add_default_help ())
111 |         return false;
112 |       return add_default_version ();
113 |     }
114 | 
115 |     // Parse options
116 |     bool parse (int argc, char const* const* argv, int optind = 1);
117 | 
118 |     // Build default strings
119 |     std::string build_usage (void) const;
120 |     std::string build_help (void) const;
121 | 
122 |     // Get version_string
123 |     const std::string &get_version_string () const noexcept
124 |     {
125 |       return version_string;
126 |     }
127 | 
128 |     // Set version_string
129 |     void set_version_string (const std::string &s)
130 |     {
131 |       version_string = s;
132 |     }
133 | 
134 |     // Get unamed_args
135 |     const std::vector<std::string> &get_unamed_args () const noexcept
136 |     {
137 |       return unamed_args;
138 |     }
139 | 
140 |     // Set usage_unamed_opts
141 |     void set_usage_unamed_opts (const std::string &s)
142 |     {
143 |       usage_unamed_opts = s;
144 |     }
145 | 
146 |     // Get abort reason
147 |     abort_reason get_abort_reason () const noexcept
148 |     {
149 |       return abort;
150 |     }
151 | 
152 |     // Get abort option
153 |     const std::string &get_abort_option () const noexcept
154 |     {
155 |       return abort_option;
156 |     }
157 | 
158 |     // Set opterr
159 |     void set_opterr (bool flag) noexcept
160 |     {
161 |       opterr = flag;
162 |     }
163 | 
164 |     // Set long_only
165 |     void set_long_only (bool flag) noexcept
166 |     {
167 |       long_only = flag;
168 |     }
169 | 
170 |     // Set continue_on_error
171 |     void set_continue_on_error (bool flag) noexcept
172 |     {
173 |       continue_on_error = flag;
174 |     }
175 | 
176 |     // Set abbreviated_long_name
177 |     void set_abbreviated_long_name (bool flag) noexcept
178 |     {
179 |       abbreviated_long_name = flag;
180 |     }
181 | 
182 |     // Constructor
183 |     parser ();
184 | 
185 |     // Const
186 |     const std::string h_indent {"  "};
187 |     const std::string h_space {"   "};
188 |     const std::string d_indent {"    "};
189 | 
190 |   private:
191 |     // Internal functions
192 |     void add_short (char short_name, arg_mode has_arg,
193 |                     std::function<bool(const std::string&)> option_handler);
194 |     void add_long (const std::string &long_name, arg_mode has_arg,
195 |                     std::function<bool(const std::string&)> option_handler);
196 |     bool find_unique_long_name (const std::string &long_name,
197 |                                 std::pair<arg_mode,
198 |                                 std::function<bool(const std::string&)>>
199 |                                 *target,
200 |                                 bool *ambiguous);
201 |     bool parse_long_name (std::vector<std::string>::const_iterator *it);
202 |     bool parse_short_name (std::vector<std::string>::const_iterator *it);
203 | 
204 |     // Error handlers
205 |     std::function<bool(const std::string&,
206 |                        const std::string&)> error_extra_arg
207 |     {
208 |       [this](const std::string &long_name, const std::string &)->bool
209 |         {
210 |           if (opterr)
211 |             std::cerr << argvs[0]
212 |                       << _(": option doesn't take an argument -- ")
213 |                       << long_name << std::endl;
214 |           return continue_on_error;
215 |         }
216 |     };
217 |     std::function<bool(const std::string&)> error_no_arg
218 |     {
219 |       [this](const std::string &long_name)->bool
220 |         {
221 |           if (opterr)
222 |             std::cerr << argvs[0]
223 |                       << _(": option requires an argument -- ")
224 |                       << long_name << std::endl;
225 |           return continue_on_error;
226 |         }
227 |     };
228 |     std::function<bool(const std::string&)> error_ambiguous_option
229 |     {
230 |       [this](const std::string &optchars)->bool
231 |         {
232 |           if (opterr)
233 |             std::cerr << argvs[0]
234 |                       << _(": ambiguous option -- ")
235 |                       << optchars << std::endl;
236 |           return continue_on_error;
237 |         }
238 |     };
239 |     std::function<bool(const std::string&)> error_unknown_option
240 |     {
241 |       [this](const std::string &optchars)->bool
242 |         {
243 |           if (opterr)
244 |             std::cerr << argvs[0]
245 |                       << _(": unknown option -- ")
246 |                       << optchars << std::endl;
247 |           return continue_on_error;
248 |         }
249 |     };
250 |     std::function<bool(char)> error_no_arg_short
251 |     {
252 |       [this](char optchar)->bool
253 |         {
254 |           if (opterr)
255 |             std::cerr << argvs[0]
256 |                       << _(": option requires an argument -- ")
257 |                       << optchar << std::endl;
258 |           return continue_on_error;
259 |         }
260 |     };
261 |     std::function<bool(char)> error_unknown_option_short
262 |     {
263 |       [this](char optchar)->bool
264 |         {
265 |           if (opterr)
266 |             std::cerr << argvs[0]
267 |                       << _(": unknown option -- ")
268 |                       << optchar << std::endl;
269 |           return continue_on_error;
270 |         }
271 |     };
272 | 
273 |     // Help strings
274 |     std::string version_string;
275 |     std::string usage_unamed_opts;
276 | 
277 |     // Flags
278 |     bool opterr {true};
279 |     bool continue_on_error {false};
280 |     bool long_only {false};
281 |     bool abbreviated_long_name {true};
282 | 
283 |     // Abort reason
284 |     abort_reason abort {abort_reason::no_abort};
285 |     std::string abort_option;
286 | 
287 |     // Arguments
288 |     std::vector<std::string> argvs;
289 |     std::vector<std::string> unamed_args;
290 | 
291 |     // Maps
292 |     std::map<char,
293 |              std::pair<arg_mode,
294 |                        std::function <bool(const std::string&)>>> short_map;
295 |     std::map<std::string,
296 |              std::pair<arg_mode,
297 |                        std::function <bool(const std::string&)>>> long_map;
298 |     std::map<std::string, std::vector<std::string>> help_map;
299 |   };
300 | 
301 |   inline
302 |   parser::parser ()
303 |   {
304 | #ifdef PACKAGE_STRING
305 |     // Build version_string
306 |     std::stringstream ss;
307 |     ss << PACKAGE_STRING << std::endl;
308 | 
309 | #ifdef PACKAGE_COPYRIGHT
310 |     ss << PACKAGE_COPYRIGHT << std::endl;
311 | #ifdef PACKAGE_LICENSE
312 |     ss << PACKAGE_LICENSE << std::endl;
313 | #endif // PACKAGE_LICENSE
314 | #endif // PACKAGE_COPYRIGHT
315 | 
316 | #ifdef PACKAGE_URL
317 |     ss << std::endl << PACKAGE_URL << std::endl;
318 | #endif // PACKAGE_URL
319 | 
320 |     version_string = ss.str ();
321 | #endif // PACKAGE_STRING
322 |   }
323 | 
324 |   inline void
325 |   parser::add_short (char short_name, arg_mode has_arg,
326 |                      std::function<bool(const std::string&)> option_handler)
327 |   {
328 |     if (short_name)
329 |       {
330 |         short_map[short_name] =
331 |           std::pair<arg_mode, std::function <bool(const std::string&)>>
332 |           (has_arg, option_handler);
333 |       }
334 |   }
335 | 
336 |   inline void
337 |   parser::add_long (const std::string &long_name, arg_mode has_arg,
338 |                     std::function<bool(const std::string&)> option_handler)
339 |   {
340 |     if (!long_name.empty ())
341 |       {
342 |         long_map[long_name] =
343 |           std::pair<arg_mode, std::function <bool(const std::string&)>>
344 |           (has_arg, option_handler);
345 |       }
346 |   }
347 | 
348 |   inline void
349 |   parser::add_description (char short_name, const std::string &long_name,
350 |                            arg_mode has_arg,
351 |                            const std::string &description,
352 |                            const std::string &typestr,
353 |                            const std::string &header,
354 |                            const std::string &group)
355 |   {
356 |     std::stringstream ss;
357 | 
358 |     if (short_name || !long_name.empty () || !header.empty ())
359 |       {
360 |         ss << h_indent;
361 |         if (short_name)
362 |           {
363 |             ss << "-" << short_name;
364 |             if (long_name.empty () && !typestr.empty ())
365 |               ss << typestr;
366 |           }
367 |         if (!long_name.empty ())
368 |           {
369 |             if (short_name)
370 |               ss << ", ";
371 |             ss << "--" << long_name;
372 |             if (!typestr.empty ())
373 |               {
374 |                 switch (has_arg)
375 |                   {
376 |                   case arg_mode::no_argument:
377 |                     // Nothing to do
378 |                     break;
379 |                   case arg_mode::required_argument:
380 |                     ss << "=" << typestr;
381 |                     break;
382 |                   case arg_mode::optional_argument:
383 |                     ss << "[=" << typestr << "]";
384 |                     break;
385 |                   }
386 |               }
387 |           }
388 |         if (!header.empty ())
389 |           ss << header;
390 |         ss << std::endl;
391 |       }
392 | 
393 |     if (!description.empty ())
394 |       ss << description << std::endl;
395 | 
396 |     help_map[group].push_back (ss.str ());
397 |   }
398 | 
399 |   inline bool
400 |   parser::add_handler (char short_name, const std::string &long_name,
401 |                        arg_mode has_arg,
402 |                        std::function<bool(const std::string&)> option_handler,
403 |                        const std::string &description,
404 |                        const std::string &typestr,
405 |                        const std::string &header,
406 |                        const std::string &group)
407 |   {
408 |     if (short_name && short_map.find (short_name) != short_map.end ())
409 |       {
410 |         // The short name is already registerd.
411 |         return false;
412 |       }
413 |     if (!long_name.empty() && long_map.find (long_name) != long_map.end ())
414 |       {
415 |         // The long name is already registerd.
416 |         return false;
417 |       }
418 | 
419 |     add_short (short_name, has_arg, option_handler);
420 |     add_long (long_name, has_arg, option_handler);
421 | 
422 |     if (!description.empty () || !typestr.empty ())
423 |       add_description (short_name, long_name, has_arg,
424 |                        description, typestr, header, group);
425 | 
426 |     return true;
427 |   }
428 | 
429 |   inline bool
430 |   parser::add_flag (char short_name, const std::string &long_name,
431 |                     bool *flag,
432 |                     const std::string &description,
433 |                     const std::string &group)
434 |   {
435 |     *flag = false;
436 |     return add_handler (short_name, long_name, arg_mode::no_argument,
437 |                         [flag](const std::string &)->bool
438 |                         {
439 |                           *flag = true;
440 |                           return true;
441 |                         },
442 |                         description, "", "", group);
443 |   }
444 | 
445 |   inline bool
446 |   parser::add_string (char short_name, const std::string &long_name,
447 |                       std::string *var, const std::string &defval,
448 |                       const std::string &description,
449 |                       const std::string &typestr,
450 |                       const std::string &group)
451 |   {
452 |     *var = defval;
453 |     std::string header;
454 | 
455 |     if(!defval.empty ())
456 |       {
457 |         // Three spaces for separator (same as h_space)
458 |         header = _("   (default=") + defval + ")";
459 |       }
460 |     return add_handler (short_name, long_name, arg_mode::required_argument,
461 |                         [var](const std::string &optarg)->bool
462 |                         {
463 |                           *var = optarg;
464 |                           return true;
465 |                         },
466 |                         description, typestr, header, group);
467 |   }
468 | 
469 |   inline bool
470 |   parser::add_default_help (void)
471 |   {
472 |     return add_handler ('h', "help", arg_mode::no_argument,
473 |                         [this](const std::string &)->bool
474 |                         {
475 |                           std::cout << build_help ();
476 |                           return false;
477 |                         },
478 |                         // Four spaces for indent (same as d_indent)
479 |                         _("    Print help and exit"));
480 |   }
481 | 
482 |   inline bool
483 |   parser::add_default_version (void)
484 |   {
485 |     return add_handler ('V', "version", arg_mode::no_argument,
486 |                         [this](const std::string &)->bool
487 |                         {
488 |                           std::cout << version_string;
489 |                           return false;
490 |                         },
491 |                         // Four spaces for indent (same as d_indent)
492 |                         _("    Print version and exit"));
493 |   }
494 | 
495 |   inline bool
496 |   parser::find_unique_long_name (const std::string &long_name,
497 |                                  std::pair<arg_mode,
498 |                                  std::function<bool(const std::string&)>>
499 |                                  *target,
500 |                                  bool *ambiguous)
501 |   {
502 |     if (long_map.find (long_name) != long_map.end ())
503 |       {
504 |         // Long option name found
505 |         *target = long_map[long_name];
506 |         return true;
507 |       }
508 |     if (abbreviated_long_name)
509 |       {
510 |         // Search abbreviated long option name
511 |         for (decltype (long_map.cbegin()) it =
512 |                long_map.upper_bound (long_name);
513 |              it != long_map.cend ();
514 |              ++it)
515 |           {
516 |             if (it->first.substr (0, long_name.size ()) == long_name)
517 |               {
518 |                 auto next_it = it;
519 |                 ++next_it;
520 |                 if (next_it != long_map.cend () &&
521 |                     next_it->first.substr (0, long_name.size ()) == long_name)
522 |                   {
523 |                     // Failed: ambiguous option
524 |                     *ambiguous = true;
525 |                     return false;
526 |                   }
527 |                 // Unique abbreviated long option name fount
528 |                 *target = it->second;
529 |                 return true;
530 |               }
531 |           }
532 |       }
533 |     // Failed: unknown long option name
534 |     *ambiguous = false;
535 |     return false;
536 |   }
537 | 
538 |   inline bool
539 |   parser::parse_long_name (std::vector<std::string>::const_iterator *it)
540 |   {
541 |     size_t optchars_pos = (*it)->at(1) == '-' ?
542 |       2: // Double hyphen (--long_name) style
543 |       1; // Single hyphen (-long_name) style
544 |     auto delimiter_pos = (*it)->find ('=');
545 |     std::string long_name = (*it)->substr (optchars_pos,
546 |                                            delimiter_pos - optchars_pos);
547 |     std::string optarg;
548 | 
549 |     if (delimiter_pos != std::string::npos)
550 |       {
551 |         // Option characters have an option argument
552 |         // (something like --long_name=optarg)
553 |         optarg = (*it)->substr (delimiter_pos + 1, std::string::npos);
554 |       }
555 | 
556 |     std::pair<arg_mode, std::function<bool(const std::string&)>> target;
557 |     bool ambiguous;
558 |     if (find_unique_long_name (long_name, &target, &ambiguous))
559 |       {
560 |         // Long option name found
561 |         switch (target.first)
562 |           {
563 |           case arg_mode::no_argument:
564 |             if (delimiter_pos != std::string::npos)
565 |               {
566 |                 if (!error_extra_arg (long_name, optarg))
567 |                   {
568 |                     abort = abort_reason::error_extra_arg;
569 |                     abort_option = long_name;
570 |                     return false;
571 |                   }
572 |                 return true;
573 |               }
574 |             break;
575 |           case arg_mode::required_argument:
576 |             if (delimiter_pos == std::string::npos)
577 |               {
578 |                 if ((*it + 1) != argvs.cend ())
579 |                   {
580 |                     // Next argv-element is an option argument
581 |                     // (something like --long_name optarg)
582 |                     optarg = *(++(*it));
583 |                   }
584 |                 else
585 |                   {
586 |                     if (!error_no_arg (long_name))
587 |                       {
588 |                         abort = abort_reason::error_no_arg;
589 |                         abort_option = long_name;
590 |                         return false;
591 |                       }
592 |                     return true;
593 |                   }
594 |               }
595 |             break;
596 |           case arg_mode::optional_argument:
597 |             // Nothing to do
598 |             break;
599 |           }
600 |         // Call option handler
601 |         if (!target.second (optarg))
602 |           {
603 |             abort = abort_reason::option_handler;
604 |             abort_option = long_name;
605 |             return false;
606 |           }
607 |         return true;
608 |       }
609 | 
610 |     // Long option name did not find
611 |     if (optchars_pos == 1)
612 |       {
613 |         // Fallback to short option name
614 |         return parse_short_name (it);
615 |       }
616 |     if (ambiguous)
617 |       {
618 |         if (!error_ambiguous_option ((*it)->substr (optchars_pos,
619 |                                                     std::string::npos)))
620 |           {
621 |             abort = abort_reason::error_ambiguous_option;
622 |             abort_option = long_name;
623 |             return false;
624 |           }
625 |         return true;
626 |       }
627 |     if (!error_unknown_option ((*it)->substr (optchars_pos,
628 |                                               std::string::npos)))
629 |       {
630 |         abort = abort_reason::error_unknown_option;
631 |         abort_option = long_name;
632 |         return false;
633 |       }
634 |     return true;
635 |   }
636 | 
637 |   inline bool
638 |   parser::parse_short_name (std::vector<std::string>::const_iterator *it)
639 |   {
640 |     for (auto name_it = (*it)->cbegin () + 1;
641 |          name_it != (*it)->cend ();
642 |          ++name_it)
643 |       {
644 |         if (short_map.find (*name_it) != short_map.end())
645 |           {
646 |             // Short option name found
647 |             auto target = short_map[*name_it];
648 |             switch (target.first)
649 |               {
650 |               case arg_mode::no_argument:
651 |                 // Option characters doesn't have an option argument
652 |                 // (something like -o)
653 |                 if (!target.second (""))  // Call option handler
654 |                   {
655 |                     abort = abort_reason::option_handler;
656 |                     abort_option = *name_it;
657 |                     return false;
658 |                   }
659 |                 break;
660 |               case arg_mode::required_argument:
661 |                 if ((name_it + 1) != (*it)->cend ())
662 |                   {
663 |                     // Option characters have an option argument
664 |                     // (something like -ooptarg)
665 |                     std::string optarg (name_it + 1, (*it)->cend ());
666 |                     if (!target.second (optarg))  // Call option handler
667 |                       {
668 |                         abort = abort_reason::option_handler;
669 |                         abort_option = *name_it;
670 |                         return false;
671 |                       }
672 |                     return true;
673 |                   }
674 |                 else if ((*it + 1) != argvs.cend ())
675 |                   {
676 |                     // Next argv-element is an option argument
677 |                     // (something like -o optarg)
678 |                     std::string optarg = *(++(*it));
679 |                     if (!target.second (optarg))  // Call option handler
680 |                       {
681 |                         abort = abort_reason::option_handler;
682 |                         abort_option = *name_it;
683 |                         return false;
684 |                       }
685 |                     return true;
686 |                   }
687 |                 else
688 |                   {
689 |                     if (!error_no_arg_short (*name_it))
690 |                       {
691 |                         abort = abort_reason::error_no_arg_short;
692 |                         abort_option = *name_it;
693 |                         return false;
694 |                       }
695 |                   }
696 |                 break;
697 |               case arg_mode::optional_argument:
698 |                 if ((name_it + 1) != (*it)->cend ())
699 |                   {
700 |                     // Option characters have an option argument
701 |                     // (something like -ooptarg)
702 |                     std::string optarg (name_it + 1, (*it)->cend ());
703 |                     if (!target.second (optarg))  // Call option handler
704 |                       {
705 |                         abort = abort_reason::option_handler;
706 |                         abort_option = *name_it;
707 |                         return false;
708 |                       }
709 |                     return true;
710 |                   }
711 |                 // Option characters doesn't have an option argument
712 |                 // (something like -o)
713 |                 if (!target.second (""))  // Call option handler
714 |                   {
715 |                     abort = abort_reason::option_handler;
716 |                     abort_option = *name_it;
717 |                     return false;
718 |                   }
719 |                 break;
720 |               }
721 |           }
722 |         else
723 |           {
724 |             if (!error_unknown_option_short (*name_it))
725 |               {
726 |                 abort = abort_reason::error_unknown_option_short;
727 |                 abort_option = *name_it;
728 |                 return false;
729 |               }
730 |           }
731 |       }
732 |     return true;
733 |   }
734 | 
735 |   inline bool
736 |   parser::parse (int argc, char const* const* argv, int optind)
737 |   {
738 |     argvs.assign (argv, argv + argc);
739 | 
740 |     if (version_string.empty ())
741 |       version_string = argvs[0] + "\n";
742 | 
743 |     bool all_skip = false;
744 | 
745 |     for (auto argv_it = argvs.cbegin () + optind;
746 |          argv_it != argvs.cend ();
747 |          ++argv_it)
748 |       {
749 |         // Check skip
750 |         if (all_skip)
751 |           {
752 |             unamed_args.push_back (*argv_it);
753 |             continue;
754 |           }
755 | 
756 |         // Check "--" and "-"
757 |         // They are not option element.
758 |         if (*argv_it == "--")
759 |           {
760 |             all_skip = true;
761 |             continue;
762 |           }
763 |         else if (*argv_it == "-")
764 |           {
765 |             unamed_args.push_back (*argv_it);
766 |             continue;
767 |           }
768 | 
769 |         // Check option element
770 |         if (argv_it->substr (0, 2) == "--")
771 |           {
772 |             // Long option
773 |             if (!parse_long_name (&argv_it))
774 |               return false;
775 |           }
776 |         else if (argv_it->substr (0, 1) == "-")
777 |           {
778 |             if (long_only)
779 |               {
780 |                 // Long option
781 |                 if (!parse_long_name (&argv_it))
782 |                   return false;
783 |               }
784 |             else
785 |               {
786 |                 // Short option
787 |                 if (!parse_short_name (&argv_it))
788 |                   return false;
789 |               }
790 |           }
791 |         else
792 |           {
793 |             // It is not an option element.
794 |             unamed_args.push_back (*argv_it);
795 |           }
796 |       }
797 |     return true;
798 |   }
799 | 
800 |   inline std::string
801 |   parser::build_usage (void) const
802 |   {
803 |     std::stringstream ss;
804 | 
805 |     ss << _("Usage: ") << argvs[0] << _(" [options] ");
806 |     if (!usage_unamed_opts.empty ())
807 |       ss << "[" << usage_unamed_opts << "] ";
808 |     ss << "..." << std::endl;
809 | 
810 |     return ss.str ();
811 |   }
812 | 
813 |   inline std::string
814 |   parser::build_help (void) const
815 |   {
816 |     std::stringstream ss;
817 | 
818 |     ss << version_string << std::endl << build_usage () << std::endl;
819 | 
820 |     for (const auto &group: help_map)
821 |       {
822 |         if (!group.first.empty ())
823 |           ss << std::endl << group.first << ":" << std::endl;
824 |         for (const auto &description: group.second)
825 |           {
826 |             ss << description;
827 |           }
828 |       }
829 | 
830 |     return ss.str ();
831 |   }
832 | 
833 | }
834 | 
835 | #if defined (_) && defined (CMDLINEPARSE_HH_DEFINE_DUMMY_GETTEXT)
836 | #undef _
837 | #undef CMDLINEPARSE_HH_DEFINE_DUMMY_GETTEXT
838 | #endif
839 | 
840 | #endif
841 | 


--------------------------------------------------------------------------------