├── .clang-format ├── .gitignore ├── 1bit.png ├── COPYING ├── blocktest.cpp ├── bookmaker.cpp ├── bookparser.cpp ├── bookparser.hpp ├── chaptercommon.hpp ├── chapterformatter.cpp ├── chapterformatter.hpp ├── draftpaginator.cpp ├── draftpaginator.hpp ├── epub.cpp ├── epub.hpp ├── fchelpers.cpp ├── fchelpers.hpp ├── formatting.cpp ├── formatting.hpp ├── guitool.cpp ├── mdtool.cpp ├── meson.build ├── metadata.cpp ├── metadata.hpp ├── paginationtest.cpp ├── pangotest.cpp ├── paragraphformatter.cpp ├── paragraphformatter.hpp ├── pdfrenderer.cpp ├── pdfrenderer.hpp ├── printpaginator.cpp ├── printpaginator.hpp ├── readme.md ├── testdoc ├── book.css ├── chapter1.bd ├── chapter2.bd ├── chapterlong.bd ├── colophon.txt ├── credits.txt ├── dedication.txt ├── epub_cover.png ├── largesample.json ├── postcredits.txt ├── sample.json ├── signing.txt └── testimage.png ├── tests.cpp ├── textstats.cpp ├── textstats.hpp ├── units.hpp ├── utils.cpp ├── utils.hpp ├── voikkotest.cpp ├── wordhyphenator.cpp └── wordhyphenator.hpp /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | ColumnLimit: 100 3 | IndentWidth: 4 4 | AccessModifierOffset: -4 5 | UseTab: Never 6 | BreakBeforeBraces: Attach 7 | AllowShortIfStatementsOnASingleLine: false 8 | IndentCaseLabels: false 9 | Standard: Cpp11 10 | ReflowComments: true 11 | SortIncludes: false 12 | BinPackArguments: false 13 | BinPackParameters: false 14 | SpaceBeforeParens: Never 15 | SpaceAfterTemplateKeyword: false 16 | 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | .settings 3 | .project 4 | .cproject 5 | .pydevproject 6 | /build* 7 | *.user 8 | 9 | -------------------------------------------------------------------------------- /1bit.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jpakkane/chapterizer/ec384ca2c766a0a90babbaadac7e60a1b2cd31b9/1bit.png -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
// Experimental brute-force line breaker: distributes a list of words over lines
// whose widths are as even as possible. A "split" is the index of the word that
// starts a new line, so splits {2} means words [0, 2) on line one and the rest
// on line two.

const int space_width = 1;

// Lowest penalty found so far and the splits that produced it. Both must be
// reset by the caller before a new search is started.
double best_penalty = 1e9;
std::vector<int> best_splits;

// Width of the whole text laid out on a single line. Set by spread().
int64_t total_width;

// Width of one word; every byte counts as one unit.
int64_t width(const std::string &s) { return s.length(); }

// Width of the words in [f, t) when placed on one line, spaces included.
int64_t width_between(const std::vector<std::string> &words, size_t f, size_t t) {
    if(f == t) {
        return 0;
    }
    assert(f < t);
    int64_t total_w = 0;
    for(size_t i = f; i < t; ++i) {
        total_w += width(words[i]);
    }
    // One space between each pair of neighbouring words.
    total_w += space_width * (int64_t(t) - int64_t(f) - 1);
    return total_w;
}

// Sum of squared differences between each line's width and the average line
// width (total_width divided by the number of lines).
//
// `splits` holds ascending word indices. An empty list describes a single line
// containing every word.
double penalty_for_splits(const std::vector<std::string> &words, const std::vector<int> &splits) {
    const double average_width = total_width / (splits.size() + 1.0);
    double total_penalty = 0.0;
    size_t line_start = 0;
    // Every line that ends at a split. The earlier version started its loop at
    // index 1 and therefore never scored the line between splits[0] and splits[1].
    for(const int split : splits) {
        const double line_error = width_between(words, line_start, split) - average_width;
        total_penalty += line_error * line_error;
        line_start = split;
    }
    // The final line runs from the last split to the end of the text.
    const double last_error = width_between(words, line_start, words.size()) - average_width;
    total_penalty += last_error * last_error;
    return total_penalty;
}

// Recursively tries split positions after `word_index` and records the best
// complete layout in best_penalty / best_splits.
//
// A line may grow until it has exceeded target_width twice (overflow_steps),
// and at most line_estimate + 1 splits are explored.
void determine_best_recursive(const std::vector<std::string> &words,
                              const int64_t target_width,
                              const int64_t line_estimate,
                              const int word_index,
                              std::vector<int> &splits) {
    if((int64_t)splits.size() > line_estimate + 1) {
        return;
    }
    const int num_words = (int)words.size();
    if(word_index >= num_words) {
        // The caller passes i + 1, which can be one past the last word. There is
        // nothing left to place, and reading words[word_index] would be out of bounds.
        return;
    }

    int i = word_index;
    int64_t running_width = width(words[i]);
    int overflow_steps = 0;
    while(i < num_words && overflow_steps < 2) {
        ++i;
        if(i < num_words) {
            splits.push_back(i);
            // NOTE(review): the line that starts at word i is continued from word
            // i + 1, so running_width in the nested call does not include
            // words[i]. This looks like an off-by-one but is kept as is -- confirm.
            determine_best_recursive(words, target_width, line_estimate, i + 1, splits);
            splits.pop_back();
            running_width += space_width + width(words[i]);
        } else {
            // End reached. Without any split the text is one single line, which
            // is only a valid layout when it fits into the target width.
            if(!splits.empty() || running_width <= target_width) {
                const double total_penalty = penalty_for_splits(words, splits);
                if(total_penalty < best_penalty) {
                    best_penalty = total_penalty;
                    best_splits = splits;
                }
            }
        }
        if(running_width > target_width) {
            ++overflow_steps;
        }
    }
}

// Builds the text of each line from the split indices. Every word is followed
// by a single space, so lines carry a trailing space.
std::vector<std::string> splits_to_lines(const std::vector<std::string> &words,
                                         const std::vector<int> &splits) {
    std::vector<std::string> lines;
    lines.push_back("");
    size_t split_ind = 0;
    for(size_t i = 0; i < words.size(); ++i) {
        if(split_ind < splits.size() && i >= (size_t)splits[split_ind]) {
            lines.push_back("");
            ++split_ind;
        }
        lines.back() += words[i];
        lines.back() += ' ';
    }
    return lines;
}

// Runs the search from the first word and returns the lines of the best layout
// recorded in best_splits.
std::vector<std::string> determine_best(const std::vector<std::string> &words,
                                        const int64_t target_width,
                                        const int64_t line_estimate) {
    std::vector<int> splits;
    determine_best_recursive(words, target_width, line_estimate, 0, splits);
    return splits_to_lines(words, best_splits);
}

// Entry point: computes total_width, estimates the number of lines and returns
// the most even layout found. best_penalty and best_splits are not reset here.
std::vector<std::string> spread(const std::vector<std::string> &words, const int target_width) {
    if(words.empty()) {
        // words.size() - 1 would wrap around and words[0] does not exist.
        return {};
    }
    assert(target_width > 0);
    total_width = space_width * (int64_t(words.size()) - 1);
    for(const auto &w : words) {
        total_width += width(w);
    }
    const auto line_estimate = total_width / target_width + 1;
    // FIXME, try line_estimate \pm 1 and pick the "best".
    return determine_best(words, target_width, line_estimate);
}
107 | lines = determine_best(words, target_width, line_estimate); 108 | return lines; 109 | } 110 | 111 | void do_it(const std::vector &words, const int target_width) { 112 | best_penalty = 1e9; 113 | best_splits.clear(); 114 | auto spread_lines = spread(words, target_width); 115 | for(const auto &l : spread_lines) { 116 | printf("%s\n", l.c_str()); 117 | } 118 | } 119 | 120 | void test1() { 121 | std::vector text{ 122 | "Aaaaaaa", "aaaaa", "aaaaaa", "aaaaaaaaaaaaa,", "aaaaa", "aaaaaaaaa", 123 | "aaaaa", "aaaaa", "aaaaaaaaaaa", "aaaaaaaaa", "aaa", "aaaaaa,", 124 | "aaaa", "aaaaaa", "aaaa", "aaaaaaaaaaa", "aaaaaaaaaaa", "aaaaaaa.", 125 | "Aa", "aaaaa", "aaaaaaaaaaa", "aaaaa", "aaaaaaaaa", "aaaaa.", 126 | }; 127 | const int target_width = 60; 128 | do_it(text, target_width); 129 | } 130 | 131 | void test2() { 132 | std::vector text{"Pitkadana", "jotain", "pidempaeae", "kaikkeinpisin"}; 133 | const int target_width = 20; 134 | do_it(text, target_width); 135 | } 136 | 137 | int main(int, char **) { 138 | test1(); 139 | test2(); 140 | return 0; 141 | } 142 | -------------------------------------------------------------------------------- /bookmaker.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | Document load_document(const char *fname) { 26 | Document doc; 27 | doc.data = load_book_json(fname); 28 | 29 | StructureParser strucp(doc); 30 | for(const auto &s : doc.data.sources) { 31 | const auto fpath = doc.data.top_dir / s; 32 | MMapper map(fpath.c_str()); 33 | if(!g_utf8_validate(map.data(), map.size(), nullptr)) { 34 | printf("Invalid utf-8.\n"); 35 | std::abort(); 36 | } 37 | for(const auto c : map.view()) { 38 | if(c == '\t') { 39 | printf("Input file %s contains a TAB character. These are prohibited in input " 40 | "files.\n", 41 | fpath.c_str()); 42 | std::abort(); 43 | } 44 | if(c == 0 || c == '\n' || c >= 32 || c < 0) { 45 | // OK. 46 | } else { 47 | printf( 48 | "Input file %s contains a prohibited invisible ASCII control character %d.\n", 49 | fpath.c_str(), 50 | int(c)); 51 | std::abort(); 52 | } 53 | } 54 | 55 | GError *err = nullptr; 56 | if(err) { 57 | std::abort(); 58 | } 59 | 60 | LineParser linep(map.data(), map.size()); 61 | line_token token = linep.next(); 62 | while(!std::holds_alternative(token)) { 63 | strucp.push(token); 64 | token = linep.next(); 65 | } 66 | strucp.push(token); 67 | } 68 | return doc; 69 | } 70 | 71 | int main(int argc, char **argv) { 72 | if(argc != 2) { 73 | printf("%s \n", argv[0]); 74 | return 1; 75 | } 76 | auto doc = load_document(argv[1]); 77 | if(doc.data.generate_pdf) { 78 | if(doc.data.is_draft) { 79 | DraftPaginator p(doc); 80 | auto ofile = doc.data.top_dir / doc.data.pdf.ofname; 81 | p.generate_pdf(ofile.c_str()); 82 | } else { 83 | PrintPaginator p(doc); 84 | auto ofile = doc.data.top_dir / doc.data.pdf.ofname; 85 | p.generate_pdf(ofile.c_str()); 86 | } 87 | } 88 | if(doc.data.generate_epub) { 89 | Epub epub(doc); 90 | epub.generate(doc.data.epub.ofname.c_str()); 91 | } 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /bookparser.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | 23 | namespace { 24 | 25 | static const std::array superscript_numbers{ 26 | "⁰", "¹", "²", "³", "⁴", "⁵", "⁶", "⁷", "⁸", "⁹"}; 27 | 28 | const std::unordered_map specialmap{ 29 | {"code", SpecialBlockType::Code}, 30 | {"footnote", SpecialBlockType::Footnote}, 31 | {"numberlist", SpecialBlockType::NumberList}, 32 | {"letter", SpecialBlockType::Letter}, 33 | {"sign", SpecialBlockType::Sign}, 34 | }; 35 | 36 | } // namespace 37 | 38 | std::string get_normalized_string(std::string_view v) { 39 | gchar *norm = g_utf8_normalize(v.data(), v.length(), G_NORMALIZE_NFC); 40 | std::string result{norm}; 41 | g_free(norm); 42 | return result; 43 | } 44 | 45 | ReMatchOffsets ReMatchResult::offsets_for(int group) { 46 | gint start_pos, end_pos; 47 | g_match_info_fetch_pos(minfo.get(), gint(group), &start_pos, &end_pos); 48 | assert(group != 0 || start_pos == 0); 49 | return ReMatchOffsets{offset_to_match_start + start_pos, offset_to_match_start + end_pos}; 50 | } 51 | 52 | std::string_view ReMatchResult::view_for(int group, const char *original_data) { 53 | ReMatchOffsets off = offsets_for(group); 54 | return std::string_view(original_data + off.start_pos, off.end_pos - off.start_pos); 55 | } 56 
| 57 | line_token LineParser::next() { 58 | if(offset >= data_size) { 59 | return EndOfFile{}; 60 | } 61 | 62 | if(parsing_specialblock) { 63 | auto block_end = try_match(specialblock_end, GRegexMatchFlags(0)); 64 | if(block_end) { 65 | parsing_specialblock = false; 66 | return EndOfSpecialBlock{}; 67 | } else { 68 | auto full_line = try_match(line, GRegexMatchFlags(0)); 69 | auto nl = try_match(single_newline, GRegexMatchFlags(0)); 70 | if(!nl) { 71 | std::abort(); 72 | } 73 | if(full_line) { 74 | return PlainLine{full_line->whole_match}; 75 | } 76 | return PlainLine{std::string_view{}}; // Empty line. 77 | } 78 | } 79 | auto match_result = try_match(multi_newline, GRegexMatchFlags(0)); 80 | if(match_result) { 81 | if(match_result->whole_match.length() > 1) { 82 | return NewBlock{}; 83 | } 84 | return NewLine{}; 85 | } 86 | match_result = try_match(specialblock_start, GRegexMatchFlags(0)); 87 | if(match_result) { 88 | if(parsing_specialblock) { 89 | printf("Nested codeblocks not supported.\n"); 90 | std::abort(); 91 | } 92 | parsing_specialblock = true; 93 | if(!try_match(multi_newline, GRegexMatchFlags(0))) { 94 | std::abort(); 95 | } 96 | const auto block_name = match_result->view_for(1, data); 97 | auto it = specialmap.find(std::string{block_name}); 98 | if(it != specialmap.end()) { 99 | return StartOfSpecialBlock{it->second}; 100 | } 101 | std::string tmp{block_name}; 102 | printf("Unknown special block type: %s\n", tmp.c_str()); 103 | std::abort(); 104 | } 105 | match_result = try_match(specialblock_end, GRegexMatchFlags(0)); 106 | if(match_result) { 107 | printf("End of codeblock without start of same.\n"); 108 | std::abort(); 109 | } 110 | match_result = try_match(directive, GRegexMatchFlags(0)); 111 | if(match_result) { 112 | auto dir_name = match_result->view_for(1, data); 113 | if(dir_name == "s") { 114 | return SceneDecl{}; 115 | } else if(dir_name == "figure") { 116 | auto fname = match_result->view_for(2, data); 117 | while(!fname.empty() && 
fname.front() == ' ') { 118 | fname.remove_prefix(1); 119 | } 120 | return FigureDecl{std::string{fname}}; 121 | } else { 122 | std::string tmp{dir_name}; 123 | printf("Unknown directive '%s'.\n", tmp.c_str()); 124 | std::abort(); 125 | } 126 | } 127 | match_result = try_match(section, GRegexMatchFlags(0)); 128 | if(match_result) { 129 | const auto hash_offsets = match_result->offsets_for(1); 130 | const int depth = hash_offsets.end_pos - hash_offsets.start_pos; 131 | assert(depth == 1); // Fix eventually. 132 | 133 | return SectionDecl{depth, match_result->view_for(2, data)}; 134 | } 135 | match_result = try_match(line, GRegexMatchFlags(0)); 136 | if(match_result) { 137 | return PlainLine{match_result->whole_match}; 138 | } 139 | 140 | printf("Parsing failed."); 141 | std::abort(); 142 | } 143 | 144 | StructureParser::~StructureParser() { 145 | if(!stored_lines.empty()) { 146 | printf("Stored lines not fully drained.\n"); 147 | std::abort(); 148 | } 149 | g_regex_unref(escaping_command); 150 | } 151 | 152 | std::string StructureParser::pop_lines_to_string() { 153 | std::string line; 154 | for(const auto &l : stored_lines) { 155 | line += l; 156 | line += ' '; 157 | } 158 | line.pop_back(); 159 | stored_lines.clear(); 160 | return line; 161 | } 162 | 163 | std::vector StructureParser::pop_lines_to_paragraphs() { 164 | std::vector paras; 165 | std::string buf; 166 | for(auto &l : stored_lines) { 167 | if(l.empty()) { 168 | if(!buf.empty()) { 169 | paras.emplace_back(std::move(buf)); 170 | buf.clear(); 171 | } 172 | } else { 173 | if(buf.empty()) { 174 | buf = std::move(l); 175 | } else { 176 | buf += ' '; 177 | buf += l; 178 | } 179 | } 180 | } 181 | if(!buf.empty()) { 182 | paras.emplace_back(std::move(buf)); 183 | } 184 | stored_lines.clear(); 185 | return paras; 186 | } 187 | 188 | void StructureParser::build_element() { 189 | switch(current_state) { 190 | case ParsingState::unset: 191 | std::abort(); 192 | case ParsingState::specialblock: 193 | 
if(current_special == SpecialBlockType::Code) { 194 | doc.elements.emplace_back(CodeBlock{std::move(stored_lines)}); 195 | } else if(current_special == SpecialBlockType::Footnote) { 196 | doc.elements.emplace_back(Footnote{footnote_number, pop_lines_to_string()}); 197 | } else if(current_special == SpecialBlockType::NumberList) { 198 | doc.elements.emplace_back(NumberList{pop_lines_to_paragraphs()}); 199 | } else if(current_special == SpecialBlockType::Letter) { 200 | doc.elements.emplace_back(Letter{pop_lines_to_paragraphs()}); 201 | } else if(current_special == SpecialBlockType::Sign) { 202 | doc.elements.emplace_back(SignBlock{std::move(stored_lines)}); 203 | } else { 204 | std::abort(); 205 | } 206 | break; 207 | case ParsingState::section: 208 | doc.elements.emplace_back(Section{1, section_number, pop_lines_to_string()}); 209 | break; 210 | case ParsingState::paragraph: 211 | unquote_lines(); 212 | number_super_fix(); 213 | doc.elements.emplace_back(Paragraph{pop_lines_to_string()}); 214 | break; 215 | default: 216 | std::abort(); 217 | } 218 | } 219 | 220 | static gboolean eval_quote_cb(const GMatchInfo *info, GString *res, gpointer) { 221 | char tmp[2] = {0, 0}; 222 | gchar *match = g_match_info_fetch(info, 1); 223 | gchar *cur = match; 224 | while(*cur) { 225 | tmp[0] = special2internal(*cur); 226 | g_string_append(res, tmp); 227 | ++cur; 228 | } 229 | g_free(match); 230 | return FALSE; 231 | } 232 | 233 | static gboolean eval_supernum_cb(const GMatchInfo *info, GString *res, gpointer) { 234 | gchar *match = g_match_info_fetch(info, 1); 235 | gchar *cur = match; 236 | while(*cur) { 237 | const int offset = *cur - '0'; 238 | assert(offset >= 0 || offset < 10); 239 | g_string_append(res, superscript_numbers[offset]); 240 | ++cur; 241 | } 242 | g_free(match); 243 | return FALSE; 244 | } 245 | 246 | void StructureParser::unquote_lines() { 247 | std::string buf; 248 | for(auto &line : stored_lines) { 249 | GError *err = nullptr; 250 | auto replaced = 
g_regex_replace_eval(escaping_command, 251 | line.c_str(), 252 | line.length(), 253 | 0, 254 | GRegexMatchFlags(0), 255 | eval_quote_cb, 256 | nullptr, 257 | &err); 258 | if(err) { 259 | printf("Replacement error: %s\n", err->message); 260 | g_error_free(err); 261 | std::abort(); 262 | } 263 | line = replaced; 264 | g_free(replaced); 265 | } 266 | } 267 | 268 | void StructureParser::number_super_fix() { 269 | // https://gitlab.gnome.org/GNOME/pango/-/issues/702 270 | 271 | // This is not the correct place for this, especially when 272 | // considering footnotes. They should be stored externally 273 | // and formatted at the end. Do this to get started. 274 | std::string buf; 275 | for(auto &line : stored_lines) { 276 | GError *err = nullptr; 277 | auto replaced = g_regex_replace_eval(supernum_command, 278 | line.c_str(), 279 | line.length(), 280 | 0, 281 | GRegexMatchFlags(0), 282 | eval_supernum_cb, 283 | nullptr, 284 | &err); 285 | if(err) { 286 | printf("Replacement error: %s\n", err->message); 287 | g_error_free(err); 288 | std::abort(); 289 | } 290 | line = replaced; 291 | g_free(replaced); 292 | } 293 | } 294 | 295 | void StructureParser::set_state(ParsingState new_state) { 296 | assert(current_state == ParsingState::unset || (new_state != current_state)); 297 | if(current_state != ParsingState::unset) { 298 | build_element(); 299 | } 300 | assert(stored_lines.empty()); 301 | current_state = new_state; 302 | if(current_state == ParsingState::specialblock) { 303 | current_special = SpecialBlockType::Unset; 304 | } 305 | } 306 | 307 | void StructureParser::push(const line_token &l) { 308 | if(has_finished) { 309 | std::abort(); 310 | } 311 | if(std::holds_alternative(l)) { 312 | if(current_state == ParsingState::unset) { 313 | set_state(ParsingState::paragraph); 314 | } 315 | stored_lines.emplace_back(std::get(l).text); 316 | return; 317 | } 318 | 319 | if(std::holds_alternative(l)) { 320 | return; 321 | } 322 | 323 | if(std::holds_alternative(l)) { 324 | 
set_state(ParsingState::section); 325 | ++section_number; 326 | stored_lines.emplace_back(std::get(l).text); 327 | } else if(std::holds_alternative(l)) { 328 | set_state(ParsingState::specialblock); 329 | current_special = std::get(l).type; 330 | if(current_special == SpecialBlockType::Footnote) { 331 | ++footnote_number; 332 | } 333 | } else if(std::holds_alternative(l)) { 334 | set_state(ParsingState::unset); 335 | } else if(std::holds_alternative(l)) { 336 | set_state(ParsingState::unset); 337 | } else if(std::holds_alternative(l)) { 338 | set_state(ParsingState::unset); 339 | doc.elements.push_back(SceneChange{}); 340 | } else if(std::holds_alternative(l)) { 341 | set_state(ParsingState::unset); 342 | doc.elements.push_back(Figure{std::get(l).fname}); 343 | } else if(std::holds_alternative(l)) { 344 | set_state(ParsingState::unset); 345 | } else { 346 | std::abort(); 347 | } 348 | } 349 | 350 | void StructureParser::finish() { 351 | if(has_finished) { 352 | std::abort(); 353 | } 354 | // Get data from pending declarations (i.e. the last paragraph) 355 | has_finished = true; 356 | } 357 | -------------------------------------------------------------------------------- /bookparser.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | struct MatchDeleter final { 30 | void operator()(GMatchInfo *mi) { g_match_info_unref(mi); } 31 | }; 32 | 33 | typedef std::unique_ptr re_match; 34 | 35 | struct ReMatchOffsets { 36 | int64_t start_pos; 37 | int64_t end_pos; 38 | 39 | std::string get_normalized_string(const char *original_data) const { 40 | gchar *norm = 41 | g_utf8_normalize(original_data + start_pos, end_pos - start_pos, G_NORMALIZE_NFC); 42 | std::string result{norm}; 43 | g_free(norm); 44 | return result; 45 | } 46 | }; 47 | 48 | std::string get_normalized_string(std::string_view v); 49 | 50 | enum class SpecialBlockType : int { Code, Footnote, NumberList, Letter, Sign, Unset }; 51 | 52 | struct ReMatchResult { 53 | re_match minfo; 54 | int64_t offset_to_match_start; 55 | std::string_view whole_match; 56 | 57 | ReMatchOffsets offsets_for(int group); 58 | std::string_view view_for(int group, const char *original_data); 59 | }; 60 | 61 | struct SectionDecl { 62 | int level; 63 | std::string_view text; 64 | }; 65 | 66 | struct PlainLine { 67 | std::string_view text; 68 | }; 69 | 70 | struct NewLine {}; 71 | 72 | struct SceneDecl {}; 73 | 74 | struct FigureDecl { 75 | std::string fname; 76 | }; 77 | 78 | struct NewBlock {}; 79 | 80 | struct StartOfSpecialBlock { 81 | SpecialBlockType type; 82 | }; 83 | 84 | struct EndOfSpecialBlock {}; 85 | 86 | struct EndOfFile {}; 87 | 88 | typedef std::variant 97 | line_token; 98 | 99 | class LineParser { 100 | public: 101 | LineParser(const char *data_, const int64_t data_size_) : data(data_), data_size(data_size_) { 102 | whitespace = g_regex_new(" \\s+", GRegexCompileFlags(0), G_REGEX_MATCH_ANCHORED, nullptr); 103 | section = 104 | g_regex_new("(#+)\\s+(.*)", GRegexCompileFlags(0), G_REGEX_MATCH_ANCHORED, nullptr); 105 | line = g_regex_new(".+", GRegexCompileFlags(0), G_REGEX_MATCH_ANCHORED, 
nullptr); 106 | multi_newline = g_regex_new("\\n+", G_REGEX_MULTILINE, G_REGEX_MATCH_ANCHORED, nullptr); 107 | single_newline = g_regex_new("\\n", G_REGEX_MULTILINE, G_REGEX_MATCH_ANCHORED, nullptr); 108 | directive = g_regex_new( 109 | "#(\\w+)( +[^ ].*)?", GRegexCompileFlags(0), G_REGEX_MATCH_ANCHORED, nullptr); 110 | specialblock_start = 111 | g_regex_new("```(\\w+)", GRegexCompileFlags(0), G_REGEX_MATCH_ANCHORED, nullptr); 112 | specialblock_end = 113 | g_regex_new("``` *\n", G_REGEX_MULTILINE, G_REGEX_MATCH_ANCHORED, nullptr); 114 | } 115 | 116 | ~LineParser() { 117 | g_regex_unref(multi_newline); 118 | g_regex_unref(single_newline); 119 | g_regex_unref(section); 120 | g_regex_unref(line); 121 | g_regex_unref(whitespace); 122 | g_regex_unref(specialblock_start); 123 | g_regex_unref(specialblock_end); 124 | } 125 | 126 | line_token next(); 127 | 128 | private: 129 | std::optional try_match(GRegex *regex, GRegexMatchFlags flags) { 130 | GMatchInfo *minfo = nullptr; 131 | if(g_regex_match(regex, data + offset, flags, &minfo)) { 132 | ReMatchResult match_info{re_match{minfo}, offset, std::string_view{}}; 133 | match_info.whole_match = match_info.view_for(0, data); 134 | offset += match_info.whole_match.length(); 135 | return match_info; 136 | } 137 | g_match_info_free(minfo); 138 | return {}; 139 | } 140 | 141 | const char *data; 142 | bool parsing_specialblock = false; 143 | int64_t data_size; 144 | int64_t offset = 0; 145 | GRegex *whitespace; 146 | GRegex *section; 147 | GRegex *line; 148 | GRegex *single_newline; 149 | GRegex *multi_newline; 150 | GRegex *directive; 151 | GRegex *specialblock_start; 152 | GRegex *specialblock_end; 153 | }; 154 | 155 | class StructureParser { 156 | public: 157 | explicit StructureParser(Document &in_doc) : doc(in_doc) { 158 | escaping_command = 159 | g_regex_new(R"(\\c{([^}]+)})", GRegexCompileFlags(0), GRegexMatchFlags(0), nullptr); 160 | supernum_command = g_regex_new( 161 | R"(\\footnote{(\d+)})", GRegexCompileFlags(0), 
GRegexMatchFlags(0), nullptr); 162 | } 163 | 164 | ~StructureParser(); 165 | void push(const line_token &l); 166 | void finish(); 167 | 168 | private: 169 | enum class ParsingState : int { 170 | unset, 171 | paragraph, 172 | section, 173 | specialblock, 174 | }; 175 | 176 | void unquote_lines(); 177 | void number_super_fix(); 178 | 179 | void set_state(ParsingState new_state); 180 | 181 | void build_element(); 182 | 183 | std::string pop_lines_to_string(); 184 | 185 | std::vector pop_lines_to_paragraphs(); 186 | 187 | Document &doc; 188 | bool has_finished = false; 189 | int section_level = 1; // FIXME 190 | int section_number = 0; 191 | int footnote_number = 0; 192 | ParsingState current_state = ParsingState::unset; 193 | SpecialBlockType current_special = SpecialBlockType::Unset; 194 | std::vector stored_lines; 195 | GRegex *escaping_command; 196 | GRegex *supernum_command; 197 | }; 198 | -------------------------------------------------------------------------------- /chaptercommon.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | enum class ExtraPenaltyTypes : int { 26 | ConsecutiveDashes, 27 | // River, 28 | SingleWordLastLine, 29 | SplitWordLastLine, 30 | }; 31 | 32 | struct ExtraPenaltyStatistics { 33 | ExtraPenaltyTypes type; 34 | int line; 35 | double penalty; 36 | }; 37 | 38 | struct ExtraPenaltyAmounts { 39 | double multiple_dashes = 10; // Total is num_consecutive_dashes * multiple_dashes. 40 | // double river; 41 | double single_word_line = 10; 42 | double single_split_word_line = 500; 43 | }; 44 | 45 | // Some fonts have "medium" instead of "regular" as their weight. 46 | // For example Nimbus Roman. This is currently ignored, might need 47 | // to be fixed. 48 | enum class FontStyle : int { 49 | Regular, 50 | Italic, 51 | Bold, 52 | BoldItalic, 53 | }; 54 | 55 | struct FontParameters { 56 | std::string name; // Fontconfig name as a string. 57 | Length size = Length::from_pt(1000); // Be careful with comparisons. 58 | FontStyle type = FontStyle::Regular; 59 | 60 | bool operator==(const FontParameters &o) const noexcept { 61 | return name == o.name && (fabs((size - o.size).pt()) < 0.05) && type == o.type; 62 | } 63 | }; 64 | 65 | template<> struct std::hash { 66 | std::size_t operator()(FontParameters const &s) const noexcept { 67 | auto h1 = std::hash{}(int(10 * s.size.pt())); 68 | auto h2 = std::hash{}(int(s.type)); 69 | auto h3 = std::hash{}(s.name); 70 | return ((h1 * 13) + h2) * 13 + h3; 71 | } 72 | }; 73 | 74 | struct ChapterParameters { 75 | Length line_height; 76 | Length indent; // Of first line. 
77 | FontParameters font; 78 | bool indent_last_line = false; 79 | }; 80 | 81 | struct FontStyles { 82 | FontParameters basic; 83 | FontParameters heading; 84 | FontParameters code; 85 | FontParameters footnote; 86 | }; 87 | -------------------------------------------------------------------------------- /chapterformatter.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2025 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | ChapterFormatter::ChapterFormatter(const TextElementIterator &start_, 22 | const TextElementIterator &end_, 23 | const std::vector &elms, 24 | size_t target_height_) 25 | : start{start_}, end{end_}, elements{elms}, target_height{target_height_} {} 26 | 27 | PageLayoutResult ChapterFormatter::optimize_pages() { 28 | PageLayoutResult r; 29 | 30 | auto run_start = start; 31 | try { 32 | optimize_recursive(run_start, r, 0); 33 | } catch(const OptimalResultFound &) { 34 | } 35 | return std::move(best_layout); 36 | } 37 | 38 | bool ChapterFormatter::stop_recursing(TextElementIterator loc, const PageLayoutResult &r) { 39 | size_t max_reaches = 5; 40 | auto current_penalty = compute_penalties(r.pages).total_penalty; 41 | auto &reaches = best_reaches[loc]; 42 | if(reaches.size() >= max_reaches) { 43 | if(current_penalty >= reaches.back()) { 44 | return true; 45 | } 46 | reaches.pop_back(); 47 | } 48 | auto insertion_point = std::lower_bound(reaches.begin(), reaches.end(), current_penalty); 49 | reaches.insert(insertion_point, current_penalty); 50 | 51 | return false; 52 | } 53 | 54 | void ChapterFormatter::optimize_recursive( 55 | TextElementIterator run_start, 56 | PageLayoutResult &r, 57 | size_t previous_page_height, 58 | const std::optional incoming_pending_image) { 59 | size_t lines_on_page = 0; 60 | std::optional page_section_number; 61 | std::optional current_page_image; 62 | std::optional outgoing_pending_image; 63 | if(compute_penalties(r.pages).total_penalty > best_penalty) { 64 | return; 65 | } 66 | 67 | if(incoming_pending_image) { 68 | // FIXME, check for full page images. 
69 | current_page_image = incoming_pending_image; 70 | lines_on_page = current_page_image->height_in_lines; 71 | } 72 | for(TextElementIterator current = run_start; current != end; ++current) { 73 | auto push_and_resume = [&](const TextElementIterator &startpoint, 74 | const TextElementIterator &endpoint) { 75 | TextLimits limits{startpoint, endpoint}; 76 | if(page_section_number) { 77 | assert(!current_page_image); 78 | r.pages.emplace_back(SectionPage{*page_section_number, limits}); 79 | } else { 80 | r.pages.emplace_back(RegularPage{limits, {}, current_page_image}); 81 | } 82 | const size_t height_validation = r.pages.size(); 83 | optimize_recursive(endpoint, r, lines_on_page); 84 | assert(height_validation == r.pages.size()); 85 | r.pages.pop_back(); 86 | }; 87 | // Have we filled the current page? 88 | if(lines_on_page >= target_height) { 89 | // Exact. 90 | auto endpoint = current; 91 | push_and_resume(run_start, endpoint); 92 | // One line short 93 | --endpoint; 94 | push_and_resume(run_start, endpoint); 95 | // One line long 96 | if(current != end) { 97 | endpoint = current; 98 | ++endpoint; 99 | push_and_resume(run_start, endpoint); 100 | } 101 | // It's a bit weird to return here, but that's recursion for you. 102 | return; 103 | } 104 | if(auto *sec = std::get_if(¤t.element())) { 105 | // There can be only one of these in a chapter and it must come first. 106 | assert(current == run_start); 107 | assert(lines_on_page == 0); 108 | const size_t chapter_heading_top_whitespace = 8; 109 | lines_on_page += 110 | chapter_heading_top_whitespace; // Hack, replace with a proper whitespace element. 111 | page_section_number = sec->chapter_number; 112 | ++lines_on_page; 113 | } else if(auto *par = std::get_if(¤t.element())) { 114 | (void)par; 115 | ++lines_on_page; 116 | } else if(const auto *empty = std::get_if(¤t.element())) { 117 | if(lines_on_page != 0) { 118 | // Ignore empty space at the beginning of the line. 
119 | lines_on_page += empty->num_lines; 120 | } 121 | } else if(const auto *cb = std::get_if(¤t.element())) { 122 | (void)cb; 123 | ++lines_on_page; 124 | } else if(const auto *imel = std::get_if(¤t.element())) { 125 | if(lines_on_page + imel->height_in_lines > target_height) { 126 | assert(!outgoing_pending_image); 127 | outgoing_pending_image = *imel; 128 | } else { 129 | assert(!current_page_image); 130 | lines_on_page += imel->height_in_lines; 131 | current_page_image = *imel; 132 | } 133 | } else { 134 | // FIXME, add images etc. 135 | std::abort(); 136 | } 137 | } 138 | if(lines_on_page > 0) { 139 | TextLimits limits; 140 | limits.start = run_start; 141 | limits.end = end; 142 | if(page_section_number) { 143 | r.pages.emplace_back(SectionPage{page_section_number.value(), limits}); 144 | } else { 145 | r.pages.emplace_back(RegularPage{limits, {}, {}}); 146 | } 147 | } 148 | r.stats = compute_penalties(r.pages); 149 | if(r.stats.total_penalty < best_penalty) { 150 | best_layout = r; 151 | best_penalty = r.stats.total_penalty; 152 | if(best_penalty == 0) { 153 | throw OptimalResultFound{}; 154 | } 155 | } 156 | if(lines_on_page > 0) { 157 | r.pages.pop_back(); 158 | } 159 | } 160 | 161 | PageStatistics ChapterFormatter::compute_penalties(const std::vector &pages) const { 162 | PageStatistics stats; 163 | const size_t page_number_offset = 1; 164 | size_t even_page_height = 0; 165 | size_t odd_page_height = 0; 166 | for(size_t page_num = 0; page_num < pages.size(); ++page_num) { 167 | const auto &p = pages[page_num]; 168 | const size_t num_lines_on_page = lines_on_page(p); 169 | if((page_num + 1) % 2) { 170 | odd_page_height = num_lines_on_page; 171 | } else { 172 | even_page_height = num_lines_on_page; 173 | } 174 | const bool on_last_page = page_num == pages.size() - 1; 175 | const bool on_first_page = page_num == 0; 176 | const TextLimits *limits = nullptr; 177 | if(auto *rp = std::get_if(&p)) { 178 | limits = &rp->main_text; 179 | } else if(const auto *sp = 
std::get_if(&p)) { 180 | limits = &sp->main_text; 181 | } else { 182 | fprintf(stderr, "Unsupported.\n"); 183 | std::abort(); 184 | } 185 | const size_t first_element_id = limits->start.element_id; 186 | const size_t first_line_id = limits->start.line_id; 187 | const size_t last_element_id = limits->end.element_id; 188 | const size_t last_line_id = limits->end.line_id; 189 | 190 | const auto &start_lines = get_lines(elements[first_element_id]); 191 | if(last_element_id >= elements.size()) { 192 | if(first_element_id == elements.size() - 1 && first_line_id == start_lines.size() - 1) { 193 | stats.single_line_last_page = true; 194 | stats.total_penalty += SingleLinePage; 195 | continue; 196 | } 197 | } 198 | if(last_element_id < elements.size()) { 199 | const auto &end_lines = get_lines(elements[last_element_id]); 200 | 201 | // Orphan (single line at the end of a page) 202 | if(end_lines.size() > 1 && last_line_id == 1) { 203 | stats.orphans.push_back(page_number_offset + page_num); 204 | stats.total_penalty += OrphanPenalty; 205 | } 206 | } 207 | // These are only counted for "regular" pages. 
208 | if(std::holds_alternative(p)) { 209 | // Widow 210 | if(start_lines.size() > 1 && first_line_id == start_lines.size() - 1) { 211 | stats.widows.push_back(page_number_offset + page_num); 212 | stats.total_penalty += WidowPenalty; 213 | } 214 | // Mismatch 215 | if(!on_first_page && !on_last_page && (((page_num + 1) % 2) == 1)) { 216 | if(even_page_height != odd_page_height) { 217 | const auto mismatch_amount = 218 | (int64_t)even_page_height - (int64_t)odd_page_height; 219 | stats.mismatches.emplace_back( 220 | HeightMismatch{page_number_offset + page_num, mismatch_amount}); 221 | stats.total_penalty += abs(mismatch_amount) * MismatchPenalty; 222 | } 223 | } 224 | } 225 | } 226 | return stats; 227 | } 228 | -------------------------------------------------------------------------------- /chapterformatter.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2025 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | struct OptimalResultFound {}; 22 | 23 | class ChapterFormatter { 24 | public: 25 | ChapterFormatter(const TextElementIterator &start, 26 | const TextElementIterator &end, 27 | const std::vector &elms, 28 | size_t target_height); 29 | 30 | PageLayoutResult optimize_pages(); 31 | 32 | private: 33 | static constexpr size_t WidowPenalty = 10; 34 | static constexpr size_t OrphanPenalty = 10; 35 | static constexpr size_t MismatchPenalty = 7; 36 | static constexpr size_t SingleLinePage = 1000; 37 | 38 | PageStatistics compute_penalties(const std::vector &pages) const; 39 | 40 | void optimize_recursive(TextElementIterator run_start, 41 | PageLayoutResult &r, 42 | size_t previous_page_height, 43 | const std::optional incoming_pending_image = {}); 44 | 45 | bool stop_recursing(TextElementIterator loc, const PageLayoutResult &r); 46 | 47 | const TextElementIterator start; 48 | const TextElementIterator end; 49 | const std::vector elements; 50 | 51 | size_t best_penalty = size_t(-1); 52 | PageLayoutResult best_layout; 53 | const size_t target_height; 54 | 55 | std::unordered_map> best_reaches; 56 | }; 57 | -------------------------------------------------------------------------------- /draftpaginator.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include 26 | 27 | struct MarkupDrawCommand { 28 | std::string markup; 29 | const FontParameters *font; 30 | Length x; 31 | Length y; 32 | TextAlignment alignment; 33 | }; 34 | 35 | struct JustifiedMarkupDrawCommand { 36 | std::vector markup_words; 37 | const FontParameters *font; 38 | Length x; 39 | Length y; 40 | Length width; 41 | }; 42 | 43 | struct ImageCommand { 44 | ImageInfo i; 45 | Length x; // Relative to left edge of text block. 46 | Length y; 47 | Length display_height; 48 | Length display_width; 49 | }; 50 | 51 | typedef std::variant TextCommands; 52 | 53 | struct PageLayout { 54 | std::vector images; 55 | std::vector text; 56 | std::vector footnote; 57 | 58 | bool empty() const { return text.empty() && footnote.empty() && images.empty(); } 59 | 60 | void clear() { 61 | images.clear(); 62 | text.clear(); 63 | footnote.clear(); 64 | } 65 | }; 66 | 67 | struct Heights { 68 | Length figure_height; 69 | Length text_height; 70 | Length footnote_height; 71 | Length whitespace_height; 72 | 73 | Length total_height() const { 74 | return figure_height + text_height + footnote_height + whitespace_height; 75 | } 76 | 77 | void clear() { 78 | figure_height = text_height = footnote_height = whitespace_height = Length::zero(); 79 | } 80 | }; 81 | 82 | std::vector extract_styling(StyleStack ¤t_style, std::string &word); 83 | 84 | class DraftPaginator { 85 | public: 86 | DraftPaginator(const Document &d); 87 | 88 | void generate_pdf(const char *outfile); 89 | 90 | void draw_debug_bars(int num_bars, const Length bar_start_y); 91 | 92 | private: 93 | void render_page_num(const FontParameters &par); 94 | std::vector 95 | build_justified_paragraph(const std::vector> &lines, 96 | const ChapterParameters &text_par, 97 | const Length target_width, 98 | const Length x_off = Length::zero(), 99 | const Length y_off = Length::zero()); 100 | std::vector 101 | 
build_ragged_paragraph(const std::vector> &lines, 102 | const ChapterParameters &text_par, 103 | const TextAlignment alignment, 104 | Length rel_y); 105 | std::vector text_to_formatted_words(const std::string &text, 106 | bool permit_hyphenation = true); 107 | 108 | Length current_left_margin() const { return current_page % 2 ? m.inner : m.outer; } 109 | 110 | void new_page(bool draw_page_num); 111 | 112 | void flush_draw_commands(); 113 | 114 | Length textblock_width() const { return page.w - m.inner - m.outer; } 115 | Length textblock_height() const { return page.h - m.upper - m.lower; } 116 | 117 | void add_pending_figure(const ImageInfo &f); 118 | void add_top_image(const ImageInfo &image); 119 | 120 | void create_draft_title_page(); 121 | void create_maintext(); 122 | 123 | void create_section(const Section &s, 124 | const ExtraPenaltyAmounts &extras, 125 | Length &rel_y, 126 | bool &first_section, 127 | bool &first_paragraph); 128 | void create_paragraph(const Paragraph &p, 129 | const ExtraPenaltyAmounts &extras, 130 | Length &rel_y, 131 | const Length &bottom_watermark, 132 | const ChapterParameters &chpar, 133 | Length extra_indent); 134 | void create_footnote(const Footnote &f, 135 | const ExtraPenaltyAmounts &extras, 136 | const Length &bottom_watermark); 137 | void create_numberlist(const NumberList &nl, Length &rel_y, const ExtraPenaltyAmounts &extras); 138 | 139 | int count_words(); 140 | 141 | const Document &doc; 142 | // These are just helpers to cut down on typing. 143 | const PageSize &page; 144 | const ChapterStyles &styles; 145 | const Spaces &spaces; 146 | const Margins &m; 147 | std::unique_ptr rend; 148 | WordHyphenator hyphen; 149 | int current_page = 1; 150 | int chapter_start_page = -1; 151 | 152 | // These keep track of the current page stats. 
153 | PageLayout layout; 154 | Heights heights; 155 | std::vector pending_figures; 156 | std::vector pending_footnotes; 157 | }; 158 | -------------------------------------------------------------------------------- /epub.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | class Epub { 26 | public: 27 | explicit Epub(const Document &d); 28 | ~Epub(); 29 | 30 | void generate(const char *ofilename); 31 | 32 | private: 33 | void write_opf(const std::filesystem::path &ofile); 34 | void write_ncx(const char *ofile); 35 | void write_frontmatter(const std::filesystem::path &outdir); 36 | void write_chapters(const std::filesystem::path &outdir); 37 | void write_footnotes(const std::filesystem::path &outdir); 38 | void write_navmap(tinyxml2::XMLElement *root); 39 | void generate_epub_manifest(tinyxml2::XMLNode *manifest); 40 | void generate_spine(tinyxml2::XMLNode *spine); 41 | 42 | void write_paragraph(tinyxml2::XMLDocument &epubdoc, 43 | tinyxml2::XMLElement *body, 44 | const Paragraph &par, 45 | const char *classname); 46 | 47 | std::string get_epub_image_path(const std::string &fs_name); 48 | const Document &doc; 49 | 50 | std::filesystem::path oebpsdir; 51 | std::unordered_map imagenames; 52 | 
// Returns true when `string_to_search` begins with `to_search`.
// An empty `to_search` matches every string. Both arguments must be valid
// NUL-terminated strings.
bool startswith(const char *string_to_search, const char *to_search) {
    const size_t prefix_len = strlen(to_search);
    const int difference = strncmp(string_to_search, to_search, prefix_len);
    return difference == 0;
}
51 | 52 | bool is_bad(char *fontname) { 53 | if(startswith(fontname, "Noto ")) { 54 | if(strcmp(fontname, "Noto Sans") == 0) { 55 | return false; 56 | } 57 | if(strcmp(fontname, "Noto Serif") == 0) { 58 | return false; 59 | } 60 | return true; 61 | } 62 | if(strstr(fontname, "TeX")) { 63 | return true; 64 | } 65 | for(const char *w : wanted_words) { 66 | if(strstr(fontname, w) != 0) { 67 | return false; 68 | } 69 | } 70 | return true; 71 | } 72 | 73 | } // namespace 74 | 75 | std::vector get_fontnames_smart() { 76 | std::set store; 77 | std::vector fontnames; 78 | FcPattern *pattern = FcPatternCreate(); 79 | FcObjectSet *oset = FcObjectSetBuild(FC_FAMILY, FC_STYLE, FC_FILE, (char *)0); 80 | 81 | FcFontSet *fset = FcFontList(0, pattern, oset); 82 | 83 | for(int i = 0; i < fset->nfont; ++i) { 84 | FcChar8 *s = FcPatternFormat(fset->fonts[i], tmpl); 85 | if(!is_bad((char *)s)) { 86 | store.insert((char *)s); 87 | } 88 | FcStrFree(s); 89 | } 90 | 91 | for(const auto &s : store) { 92 | fontnames.push_back(s); 93 | printf("%s\n", s.c_str()); 94 | } 95 | printf("Found %d fonts.\n", (int)fontnames.size()); 96 | FcFontSetDestroy(fset); 97 | FcObjectSetDestroy(oset); 98 | FcPatternDestroy(pattern); 99 | return fontnames; 100 | } 101 | -------------------------------------------------------------------------------- /fchelpers.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | std::vector get_fontnames_smart(); 23 | -------------------------------------------------------------------------------- /formatting.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "formatting.hpp" 18 | -------------------------------------------------------------------------------- /formatting.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | const char ITALIC_S = 1; 26 | const char BOLD_S = (1 << 1); 27 | const char TT_S = (1 << 2); 28 | const char SMALLCAPS_S = (1 << 3); 29 | const char SUPERSCRIPT_S = (1 << 4); 30 | const char SUBSCRIPT_S = (1 << 5); 31 | 32 | const char italic_character = '/'; 33 | const char bold_character = '*'; 34 | const char tt_character = '`'; 35 | const char smallcaps_character = '|'; 36 | const char superscript_character = '^'; 37 | const char subscript_character = '_'; 38 | 39 | inline bool is_stylechar(char c) { 40 | return c == italic_character || c == bold_character || c == tt_character || 41 | c == smallcaps_character || c == superscript_character || c == subscript_character; 42 | } 43 | 44 | const uint32_t italic_codepoint = '/'; 45 | const uint32_t bold_codepoint = '*'; 46 | const uint32_t tt_codepoint = '`'; 47 | const uint32_t smallcaps_codepoint = '|'; 48 | const uint32_t superscript_codepoint = '^'; 49 | const uint32_t subscript_codepoint = '_'; 50 | 51 | template class SmallStack final { 52 | 53 | public: 54 | typedef T value_type; 55 | 56 | SmallStack() { 57 | tt_start_tag = ""; 58 | tt_end_tag = ""; 59 | } 60 | 61 | explicit SmallStack(const FontParameters &inline_typewriter_font) { 62 | const int buf_size = 1024; 63 | char buf[buf_size]; 64 | 65 | if(inline_typewriter_font.name.find('"') != std::string::npos) { 66 | std::abort(); 67 | } 68 | snprintf(buf, 69 | buf_size, 70 | R"()", 71 | inline_typewriter_font.name.c_str(), 72 | inline_typewriter_font.size.pt()); 73 | tt_start_tag = buf; 74 | tt_end_tag = ""; 75 | } 76 | 77 | bool empty() const { return size == 0; } 78 | 79 | bool contains(T val) const { 80 | for(int i = 0; i < size; ++i) { 81 | if(arr[i] == val) { 82 | return true; 83 | } 84 | } 85 | return false; 86 | } 87 | 88 | void push(T new_val) { 89 | if(contains(new_val)) { 90 | printf("Tried to push an element that is already in the stack.\n"); 
91 | std::abort(); 92 | } 93 | if(size >= max_elements) { 94 | printf("Stack overflow.\n"); 95 | std::abort(); 96 | } 97 | arr[size] = new_val; 98 | ++size; 99 | } 100 | 101 | void pop(T new_val) { 102 | if(empty()) { 103 | printf("Tried to pop an empty stack.\n"); 104 | std::abort(); 105 | } 106 | if(arr[size - 1] != new_val) { 107 | printf("Tried to pop a different value than is at the end of the stack.\n"); 108 | std::abort(); 109 | } 110 | --size; 111 | } 112 | 113 | bool operator==(const SmallStack &other) const { 114 | if(size != other.size) { 115 | return false; 116 | } 117 | for(int i = 0; i < size; ++i) { 118 | if(arr[i] != other.arr[i]) { 119 | return false; 120 | } 121 | } 122 | return true; 123 | } 124 | 125 | void write_buildup_markup(std::string &buf) const { 126 | for(int i = 0; i < size; ++i) { 127 | switch(arr[i]) { 128 | case ITALIC_S: 129 | buf += ""; 130 | break; 131 | case BOLD_S: 132 | buf += ""; 133 | break; 134 | case TT_S: 135 | buf += tt_start_tag; 136 | break; 137 | case SMALLCAPS_S: 138 | buf += ""; 139 | break; 140 | case SUPERSCRIPT_S: 141 | buf += ""; 142 | break; 143 | case SUBSCRIPT_S: 144 | buf += ""; 145 | break; 146 | default: 147 | std::abort(); 148 | } 149 | } 150 | } 151 | 152 | void write_teardown_markup(std::string &buf) const { 153 | 154 | for(int i = size - 1; i >= 0; --i) { 155 | switch(arr[i]) { 156 | case ITALIC_S: 157 | buf += ""; 158 | break; 159 | case BOLD_S: 160 | buf += ""; 161 | break; 162 | case TT_S: 163 | buf += tt_end_tag; 164 | break; 165 | case SMALLCAPS_S: 166 | buf += ""; 167 | break; 168 | case SUPERSCRIPT_S: 169 | buf += ""; 170 | break; 171 | case SUBSCRIPT_S: 172 | buf += ""; 173 | break; 174 | default: 175 | std::abort(); 176 | } 177 | } 178 | } 179 | 180 | const T *cbegin() const { return arr; } 181 | const T *cend() const { return arr + size; } 182 | 183 | const T *crbegin() const { return arr + size - 1; } // FIXME, need to use -- to progress. 
184 | const T *crend() const { return arr - 1; } 185 | 186 | const std::string &inline_code_start_tag() const { return tt_start_tag; } 187 | const std::string &inline_code_end_tag() const { return tt_end_tag; } 188 | 189 | private: 190 | T arr[max_elements]; 191 | std::string tt_start_tag; 192 | std::string tt_end_tag; 193 | int size = 0; 194 | }; 195 | 196 | typedef SmallStack StyleStack; 197 | 198 | struct FormattingChange { 199 | size_t offset; 200 | char format; 201 | }; 202 | 203 | struct EnrichedWord { 204 | std::string text; 205 | std::vector hyphen_points; 206 | std::vector f; 207 | StyleStack start_style; 208 | }; 209 | -------------------------------------------------------------------------------- /mdtool.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | 20 | int main(int argc, char **argv) { 21 | if(argc != 2) { 22 | printf("Fail.\n"); 23 | return 1; 24 | } 25 | Metadata m; 26 | try { 27 | m = load_book_json(argv[1]); 28 | } catch(const std::exception &e) { 29 | printf("%s\n", e.what()); 30 | return 1; 31 | } 32 | printf("Author is: %s\n", m.author.c_str()); 33 | printf("%d source files\n", (int)m.sources.size()); 34 | 35 | return 0; 36 | } 37 | -------------------------------------------------------------------------------- /meson.build: -------------------------------------------------------------------------------- 1 | project('chapterizer', 'cpp', default_options : ['cpp_std=c++20']) 2 | 3 | cpp = meson.get_compiler('cpp') 4 | hyphen_dep = cpp.find_library('hyphen') 5 | ft_dep = dependency('freetype2') 6 | cairo_dep = dependency('cairo') 7 | pangocairo_dep = dependency('pangocairo') 8 | gtk4_dep = dependency('gtk4', required: false, method: 'pkg-config') 9 | fc_dep = dependency('fontconfig') 10 | glib_dep = dependency('glib-2.0') 11 | tixml_dep = dependency('tinyxml2') 12 | nljson_dep = dependency('nlohmann_json') 13 | voikko_dep = dependency('libvoikko') 14 | 15 | add_project_arguments('-Wshadow', language: 'cpp') 16 | 17 | l = static_library('chap', 18 | 'wordhyphenator.cpp', 19 | 'paragraphformatter.cpp', 20 | 'textstats.cpp', 21 | 'pdfrenderer.cpp', 22 | 'utils.cpp', 23 | 'fchelpers.cpp', 24 | 'formatting.cpp', 25 | 'bookparser.cpp', 26 | 'epub.cpp', 27 | 'draftpaginator.cpp', 28 | 'printpaginator.cpp', 29 | 'chapterformatter.cpp', 30 | 'metadata.cpp', 31 | dependencies: [hyphen_dep, pangocairo_dep, cairo_dep, fc_dep, voikko_dep] 32 | ) 33 | 34 | executable('pangotest', 'pangotest.cpp', dependencies: pangocairo_dep) 35 | 36 | executable('bookmaker', 'bookmaker.cpp', 37 | link_with: l, 38 | dependencies: [pangocairo_dep, tixml_dep]) 39 | 40 | if gtk4_dep.found() 41 | executable('guitool', 'guitool.cpp', 42 | link_with: l, 43 | dependencies: [gtk4_dep]) 44 | 
endif 45 | 46 | executable('tests', 'tests.cpp', 47 | link_with: [l], 48 | dependencies: glib_dep) 49 | 50 | executable('mdtool', 'mdtool.cpp', 51 | link_with: l) 52 | 53 | executable('voikkotest', 'voikkotest.cpp', 54 | dependencies: voikko_dep) 55 | 56 | executable('blocktest', 'blocktest.cpp') 57 | 58 | executable('paginationtest', 'paginationtest.cpp', 59 | dependencies: cairo_dep) 60 | -------------------------------------------------------------------------------- /metadata.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | namespace { 24 | 25 | const std::unordered_map langmap{ 26 | {"unk", Language::Unset}, {"en", Language::English}, {"fi", Language::Finnish}}; 27 | 28 | const std::unordered_map stylemap{{"regular", FontStyle::Regular}, 29 | {"italic", FontStyle::Italic}, 30 | {"bold", FontStyle::Bold}, 31 | {"bolditalic", FontStyle::BoldItalic}}; 32 | 33 | std::vector extract_stringarray(const nlohmann::json &data, const char *entryname) { 34 | std::vector result; 35 | auto arr = data[entryname]; 36 | if(!arr.is_array()) { 37 | printf("%s must be an array of strings.\n", entryname); 38 | std::abort(); 39 | } 40 | for(const auto &e : arr) { 41 | if(!e.is_string()) { 42 | printf("Source array %s entry is not a string.\n", entryname); 43 | std::abort(); 44 | } 45 | result.push_back(e.get()); 46 | } 47 | return result; 48 | } 49 | 50 | } // namespace 51 | 52 | using json = nlohmann::json; 53 | 54 | std::string get_string(const json &data, const char *key) { 55 | if(!data.contains(key)) { 56 | printf("Missing required key %s.\n", key); 57 | std::abort(); 58 | } 59 | auto value = data[key]; 60 | if(!value.is_string()) { 61 | printf("Element %s is not a string.\n", key); 62 | std::abort(); 63 | } 64 | return value.get(); 65 | } 66 | 67 | double get_double(const json &data, const char *key) { 68 | if(!data.contains(key)) { 69 | printf("Missing required key %s.\n", key); 70 | std::abort(); 71 | } 72 | auto value = data[key]; 73 | if(!value.is_number()) { 74 | printf("Element %s is not a string.\n", key); 75 | std::abort(); 76 | } 77 | return value.get(); 78 | } 79 | 80 | int get_int(const json &data, const char *key) { 81 | if(!data.contains(key)) { 82 | printf("Missing required key %s.\n", key); 83 | std::abort(); 84 | } 85 | auto value = data[key]; 86 | if(!value.is_number_integer()) { 87 | printf("Element %s is not a string.\n", key); 88 | std::abort(); 89 | } 90 | return value.get(); 91 | } 92 | 
93 | ChapterParameters parse_chapterstyle(const json &data) { 94 | ChapterParameters chapter_style; 95 | chapter_style.line_height = Length::from_pt(get_double(data, "line_height")); 96 | chapter_style.indent = Length::from_mm(get_double(data, "indent")); 97 | auto font = data["font"]; 98 | chapter_style.font.name = get_string(font, "name"); 99 | const auto stylestr = get_string(font, "type"); 100 | auto it = stylemap.find(stylestr); 101 | if(it == stylemap.end()) { 102 | printf("Unknown type \"%s\".", stylestr.c_str()); 103 | std::abort(); 104 | } 105 | chapter_style.font.type = it->second; 106 | chapter_style.font.size = Length::from_pt(get_double(font, "pointsize")); 107 | auto fit = data.find("justify_last"); 108 | if(fit != data.end()) { 109 | chapter_style.indent_last_line = fit->get(); 110 | } 111 | return chapter_style; 112 | } 113 | 114 | void setup_draft_settings(Metadata &m) { 115 | // Fonts 116 | m.pdf.styles.normal.font.name = "Liberation Serif"; 117 | m.pdf.styles.normal.font.size = Length::from_pt(12); 118 | m.pdf.styles.normal.font.type = FontStyle::Regular; 119 | m.pdf.styles.normal.indent = Length::from_mm(10); 120 | m.pdf.styles.normal.line_height = Length::from_pt(20); 121 | const auto &normal = m.pdf.styles.normal; 122 | m.pdf.styles.normal_noindent = normal; 123 | m.pdf.styles.normal_noindent.indent = Length::zero(); 124 | 125 | m.pdf.styles.code = normal; 126 | m.pdf.styles.code.font.name = "Liberation Mono"; 127 | m.pdf.styles.code.font.size = Length::from_pt(10); 128 | m.pdf.styles.colophon = normal; 129 | m.pdf.styles.dedication = normal; 130 | m.pdf.styles.footnote = normal; 131 | m.pdf.styles.lists = normal; 132 | m.pdf.styles.letter = normal; 133 | m.pdf.styles.letter.font.type = FontStyle::Italic; 134 | 135 | m.pdf.styles.section.font.name = "Liberation Sans"; 136 | m.pdf.styles.section.font.size = Length::from_pt(14); 137 | m.pdf.styles.section.font.type = FontStyle::Bold; 138 | m.pdf.styles.section.line_height = Length::from_pt(25); 
139 | 140 | m.pdf.styles.title = m.pdf.styles.section; 141 | m.pdf.styles.author = m.pdf.styles.section; 142 | m.pdf.styles.author.font.type = FontStyle::Regular; 143 | 144 | // Page 145 | m.pdf.page.w = Length::from_mm(210); 146 | m.pdf.page.h = Length::from_mm(297); 147 | m.pdf.margins.inner = Length::from_mm(25.4); 148 | m.pdf.margins.outer = Length::from_mm(25.4); 149 | // Should be exactly an inch, but the font does not divide it 150 | // cleanly so there is more empty space at the bottom than 151 | // there should be. 152 | m.pdf.margins.upper = Length::from_mm(20.4) + m.pdf.styles.normal.line_height; 153 | m.pdf.margins.lower = Length::from_mm(20.4); 154 | 155 | // Spaces 156 | m.pdf.spaces.below_section = Length::from_mm(0); 157 | m.pdf.spaces.above_section = Length::from_mm(60); 158 | m.pdf.spaces.codeblock_indent = Length::from_mm(20); 159 | m.pdf.spaces.letter_indent = Length::from_mm(20); 160 | m.pdf.spaces.different_paragraphs = Length::from_mm(5); 161 | m.pdf.spaces.footnote_separation = Length::from_mm(5); 162 | } 163 | 164 | void load_pdf_element(Metadata &m, const json &pdf) { 165 | m.pdf.ofname = get_string(pdf, "filename"); 166 | if(m.is_draft) { 167 | std::filesystem::path ofname = m.pdf.ofname; 168 | auto draftname = ofname.stem(); 169 | draftname += "-draft"; 170 | draftname.replace_extension(ofname.extension()); 171 | ofname.replace_filename(draftname); 172 | m.pdf.ofname = ofname.c_str(); 173 | } 174 | auto page = pdf["page"]; 175 | auto margins = pdf["margins"]; 176 | if(pdf.contains("bleed")) { 177 | m.pdf.bleed = Length::from_mm(get_double(pdf, "bleed")); 178 | } else { 179 | m.pdf.bleed = Length::zero(); 180 | } 181 | 182 | if(pdf.contains("colophon")) { 183 | auto colophon_file = m.top_dir / get_string(pdf, "colophon"); 184 | m.pdf.colophon = read_lines(colophon_file.c_str()); 185 | } 186 | if(m.is_draft) { 187 | setup_draft_settings(m); 188 | } else { 189 | 190 | m.pdf.page.w = Length::from_mm(get_int(page, "width")); 191 | 
m.pdf.page.h = Length::from_mm(get_int(page, "height")); 192 | m.pdf.margins.inner = Length::from_mm(get_int(margins, "inner")); 193 | m.pdf.margins.outer = Length::from_mm(get_int(margins, "outer")); 194 | m.pdf.margins.upper = Length::from_mm(get_int(margins, "upper")); 195 | m.pdf.margins.lower = Length::from_mm(get_int(margins, "lower")); 196 | 197 | auto styles = pdf["styles"]; 198 | m.pdf.styles.normal = parse_chapterstyle(styles["normal"]); 199 | m.pdf.styles.normal_noindent = m.pdf.styles.normal; 200 | m.pdf.styles.normal_noindent.indent = Length::zero(); 201 | m.pdf.styles.section = parse_chapterstyle(styles["section"]); 202 | m.pdf.styles.code = parse_chapterstyle(styles["code"]); 203 | m.pdf.styles.letter = parse_chapterstyle(styles["letter"]); 204 | m.pdf.styles.footnote = parse_chapterstyle(styles["footnote"]); 205 | m.pdf.styles.lists = parse_chapterstyle(styles["lists"]); 206 | m.pdf.styles.title = parse_chapterstyle(styles["title"]); 207 | m.pdf.styles.author = parse_chapterstyle(styles["author"]); 208 | m.pdf.styles.colophon = parse_chapterstyle(styles["colophon"]); 209 | m.pdf.styles.dedication = parse_chapterstyle(styles["dedication"]); 210 | 211 | auto spaces = pdf["spaces"]; 212 | m.pdf.spaces.above_section = Length::from_mm(get_double(spaces, "above_section")); 213 | m.pdf.spaces.below_section = Length::from_mm(get_double(spaces, "below_section")); 214 | m.pdf.spaces.different_paragraphs = 215 | Length::from_mm(get_double(spaces, "different_paragraphs")); 216 | m.pdf.spaces.codeblock_indent = Length::from_mm(get_double(spaces, "codeblock_indent")); 217 | m.pdf.spaces.letter_indent = Length::from_mm(get_double(spaces, "letter_indent")); 218 | m.pdf.spaces.footnote_separation = 219 | Length::from_mm(get_double(spaces, "footnote_separation")); 220 | } 221 | } 222 | 223 | void load_epub_element(Metadata &m, const json &epub) { 224 | m.epub.ofname = get_string(epub, "filename"); 225 | m.epub.cover = get_string(epub, "cover"); 226 | m.epub.ISBN = 
get_string(epub, "ISBN"); 227 | m.epub.stylesheet = get_string(epub, "stylesheet"); 228 | m.epub.file_as = get_string(epub, "file_as"); 229 | } 230 | 231 | void strip(std::string &s) { 232 | while(!s.empty() && s.back() == ' ') { 233 | s.pop_back(); 234 | } 235 | while(!s.empty() && s.front() == ' ') { 236 | s.erase(0, 1); 237 | } 238 | } 239 | 240 | std::vector load_credits(const char *credits_path) { 241 | std::vector c; 242 | std::string key, val; 243 | for(const auto &l : read_lines(credits_path)) { 244 | const auto p = l.find('+'); 245 | if(p == std::string::npos) { 246 | val = l; 247 | strip(val); 248 | c.emplace_back(CreditsTitle{std::move(val)}); 249 | } else { 250 | key = l.substr(0, p); 251 | val = l.substr(p + 1, std::string::npos); 252 | strip(key); 253 | strip(val); 254 | c.emplace_back(CreditsEntry{std::move(key), std::move(val)}); 255 | } 256 | } 257 | return c; 258 | } 259 | 260 | Metadata load_book_json(const char *path) { 261 | Metadata m; 262 | std::filesystem::path json_file(path); 263 | m.top_dir = json_file.parent_path(); 264 | std::ifstream ifile(path); 265 | if(ifile.fail()) { 266 | printf("Could not open file %s.\n", path); 267 | std::abort(); 268 | } 269 | 270 | json data = json::parse(ifile); 271 | assert(data.is_object()); 272 | m.author = get_string(data, "author"); 273 | m.title = get_string(data, "title"); 274 | auto draft = data["draft"]; 275 | if(!draft.is_null()) { 276 | m.is_draft = true; 277 | m.draftdata.email = get_string(draft, "email"); 278 | m.draftdata.phone = get_string(draft, "phone"); 279 | m.draftdata.surname = get_string(draft, "surname"); 280 | m.draftdata.page_number_template = m.draftdata.surname + " / " + m.title + " / "; 281 | } 282 | if(data.contains("debug_draw")) { 283 | m.debug_draw = data["debug_draw"].get(); 284 | } 285 | const auto langstr = get_string(data, "language"); 286 | auto it = langmap.find(langstr); 287 | if(it == langmap.end()) { 288 | printf("Unsupported language %s\n", langstr.c_str()); 289 | 
std::abort(); 290 | } 291 | m.language = it->second; 292 | 293 | auto frontmatter = extract_stringarray(data, "frontmatter"); 294 | for(const auto &text : frontmatter) { 295 | if(text == "empty") { 296 | m.frontmatter.emplace_back(Empty{}); 297 | } else if(text == "colophon.txt") { 298 | auto cfile = m.top_dir / text; 299 | m.frontmatter.emplace_back(Colophon{read_lines(cfile.c_str())}); 300 | } else if(text == "dedication.txt") { 301 | auto dfile = m.top_dir / text; 302 | m.frontmatter.emplace_back(Dedication{read_lines(dfile.c_str())}); 303 | } else if(text == "firstpage") { 304 | m.frontmatter.emplace_back(FirstPage{}); 305 | } else if(text == "signing.txt") { 306 | auto sfile = m.top_dir / text; 307 | m.frontmatter.emplace_back(Signing{read_lines(sfile.c_str())}); 308 | } else { 309 | fprintf(stderr, "Not supported yet.\n"); 310 | std::abort(); 311 | } 312 | } 313 | m.sources = extract_stringarray(data, "sources"); 314 | auto backmatter = extract_stringarray(data, "backmatter"); 315 | for(const auto &text : backmatter) { 316 | if(text == "credits.txt") { 317 | auto credits_path = m.top_dir / text; 318 | m.credits = load_credits(credits_path.c_str()); 319 | } else { 320 | fprintf(stderr, "Backmatter not yet supported.\n"); 321 | std::abort(); 322 | } 323 | } 324 | 325 | if(data.contains("pdf")) { 326 | m.generate_pdf = true; 327 | load_pdf_element(m, data["pdf"]); 328 | } else { 329 | m.generate_pdf = false; 330 | } 331 | 332 | if(data.contains("epub")) { 333 | m.generate_epub = true; 334 | load_epub_element(m, data["epub"]); 335 | } else { 336 | m.generate_epub = false; 337 | } 338 | 339 | return m; 340 | } 341 | 342 | int Document::num_chapters() const { 343 | return std::count_if(elements.begin(), elements.end(), [](const DocElement &e) { 344 | return std::holds_alternative
(e); 345 | }); 346 | } 347 | 348 | int Document::num_footnotes() const { 349 | return std::count_if(elements.begin(), elements.end(), [](const DocElement &e) { 350 | return std::holds_alternative(e); 351 | }); 352 | } 353 | -------------------------------------------------------------------------------- /metadata.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | enum class Language : int { 28 | Unset, 29 | English, 30 | Finnish, 31 | }; 32 | 33 | struct Margins { 34 | Length inner = Length::from_mm(20); 35 | Length outer = Length::from_mm(15); 36 | Length upper = Length::from_mm(15); 37 | Length lower = Length::from_mm(15); 38 | }; 39 | 40 | struct PageSize { 41 | Length w; 42 | Length h; 43 | }; 44 | 45 | struct CreditsTitle { 46 | std::string line; 47 | }; 48 | 49 | struct CreditsEntry { 50 | std::string key; 51 | std::string value; 52 | }; 53 | 54 | typedef std::variant Credits; 55 | 56 | struct ChapterStyles { 57 | ChapterParameters normal; 58 | ChapterParameters normal_noindent; 59 | ChapterParameters code; 60 | ChapterParameters section; 61 | ChapterParameters letter; 62 | ChapterParameters footnote; 63 | ChapterParameters lists; 64 | ChapterParameters title; 65 | ChapterParameters author; 66 | ChapterParameters colophon; 67 | ChapterParameters dedication; 68 | }; 69 | 70 | struct Spaces { 71 | Length above_section; 72 | Length below_section; 73 | Length different_paragraphs; 74 | Length letter_indent; 75 | Length codeblock_indent; 76 | Length footnote_separation; 77 | }; 78 | 79 | struct PdfMetadata { 80 | std::string ofname; 81 | std::vector colophon; 82 | PageSize page; 83 | Length bleed; 84 | Margins margins; 85 | ChapterStyles styles; 86 | Spaces spaces; 87 | }; 88 | 89 | struct EpubMetadata { 90 | std::string ofname; 91 | std::string ISBN; 92 | std::string cover; 93 | std::string stylesheet; 94 | std::string file_as; 95 | }; 96 | 97 | struct DraftData { 98 | std::string surname; 99 | std::string email; 100 | std::string phone; 101 | std::string page_number_template; 102 | }; 103 | 104 | struct Colophon { 105 | std::vector lines; 106 | }; 107 | 108 | struct Dedication { 109 | std::vector lines; 110 | }; 111 | 112 | struct Empty {}; 113 | 114 | struct FirstPage {}; 115 | 116 | 
struct Signing { 117 | std::vector lines; 118 | }; 119 | 120 | typedef std::variant FrontMatterPage; 121 | 122 | struct Metadata { 123 | // All paths in metadata are relative to this (i.e. where the JSON file was) 124 | std::filesystem::path top_dir; 125 | std::string title; 126 | std::string author; 127 | bool is_draft = false; 128 | DraftData draftdata; 129 | Language language; 130 | std::vector frontmatter; 131 | std::vector sources; 132 | std::vector backmatter; 133 | bool generate_pdf; 134 | bool generate_epub; 135 | PdfMetadata pdf; 136 | EpubMetadata epub; 137 | // FIXME, remove these three. 138 | std::vector dedication; 139 | std::vector credits; 140 | std::vector postcredits; 141 | bool debug_draw = false; 142 | }; 143 | 144 | struct Paragraph { 145 | std::string text; 146 | }; 147 | 148 | struct Section { 149 | int level; 150 | int number; 151 | std::string text; 152 | }; 153 | 154 | struct Letter { 155 | std::vector paragraphs; 156 | }; 157 | 158 | struct CodeBlock { 159 | std::vector raw_lines; 160 | }; 161 | 162 | struct SignBlock { 163 | std::vector raw_lines; 164 | }; 165 | 166 | struct Footnote { 167 | int number; 168 | std::string text; 169 | }; 170 | 171 | struct NumberList { 172 | std::vector items; 173 | }; 174 | 175 | struct Figure { 176 | std::string file; 177 | }; 178 | 179 | struct SceneChange {}; 180 | 181 | // Also needs images, footnotes, unformatted text etc. 
182 | typedef std::variant 191 | DocElement; 192 | 193 | struct Document { 194 | Metadata data; 195 | std::vector elements; 196 | 197 | int num_chapters() const; 198 | int num_footnotes() const; 199 | }; 200 | 201 | Metadata load_book_json(const char *path); 202 | -------------------------------------------------------------------------------- /pangotest.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | int main() { 24 | setlocale(LC_ALL, ""); 25 | // cairo_status_t status; 26 | cairo_surface_t *surface = cairo_pdf_surface_create("pangocairotest.pdf", 595, 842); 27 | cairo_t *cr = cairo_create(surface); 28 | PangoLayout *layout = pango_cairo_create_layout(cr); 29 | PangoFontDescription *desc; 30 | desc = pango_font_description_from_string("Gentium"); 31 | assert(desc); 32 | pango_font_description_set_absolute_size(desc, 10 * PANGO_SCALE); 33 | pango_layout_set_font_description(layout, desc); 34 | pango_font_description_free(desc); 35 | 36 | cairo_move_to(cr, 72, 72); 37 | pango_layout_set_markup(layout, "This is a line of text.", -1); 38 | pango_cairo_update_layout(cr, layout); 39 | const auto plain_baseline = pango_layout_get_baseline(layout); 40 | pango_cairo_show_layout(cr, layout); 41 | pango_layout_set_attributes(layout, NULL); 42 | 43 | cairo_move_to(cr, 72, 72 + 1 * 12); 44 | pango_layout_set_markup(layout, "This is a line of text.", -1); 45 | pango_cairo_update_layout(cr, layout); 46 | pango_cairo_show_layout(cr, layout); 47 | pango_layout_set_attributes(layout, NULL); 48 | 49 | pango_layout_set_markup(layout, "This is a line of text.", -1); 50 | pango_cairo_update_layout(cr, layout); 51 | auto current_baseline = pango_layout_get_baseline(layout); 52 | cairo_move_to(cr, 72, 72 + 2 * 12 + (plain_baseline - current_baseline) / PANGO_SCALE); 53 | pango_cairo_show_layout(cr, layout); 54 | pango_layout_set_attributes(layout, NULL); 55 | 56 | cairo_move_to(cr, 72, 72 + 3 * 12); 57 | pango_layout_set_markup(layout, "This is a line of text & an ampersand.", -1); 58 | pango_cairo_update_layout(cr, layout); 59 | pango_cairo_show_layout(cr, layout); 60 | pango_layout_set_attributes(layout, NULL); 61 | 62 | cairo_move_to(cr, 72, 72 + 4 * 12); 63 | pango_layout_set_markup( 64 | layout, "This is a line of text.", -1); 65 | pango_cairo_update_layout(cr, layout); 66 | 
pango_cairo_show_layout(cr, layout); 67 | pango_layout_set_attributes(layout, NULL); 68 | 69 | cairo_move_to(cr, 72, 72 + 5 * 12); 70 | pango_layout_set_markup(layout, "This is a line of text.", -1); 71 | pango_cairo_update_layout(cr, layout); 72 | pango_cairo_show_layout(cr, layout); 73 | pango_layout_set_attributes(layout, NULL); 74 | 75 | cairo_move_to(cr, 72, 72 + 6 * 12); 76 | pango_layout_set_markup( 77 | layout, "This is a line of text.", -1); 78 | pango_cairo_update_layout(cr, layout); 79 | pango_cairo_show_layout(cr, layout); 80 | pango_layout_set_attributes(layout, NULL); 81 | 82 | cairo_move_to(cr, 72, 72 + 7 * 12); 83 | pango_layout_set_markup(layout, "This is a line of text.", -1); 84 | pango_cairo_update_layout(cr, layout); 85 | pango_cairo_show_layout(cr, layout); 86 | pango_layout_set_attributes(layout, NULL); 87 | cairo_surface_destroy(surface); 88 | 89 | cairo_destroy(cr); 90 | return 0; 91 | } 92 | -------------------------------------------------------------------------------- /paragraphformatter.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | class TextStats; 27 | 28 | // clang-format off 29 | 30 | /* 31 | * Numbering of splitting points is tricky. 
32 | * 33 | * You want to be able to do build_line(from_ind, to_ind) 34 | * 35 | * 0 1 2 3 4 36 | * | | | | | 37 | * V V V V V 38 | * foo-bar fu-baz 39 | * 40 | * Thus it needs a split point at both ends. 41 | * 42 | * Word index _can_ point one-past-the-end. 43 | * 44 | * Hyphen index can _not_ point one-past-the-end. 45 | */ 46 | 47 | // clang-format on 48 | 49 | struct BetweenWordSplit { 50 | size_t word_index; 51 | }; 52 | 53 | struct WithinWordSplit { 54 | size_t word_index; 55 | size_t hyphen_index; 56 | }; 57 | 58 | struct TextLocation { 59 | size_t word_index; 60 | size_t offset; // in characters 61 | }; 62 | 63 | struct LineStats { 64 | size_t end_split; 65 | Length text_width; 66 | // int num_spaces; 67 | bool ends_in_dash; 68 | }; 69 | 70 | struct UpTo { 71 | double penalty; 72 | std::vector splits; 73 | 74 | bool operator<(const UpTo &o) const { return penalty < o.penalty; } 75 | }; 76 | 77 | struct WordStart { 78 | size_t word; 79 | size_t from_bytes; 80 | }; 81 | 82 | struct WordEnd { 83 | size_t word; 84 | size_t to_bytes; 85 | bool add_dash; 86 | }; 87 | 88 | struct WordsOnLine { 89 | std::optional first; 90 | size_t full_word_begin; 91 | size_t full_word_end; 92 | std::optional last; 93 | }; 94 | 95 | struct SplitStates { 96 | size_t cache_size = 12; 97 | std::vector> best_to; 98 | 99 | void clear() { best_to.clear(); } 100 | 101 | bool abandon_search(const std::vector &new_splits, const double new_penalty); 102 | }; 103 | 104 | struct LinePenaltyStatistics { 105 | Length delta; 106 | double penalty; 107 | }; 108 | 109 | typedef std::variant SplitPoint; 110 | 111 | struct PenaltyStatistics { 112 | std::vector lines; 113 | std::vector extras; 114 | }; 115 | 116 | PenaltyStatistics compute_stats(const std::vector &lines, 117 | const Length paragraph_width, 118 | const ChapterParameters &par, 119 | const ExtraPenaltyAmounts &amounts); 120 | 121 | class ParagraphFormatter { 122 | public: 123 | ParagraphFormatter(const std::vector &words, 124 | const 
Length target_width, 125 | const ChapterParameters &in_params, 126 | const ExtraPenaltyAmounts &ea); 127 | 128 | std::vector split_lines(); 129 | std::vector> split_formatted_lines(); 130 | 131 | double paragraph_end_penalty(const std::vector &lines) const; 132 | 133 | private: 134 | void precompute(); 135 | TextLocation point_to_location(const SplitPoint &p) const; 136 | LineStats 137 | get_closest_line_end(size_t start_split, const TextStats &shaper, size_t line_num) const; 138 | LineStats 139 | compute_closest_line_end(size_t start_split, const TextStats &shaper, size_t line_num) const; 140 | 141 | std::vector 142 | get_line_end_choices(size_t start_split, const TextStats &shaper, size_t line_num) const; 143 | 144 | std::vector simple_split(TextStats &shaper); 145 | std::vector> global_split_markup(const TextStats &shaper); 146 | void global_split_recursive(const TextStats &shaper, 147 | std::vector &line_stats, 148 | size_t split_pos); 149 | std::vector> 150 | stats_to_markup_lines(const std::vector &linestats) const; 151 | Length current_line_width(size_t line_num) const; 152 | double total_penalty(const std::vector &lines, bool is_complete = false) const; 153 | 154 | WordsOnLine words_for_splits(size_t from_split_ind, size_t to_split_ind) const; 155 | std::string build_line_markup(size_t from_split_ind, size_t to_split_ind) const; 156 | std::string build_line_text_debug(size_t from_split_ind, size_t to_split_ind) const; 157 | std::vector build_line_words_markup(size_t from_split_ind, 158 | size_t to_split_ind) const; 159 | 160 | Length paragraph_width; 161 | std::vector words; 162 | std::vector split_points; 163 | std::vector split_locations; 164 | 165 | StyleStack determine_style(TextLocation t) const; 166 | 167 | double best_penalty = 1e100; 168 | std::vector best_split; 169 | 170 | // Cached results of best states we have achieved thus far. 
171 | SplitStates state_cache; 172 | ChapterParameters params; 173 | ExtraPenaltyAmounts extras; 174 | 175 | mutable std::unordered_map closest_line_ends; 176 | }; 177 | -------------------------------------------------------------------------------- /pdfrenderer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "pdfrenderer.hpp" 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | #include 24 | #include 25 | 26 | static TextStats hack{}; 27 | 28 | #include 29 | 30 | namespace { 31 | 32 | const char *workname = "turbotempfile.pdf"; 33 | 34 | std::vector hack_split(const std::string &in_text) { 35 | std::string text; 36 | text.reserve(in_text.size()); 37 | for(size_t i = 0; i < in_text.size(); ++i) { 38 | if(in_text[i] == '\n') { 39 | text.push_back(' '); 40 | } else { 41 | text.push_back(in_text[i]); 42 | } 43 | } 44 | std::string val; 45 | const char separator = ' '; 46 | std::vector words; 47 | std::stringstream sstream(text); 48 | while(std::getline(sstream, val, separator)) { 49 | words.push_back(val); 50 | } 51 | return words; 52 | } 53 | 54 | uint32_t get_last_char(const std::string &markup) { 55 | const gchar *txt = markup.c_str(); 56 | int angle_depth = 0; 57 | uint32_t last_char = 0; 58 | while(*txt) { 59 | const uint32_t cur_char = g_utf8_get_char(txt); 60 | 
if(cur_char == '<') { 61 | ++angle_depth; 62 | } else if(cur_char == '>') { 63 | --angle_depth; 64 | } else if(angle_depth == 0) { 65 | last_char = cur_char; // FIXME, won't work with quoted angle brackets. 66 | } 67 | txt = g_utf8_next_char(txt); 68 | } 69 | return last_char; 70 | } 71 | 72 | } // namespace 73 | 74 | PdfRenderer::PdfRenderer(const char *ofname, 75 | Length pagew, 76 | Length pageh, 77 | Length bleed_, 78 | const char *title, 79 | const char *author) 80 | : bleed{bleed_.pt()}, mediaw{pagew.pt() + 2 * bleed}, mediah{pageh.pt() + 2 * bleed} { 81 | 82 | surf = cairo_pdf_surface_create(workname, mediaw, mediah); 83 | cairo_pdf_surface_set_metadata(surf, CAIRO_PDF_METADATA_TITLE, title); 84 | cairo_pdf_surface_set_metadata(surf, CAIRO_PDF_METADATA_AUTHOR, author); 85 | cairo_pdf_surface_set_metadata(surf, CAIRO_PDF_METADATA_CREATOR, "Superpdf from Outer Space!"); 86 | cr = cairo_create(surf); 87 | layout = pango_cairo_create_layout(cr); 88 | PangoContext *context = pango_layout_get_context(layout); 89 | pango_context_set_round_glyph_positions(context, FALSE); 90 | init_page(); 91 | 92 | // Can't use scale to draw in millimeters because it also scales text size. 93 | // cairo_scale(cr, 595.0 / 21.0, 595.0 / 21.0); 94 | 95 | // cairo_select_font_face(cr, "Gentium", CAIRO_FONT_SLANT_NORMAL, CAIRO_FONT_WEIGHT_NORMAL); 96 | // cairo_set_font_size(cr, 10.0); 97 | cairo_set_line_width(cr, 0.1); 98 | outname = ofname; 99 | } 100 | 101 | PdfRenderer::~PdfRenderer() { 102 | for(auto &it : loaded_images) { 103 | cairo_surface_destroy(it.second); 104 | } 105 | g_object_unref(G_OBJECT(layout)); 106 | cairo_destroy(cr); 107 | cairo_surface_destroy(surf); 108 | 109 | // Cairo only supports RGB output, so convert to Gray. 
110 | assert(outname.find('"') == std::string::npos); 111 | std::string graycmd{"gs \"-sOutputFile="}; 112 | graycmd += outname; 113 | graycmd += "\" -sDEVICE=pdfwrite -sColorConversionStrategy=Gray " 114 | "-dProcessColorModel=/DeviceGray -dCompatibilityLevel=1.6 -dNOPAUSE -dBATCH \""; 115 | graycmd += workname; 116 | graycmd += "\""; 117 | // printf("%s\n", graycmd.c_str()); 118 | auto rc = system(graycmd.c_str()); 119 | if(rc != 0) { 120 | std::abort(); 121 | } 122 | unlink(workname); 123 | } 124 | 125 | void PdfRenderer::draw_grid() { 126 | cairo_set_source_rgb(cr, 0.9, 0.9, 0.9); 127 | cairo_set_line_width(cr, 0.1); 128 | for(double x = 0.0; x < mm2pt(210); x += mm2pt(10)) { 129 | cairo_move_to(cr, x, 0); 130 | cairo_line_to(cr, x, mm2pt(300)); 131 | } 132 | for(double y = 0.0; y < mm2pt(300); y += mm2pt(10)) { 133 | cairo_move_to(cr, 0, y); 134 | cairo_line_to(cr, mm2pt(210), y); 135 | } 136 | cairo_stroke(cr); 137 | } 138 | 139 | void PdfRenderer::draw_box(Length x, Length y, Length w, Length h, Length thickness) { 140 | cairo_save(cr); 141 | cairo_set_line_width(cr, thickness.pt()); 142 | cairo_set_source_rgb(cr, 0.0, 0.0, 0.0); 143 | cairo_move_to(cr, x.pt(), y.pt()); 144 | cairo_line_to(cr, (x + w).pt(), y.pt()); 145 | cairo_line_to(cr, (x + w).pt(), (y + h).pt()); 146 | cairo_line_to(cr, x.pt(), (y + h).pt()); 147 | cairo_close_path(cr); 148 | cairo_stroke(cr); 149 | cairo_restore(cr); 150 | } 151 | 152 | void PdfRenderer::fill_box(Length x, Length y, Length w, Length h, double color) { 153 | cairo_save(cr); 154 | cairo_set_source_rgb(cr, color, color, color); 155 | cairo_move_to(cr, x.pt(), y.pt()); 156 | cairo_line_to(cr, (x + w).pt(), y.pt()); 157 | cairo_line_to(cr, (x + w).pt(), (y + h).pt()); 158 | cairo_line_to(cr, x.pt(), (y + h).pt()); 159 | cairo_close_path(cr); 160 | cairo_fill(cr); 161 | cairo_restore(cr); 162 | } 163 | 164 | void PdfRenderer::fill_rounded_corner_box(Length x, Length y, Length w, Length h, double color) { 165 | const double 
round_fraction = 0.5; 166 | const auto round_distance = round_fraction * w; 167 | 168 | cairo_save(cr); 169 | cairo_set_source_rgb(cr, color, color, color); 170 | 171 | cairo_move_to(cr, (x + round_fraction * w).pt(), y.pt()); 172 | cairo_line_to(cr, (x + w - round_distance).pt(), y.pt()); 173 | cairo_curve_to( 174 | cr, (x + w).pt(), y.pt(), (x + w).pt(), y.pt(), (x + w).pt(), (y + round_distance).pt()); 175 | 176 | cairo_line_to(cr, (x + w).pt(), (y + h - round_distance).pt()); 177 | cairo_curve_to(cr, 178 | (x + w).pt(), 179 | (y + h).pt(), 180 | (x + w).pt(), 181 | (y + h).pt(), 182 | (x + w - round_distance).pt(), 183 | (y + h).pt()); 184 | 185 | cairo_line_to(cr, (x + round_distance).pt(), (y + h).pt()); 186 | cairo_curve_to(cr, 187 | (x).pt(), 188 | (y + h).pt(), 189 | (x).pt(), 190 | (y + h).pt(), 191 | (x).pt(), 192 | (y + h - round_distance).pt()); 193 | 194 | cairo_line_to(cr, x.pt(), (y + round_distance).pt()); 195 | cairo_curve_to(cr, x.pt(), y.pt(), x.pt(), y.pt(), (x + round_distance).pt(), y.pt()); 196 | cairo_close_path(cr); 197 | cairo_fill(cr); 198 | cairo_restore(cr); 199 | } 200 | 201 | void PdfRenderer::draw_dash_line(const std::vector &points, double line_width) { 202 | if(points.size() < 2) { 203 | return; 204 | } 205 | cairo_save(cr); 206 | cairo_set_source_rgb(cr, 0, 0, 0); 207 | cairo_set_line_width(cr, line_width); 208 | const double dashes[2] = {4.0, 2.0}; 209 | cairo_set_dash(cr, dashes, 2, 0.0); 210 | cairo_set_line_cap(cr, CAIRO_LINE_CAP_ROUND); 211 | cairo_move_to(cr, points[0].x.pt(), points[0].y.pt()); 212 | for(size_t i = 1; i < points.size(); ++i) { 213 | cairo_line_to(cr, points[i].x.pt(), points[i].y.pt()); 214 | } 215 | cairo_stroke(cr); 216 | cairo_restore(cr); 217 | } 218 | 219 | void PdfRenderer::draw_poly_line(const std::vector &points, Length thickness) { 220 | if(points.size() < 2) { 221 | return; 222 | } 223 | cairo_save(cr); 224 | cairo_set_source_rgb(cr, 0, 0, 0); 225 | cairo_set_line_width(cr, thickness.pt()); 226 | 
cairo_move_to(cr, points[0].x.pt(), points[0].y.pt()); 227 | for(size_t i = 1; i < points.size(); ++i) { 228 | cairo_line_to(cr, points[i].x.pt(), points[i].y.pt()); 229 | } 230 | cairo_stroke(cr); 231 | cairo_restore(cr); 232 | } 233 | 234 | void PdfRenderer::draw_arc( 235 | Length x, Length y, Length r, double angle1, double angle2, Length thickness) { 236 | cairo_save(cr); 237 | cairo_new_sub_path(cr); 238 | cairo_set_source_rgb(cr, 0, 0, 0); 239 | cairo_set_line_width(cr, thickness.pt()); 240 | cairo_arc_negative(cr, x.pt(), y.pt(), r.pt(), angle1, angle2); 241 | cairo_stroke(cr); 242 | cairo_restore(cr); 243 | } 244 | 245 | void PdfRenderer::render_line_justified(const std::string &line_text, 246 | const FontParameters &par, 247 | Length line_width, 248 | Length x, 249 | Length y) { 250 | assert(line_text.find('\n') == std::string::npos); 251 | setup_pango(par); 252 | const auto words = hack_split(line_text); 253 | Length text_width = hack.text_width(line_text.c_str(), par); 254 | const double num_spaces = std::count(line_text.begin(), line_text.end(), ' '); 255 | const Length space_extra_width{num_spaces > 0 ? 
((line_width - text_width) / num_spaces) 256 | : Length::zero()}; 257 | 258 | std::string tmp; 259 | for(size_t i = 0; i < words.size(); ++i) { 260 | cairo_move_to(cr, x.pt(), y.pt()); 261 | PangoRectangle r; 262 | 263 | tmp = words[i]; 264 | tmp += ' '; 265 | pango_layout_set_attributes(layout, nullptr); 266 | pango_layout_set_text(layout, tmp.c_str(), -1); 267 | pango_layout_get_extents(layout, nullptr, &r); 268 | pango_cairo_update_layout(cr, layout); 269 | pango_cairo_show_layout(cr, layout); 270 | x += Length::from_pt(double(r.width) / PANGO_SCALE); 271 | x += space_extra_width; 272 | } 273 | } 274 | 275 | void PdfRenderer::render_line_justified(const std::vector &markup_words, 276 | const FontParameters &par, 277 | Length line_width, 278 | Length x, 279 | Length y) { 280 | if(markup_words.empty()) { 281 | return; 282 | } 283 | const uint32_t last_char = get_last_char(markup_words.back()); 284 | const Length overhang_right = hack.codepoint_right_overhang(last_char, par); 285 | setup_pango(par); 286 | std::string full_line; 287 | for(const auto &w : markup_words) { 288 | full_line += w; // Markup words end in spaces (except the last one). 289 | } 290 | 291 | pango_layout_set_attributes(layout, nullptr); 292 | // Pango aligns by top, we want alignment by baseline. 293 | // https://gitlab.gnome.org/GNOME/pango/-/issues/698 294 | pango_layout_set_text(layout, "A", 1); 295 | const auto desired_baseline = pango_layout_get_baseline(layout) / PANGO_SCALE; 296 | 297 | const Length target_width = line_width; 298 | pango_layout_set_markup(layout, full_line.c_str(), full_line.length()); 299 | const Length text_width = hack.markup_width(full_line.c_str(), par); 300 | const double num_spaces = double(markup_words.size() - 1); 301 | const Length space_extra_width = 302 | num_spaces > 0 ? 
(target_width - text_width + overhang_right) / num_spaces : Length::zero(); 303 | 304 | for(const auto &markup_word : markup_words) { 305 | cairo_move_to(cr, x.pt(), y.pt()); 306 | PangoRectangle r; 307 | 308 | pango_layout_set_attributes(layout, nullptr); 309 | pango_layout_set_markup(layout, markup_word.c_str(), -1); 310 | const auto current_baseline = pango_layout_get_baseline(layout) / PANGO_SCALE; 311 | 312 | cairo_rel_move_to(cr, 0, desired_baseline - current_baseline); 313 | pango_layout_get_extents(layout, nullptr, &r); 314 | pango_cairo_update_layout(cr, layout); 315 | pango_cairo_show_layout(cr, layout); 316 | x += Length::from_pt(double(r.width) / PANGO_SCALE); 317 | x += space_extra_width; 318 | } 319 | } 320 | 321 | void PdfRenderer::setup_pango(const FontParameters &par) { 322 | PangoFontDescription *desc = pango_font_description_from_string(par.name.c_str()); 323 | if(par.type == FontStyle::Bold || par.type == FontStyle::BoldItalic) { 324 | pango_font_description_set_weight(desc, PANGO_WEIGHT_BOLD); 325 | } else { 326 | pango_font_description_set_weight(desc, PANGO_WEIGHT_NORMAL); 327 | } 328 | if(par.type == FontStyle::Italic || par.type == FontStyle::BoldItalic) { 329 | pango_font_description_set_style(desc, PANGO_STYLE_ITALIC); 330 | } else { 331 | pango_font_description_set_style(desc, PANGO_STYLE_NORMAL); 332 | } 333 | pango_font_description_set_absolute_size(desc, par.size.pt() * PANGO_SCALE); 334 | pango_layout_set_font_description(layout, desc); 335 | pango_font_description_free(desc); 336 | } 337 | 338 | void PdfRenderer::render_text_as_is(const char *line, 339 | const FontParameters &par, 340 | Length x, 341 | Length y) { 342 | setup_pango(par); 343 | cairo_move_to(cr, x.pt(), y.pt()); 344 | pango_layout_set_attributes(layout, nullptr); 345 | pango_layout_set_text(layout, line, -1); 346 | pango_cairo_update_layout(cr, layout); 347 | pango_cairo_show_layout(cr, layout); 348 | } 349 | 350 | void PdfRenderer::render_markup_as_is( 351 | const 
char *line, const FontParameters &par, Length x, Length y, TextAlignment alignment) { 352 | setup_pango(par); 353 | pango_layout_set_attributes(layout, nullptr); 354 | pango_layout_set_markup(layout, line, -1); 355 | 356 | switch(alignment) { 357 | case TextAlignment::Left: 358 | cairo_move_to(cr, x.pt(), y.pt()); 359 | break; 360 | case TextAlignment::Centered: 361 | PangoRectangle r; 362 | pango_layout_get_extents(layout, nullptr, &r); 363 | cairo_move_to(cr, (x - Length::from_pt(r.width / (2 * PANGO_SCALE))).pt(), y.pt()); 364 | break; 365 | case TextAlignment::Right: 366 | pango_layout_get_extents(layout, nullptr, &r); 367 | cairo_move_to(cr, (x - Length::from_pt(r.width / PANGO_SCALE)).pt(), y.pt()); 368 | } 369 | 370 | pango_cairo_update_layout(cr, layout); 371 | pango_cairo_show_layout(cr, layout); 372 | } 373 | 374 | void PdfRenderer::render_markup_as_is(const std::vector markup_words, 375 | const FontParameters &par, 376 | Length x, 377 | Length y, 378 | TextAlignment alignment) { 379 | std::string full_line; 380 | for(const auto &w : markup_words) { 381 | full_line += w; 382 | } 383 | render_markup_as_is(full_line.c_str(), par, x, y, alignment); 384 | } 385 | 386 | void PdfRenderer::render_line_centered(const char *line, 387 | const FontParameters &par, 388 | Length x, 389 | Length y) { 390 | PangoRectangle r; 391 | 392 | setup_pango(par); 393 | pango_layout_set_attributes(layout, nullptr); 394 | pango_layout_set_text(layout, line, -1); 395 | pango_layout_get_extents(layout, nullptr, &r); 396 | render_text_as_is(line, par, x - Length::from_pt(r.width / (2 * PANGO_SCALE)), y); 397 | } 398 | 399 | void PdfRenderer::render_wonky_text(const char *text, 400 | const FontParameters &par, 401 | Length raise, 402 | Length shift, 403 | double tilt, 404 | double color, 405 | Length x, 406 | Length y) { 407 | cairo_save(cr); 408 | cairo_set_source_rgb(cr, color, color, color); 409 | cairo_translate(cr, (x + shift).pt(), (y + raise).pt()); 410 | cairo_rotate(cr, 
tilt); 411 | render_text_as_is(text, par, Length::zero(), Length::zero()); 412 | cairo_restore(cr); 413 | } 414 | 415 | void PdfRenderer::new_page() { 416 | finalize_page(); 417 | cairo_surface_show_page(surf); 418 | init_page(); 419 | ++pages; 420 | } 421 | 422 | void PdfRenderer::init_page() { 423 | if(bleed > 0) { 424 | cairo_save(cr); 425 | cairo_translate(cr, bleed, bleed); 426 | } 427 | } 428 | 429 | void PdfRenderer::finalize_page() { 430 | if(bleed > 0) { 431 | cairo_restore(cr); 432 | draw_cropmarks(); 433 | } 434 | } 435 | 436 | void PdfRenderer::draw_cropmarks() { 437 | const auto b = bleed; 438 | 439 | cairo_save(cr); 440 | cairo_move_to(cr, b, 0); 441 | cairo_rel_line_to(cr, 0, b / 2); 442 | cairo_move_to(cr, 0, b); 443 | cairo_rel_line_to(cr, b / 2, 0); 444 | 445 | cairo_move_to(cr, b, 0); 446 | cairo_rel_line_to(cr, 0, b / 2); 447 | cairo_move_to(cr, 0, b); 448 | cairo_rel_line_to(cr, b / 2, 0); 449 | 450 | cairo_move_to(cr, mediaw - b, 0); 451 | cairo_rel_line_to(cr, 0, b / 2); 452 | cairo_move_to(cr, mediaw - b / 2, b); 453 | cairo_rel_line_to(cr, b / 2, 0); 454 | 455 | cairo_move_to(cr, b, mediah); 456 | cairo_rel_line_to(cr, 0, -b / 2); 457 | cairo_move_to(cr, 0, mediah - b); 458 | cairo_rel_line_to(cr, b / 2, 0); 459 | 460 | cairo_move_to(cr, mediaw - b, mediah); 461 | cairo_rel_line_to(cr, 0, -b / 2); 462 | cairo_move_to(cr, mediaw, mediah - b); 463 | cairo_rel_line_to(cr, -b / 2, 0); 464 | 465 | cairo_set_line_width(cr, 5); 466 | cairo_set_source_rgb(cr, 1, 1, 1); 467 | cairo_stroke_preserve(cr); 468 | cairo_set_line_width(cr, 1); 469 | cairo_set_source_rgb(cr, 0, 0, 0); 470 | cairo_stroke(cr); 471 | 472 | cairo_restore(cr); 473 | } 474 | 475 | void PdfRenderer::draw_line(Length x0, Length y0, Length x1, Length y1, Length thickness) { 476 | cairo_set_line_width(cr, thickness.pt()); 477 | cairo_move_to(cr, x0.pt(), y0.pt()); 478 | cairo_line_to(cr, x1.pt(), y1.pt()); 479 | cairo_stroke(cr); 480 | } 481 | 482 | void PdfRenderer::draw_line( 483 | 
Length x0, Length y0, Length x1, Length y1, Length thickness, double g, cairo_line_cap_t cap) { 484 | cairo_save(cr); 485 | cairo_set_source_rgb(cr, g, g, g); 486 | cairo_set_line_cap(cr, cap); 487 | draw_line(x0, y0, x1, y1, thickness); 488 | cairo_restore(cr); 489 | } 490 | 491 | ImageInfo PdfRenderer::get_image(const std::string &path) { 492 | ImageInfo result; 493 | auto it = loaded_images.find(path); 494 | if(it != loaded_images.end()) { 495 | result.surf = it->second; 496 | } else { 497 | result.surf = cairo_image_surface_create_from_png(path.c_str()); 498 | if(cairo_surface_status(result.surf) != CAIRO_STATUS_SUCCESS) { 499 | printf("Failed to load image %s.\n", path.c_str()); 500 | std::abort(); 501 | } 502 | loaded_images[path] = result.surf; 503 | } 504 | result.w = cairo_image_surface_get_width(result.surf); 505 | result.h = cairo_image_surface_get_height(result.surf); 506 | return result; 507 | } 508 | 509 | void PdfRenderer::draw_image(const ImageInfo &image, Length x, Length y, Length w, Length h) { 510 | cairo_save(cr); 511 | cairo_rectangle(cr, x.pt(), y.pt(), w.pt(), h.pt()); 512 | cairo_translate(cr, x.pt(), y.pt()); 513 | cairo_scale(cr, w.pt() / image.w, h.pt() / image.h); 514 | cairo_set_source_surface(cr, image.surf, 0, 0); 515 | cairo_fill(cr); 516 | cairo_restore(cr); 517 | } 518 | 519 | void PdfRenderer::add_section_outline(int section_number, const std::string &text) { 520 | std::string outline = std::to_string(section_number); 521 | outline += ". 
"; 522 | outline += text; 523 | std::string link = "page="; 524 | link += std::to_string(page_num()); 525 | cairo_pdf_surface_add_outline( 526 | surf, CAIRO_PDF_OUTLINE_ROOT, outline.c_str(), link.c_str(), (cairo_pdf_outline_flags_t)0); 527 | } 528 | -------------------------------------------------------------------------------- /pdfrenderer.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | 25 | #include 26 | #include 27 | 28 | enum class TextAlignment : int { 29 | Left, 30 | Centered, 31 | Right, 32 | // Justified is stored in a separate struct. 
33 | }; 34 | 35 | struct ImageInfo { 36 | cairo_surface_t *surf; 37 | int w, h; 38 | }; 39 | 40 | struct Coord { 41 | Length x; 42 | Length y; 43 | }; 44 | 45 | class PdfRenderer { 46 | public: 47 | explicit PdfRenderer(const char *ofname, 48 | Length pagew, 49 | Length pageh, 50 | Length bleed, 51 | const char *title, 52 | const char *author); 53 | ~PdfRenderer(); 54 | 55 | void render_line_justified(const std::string &text, 56 | const FontParameters &par, 57 | Length line_width_mm, 58 | Length x, 59 | Length y); 60 | 61 | void render_line_justified(const std::vector &markup_words, 62 | const FontParameters &par, 63 | Length line_width, 64 | Length x, 65 | Length y); 66 | 67 | void render_text_as_is(const char *line, const FontParameters &par, Length x, Length y); 68 | 69 | void render_markup_as_is( 70 | const char *line, const FontParameters &par, Length x, Length y, TextAlignment alignment); 71 | void render_markup_as_is(const std::vector markup_words, 72 | const FontParameters &par, 73 | Length x, 74 | Length y, 75 | TextAlignment alignment); 76 | void render_line_centered(const char *line, const FontParameters &par, Length x, Length y); 77 | 78 | void render_wonky_text(const char *text, 79 | const FontParameters &par, 80 | Length raise, 81 | Length shift, 82 | double tilt, 83 | double color, 84 | Length x, 85 | Length y); 86 | 87 | void new_page(); 88 | int page_num() const { return pages; } 89 | 90 | void draw_box(Length x, Length y, Length w, Length h, Length thickness); 91 | void fill_box(Length x, Length y, Length w, Length h, double color); 92 | 93 | void fill_rounded_corner_box(Length x, Length y, Length w, Length h, double color); 94 | 95 | void draw_line(Length x0, Length y0, Length x1, Length y1, Length thickness); 96 | 97 | void draw_line(Length x0, 98 | Length y0, 99 | Length x1, 100 | Length y1, 101 | Length thickness, 102 | double g, 103 | cairo_line_cap_t cap); 104 | 105 | ImageInfo get_image(const std::string &path); 106 | 107 | void 
draw_image(const ImageInfo &image, Length x, Length y, Length w, Length h); 108 | 109 | void draw_dash_line(const std::vector &points, double line_width); 110 | 111 | void draw_arc(Length x, Length y, Length r, double angle1, double angle2, Length thickness); 112 | 113 | void draw_poly_line(const std::vector &points, Length thickness); 114 | 115 | void add_section_outline(int section_number, const std::string &text); 116 | 117 | void init_page(); 118 | void finalize_page(); 119 | 120 | private: 121 | void draw_grid(); 122 | void draw_cropmarks(); 123 | void setup_pango(const FontParameters &par); 124 | 125 | int pages = 1; 126 | double bleed; 127 | double mediaw, mediah; 128 | cairo_t *cr; 129 | cairo_surface_t *surf; 130 | PangoLayout *layout; 131 | std::unordered_map loaded_images; 132 | std::string outname; 133 | }; 134 | -------------------------------------------------------------------------------- /printpaginator.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2025 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | struct SectionElement { 22 | std::vector lines; 23 | size_t chapter_number; 24 | }; 25 | 26 | struct EmptyLineElement { 27 | size_t num_lines = 1; 28 | }; 29 | 30 | struct ParagraphElement { 31 | std::vector lines; 32 | ChapterParameters params; 33 | Length paragraph_width; 34 | }; 35 | 36 | struct SpecialTextElement { 37 | std::vector lines; 38 | Length extra_indent; 39 | const FontParameters *font = nullptr; 40 | TextAlignment alignment; 41 | }; 42 | 43 | struct ImageElement { 44 | std::filesystem::path path; 45 | double ppi; 46 | size_t height_in_lines; 47 | ImageInfo info; 48 | }; 49 | 50 | struct FootnoteElement {}; 51 | 52 | typedef std::variant 58 | TextElement; 59 | 60 | struct TextElementIterator { 61 | TextElementIterator() { 62 | element_id = 0; 63 | line_id = 0; 64 | elems = nullptr; 65 | } 66 | 67 | explicit TextElementIterator(std::vector &original) { 68 | elems = &original; 69 | element_id = 0; 70 | line_id = 0; 71 | } 72 | 73 | const TextElement &element(); 74 | const TextCommands &line(); 75 | 76 | void operator++(); 77 | void operator--(); 78 | 79 | void next_element() { 80 | if(element_id >= elems->size()) { 81 | return; 82 | } 83 | ++element_id; 84 | line_id = 0; 85 | } 86 | 87 | bool operator==(const TextElementIterator &o) const { 88 | return elems == o.elems && element_id == o.element_id && line_id == o.line_id; 89 | } 90 | 91 | bool operator!=(const TextElementIterator &o) const { return !(*this == o); } 92 | 93 | size_t element_id; 94 | size_t line_id; 95 | std::vector *elems; 96 | }; 97 | 98 | template<> struct std::hash { 99 | std::size_t operator()(const TextElementIterator &s) const noexcept { 100 | auto h1 = std::hash{}(s.element_id); 101 | auto h2 = std::hash{}(s.line_id); 102 | return ((h1 * 13) + h2); 103 | } 104 | }; 105 | 106 | struct TextLimits { 107 | TextElementIterator start; 108 | TextElementIterator end; 109 | }; 110 | 111 | struct SectionPage { 112 | size_t 
// Records a page number together with a signed height difference for it.
// Members are zero-initialised so a default-constructed value never holds
// indeterminate data; aggregate initialisation still works.
struct HeightMismatch {
    size_t page_number = 0;
    int64_t delta = 0;
};
text_to_formatted_words(const std::string &text, 190 | bool permit_hyphenation = true); 191 | 192 | Length textblock_width() const { return page.w - m.inner - m.outer; } 193 | Length textblock_height() const { return page.h - m.upper - m.lower; } 194 | 195 | void dump_text(const char *path); 196 | void print_stats(const PageLayoutResult &res, size_t section_number); 197 | 198 | void new_page(); 199 | 200 | void draw_edge_markers(size_t chapter_number, size_t page_number); 201 | void draw_page_number(size_t page_number); 202 | 203 | void render_signing_page(const Signing &s); 204 | void render_maintext_lines(const TextElementIterator &start_loc, 205 | const TextElementIterator &end_loc, 206 | size_t book_page_number, 207 | Length y, 208 | int current_line = -1); 209 | 210 | Length current_left_margin() const { return rend->page_num() % 2 ? m.inner : m.outer; } 211 | 212 | const Document &doc; 213 | // These are just helpers to cut down on typing. 214 | const PageSize &page; 215 | const ChapterStyles &styles; 216 | const Spaces &spaces; 217 | const Margins &m; 218 | std::unique_ptr rend; 219 | WordHyphenator hyphen; 220 | int current_page = 1; 221 | int chapter_start_page = -1; 222 | 223 | // Add frontmatter 224 | std::vector> maintext_sections; 225 | // Add backmatter 226 | std::vector elements; 227 | FILE *stats; 228 | }; 229 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Book PDF creator 2 | 3 | This repository contains code to create a print-ready PDF and epub 4 | versions of a book written in a custom markdownish syntax called 5 | _bookdown_. 6 | 7 | The core algorithm is a global line splitter algorithm (a fairly 8 | heavily tweaked Knuth-Plass). There is also a GUI tool for testing its 9 | behaviour with different fonts, chapter widths et al. 10 | 11 | The code only works with latin text. 
This is because the point of the 12 | code is to create aesthetically pleasing results rather than "correct" 13 | ones. The aesthetic requirements for languages that use a different 14 | writing system like CJK, Arabic and Kannada are such that they would 15 | probably require their own custom algorithms. This might even be the 16 | case for "Latin-like" scripts like Greek. At the very least such algorithms would 17 | need to be written by a native speaker. 18 | 19 | Other limitations include: 20 | 21 | - mixed-direction texts are obviously not supported 22 | 23 | - the GUI is a very utilitarian "engineering UI", it might 24 | cause eye bleeding in people sensitive to UI purism 25 | 26 | - hyphenation only supports English and Finnish 27 | 28 | - there are some weird bugs, patches welcome 29 | 30 | - probably only works on Linux because it uses Cairo, GTK 4, 31 | Fontconfig and related libraries quite heavily 32 | 33 | - some visual aspects are defined in the code itself rather than the 34 | JSON conf file as the current main use case is to generate my own 35 | books rather than provide a fully general tool 36 | 37 | ## Building and using 38 | 39 | Install the dependencies with: 40 | 41 | ``` 42 | sudo apt install libhyphen-dev libgtk-4-dev libtinyxml2-dev libvoikko-dev 43 | ``` 44 | 45 | Building is done in the standard Meson way: 46 | 47 | ``` 48 | 49 | mkdir builddir 50 | cd builddir 51 | meson .. 
52 | ninja 53 | ``` 54 | 55 | Then run either the GUI tool: 56 | 57 | ``` 58 | ./guitool 59 | ``` 60 | 61 | Or process a book json: 62 | 63 | ``` 64 | ./bookmaker ../testdoc/sample.json 65 | ``` 66 | -------------------------------------------------------------------------------- /testdoc/book.css: -------------------------------------------------------------------------------- 1 | h1, h2, h3, h4, h5 { 2 | font-family: Sans-Serif; 3 | } 4 | 5 | h1 { 6 | padding-top: 100px; 7 | padding-bottom: 30px; 8 | } 9 | 10 | p { 11 | font-family: Serif; 12 | text-align: justify; 13 | text-indent: 25px; 14 | margin-top: 0px; 15 | margin-bottom: 0px; 16 | } 17 | 18 | span.inlinecode { 19 | font-family: Mono; 20 | } 21 | 22 | p.preformatted { 23 | font-family: Mono; 24 | text-indent: 0px; 25 | margin-left: 40px; 26 | margin-top: 16px; 27 | margin-bottom: 16px; 28 | } 29 | 30 | p.noindent { 31 | margin-top: 0px; 32 | margin-bottom: 0px; 33 | text-indent: 0px; 34 | } 35 | 36 | p.newsection { 37 | margin-top: 0px; 38 | margin-bottom: 0px; 39 | text-indent: 0px; 40 | } 41 | 42 | p.newscene { 43 | margin-top: 16px; 44 | margin-bottom: 0px; 45 | text-indent: 0px; 46 | } 47 | 48 | p.afterspecial { 49 | margin-top: 8px; 50 | margin-bottom: 0px; 51 | text-indent: 0px; 52 | } 53 | 54 | p.footnote { 55 | text-indent: 0px; 56 | margin-top: 8px; 57 | } 58 | 59 | ol { 60 | margin-left: 40px; 61 | } 62 | -------------------------------------------------------------------------------- /testdoc/chapter1.bd: -------------------------------------------------------------------------------- 1 | # The Fake Chapter with Such a Long Name That It Requires Multiple Lines 2 | 3 | This /is some/ text to fill up a dummy paragraph of text. A code 4 | sample with brackets: `#include`. 5 | 6 | ```code 7 | We are the Borg. 8 | Resistance is futile. 9 | You will be assimilated. 10 | 11 | Empty line above. 12 | ``` 13 | 14 | |The fifth cylinder| must t*hi*s *bol*d *Woking*. 
15 | 16 | Escaping characters in output text: \c{/} slash, \c{*} asterisk, \c{|} 17 | pipe, \c{`} backtick, \c{#} hash, \c{\} backslash, \c{^} circumflex. 18 | All in one word: \c{/*|`#\^}. Lastly a word with multiple 19 | replacements: ping\c{/}pong\c{*}pung. 20 | 21 | After this paragraph there will be a scene change. The visual 22 | appearance should be such that there is an empty line and the next 23 | paragraph is not indented. 24 | 25 | #s 26 | 27 | This is the paragraph that should not be indented. Some more text here 28 | to make sure it takes more than one line in the final output. 29 | 30 | This paragraph should again be indented. Some math: /a/^2^ + /b/^2^ = 31 | /c/⁴ 32 | 33 | 34 | ```footnote 35 | This is a footnote with some text in it. You can't yet name them or refer 36 | to them, but then again why would you. 37 | ``` 38 | 39 | “Henderson,” he called, “you saw that shooting star last night?” 40 | 41 | I think everyone expected to see a man emerge—possibly something a 42 | little unlike us terrestrial men, but in all essentials a man. I know I 43 | did. But, looking, I presently saw something stirring within the 44 | shadow: greyish billowy movements, one above another, and then two 45 | luminous disks—like eyes. Then something resembling a little grey 46 | snake, about the thickness of a walking stick, coiled up out of the 47 | writhing middle, and wriggled in the air towards me—and then another. 48 | 49 | #figure testimage.png 50 | 51 | I think everyone expected to see a man emerge—possibly something a 52 | little unlike us terrestrial men, but in all essentials a man. I know I 53 | did. But, looking, I presently saw something stirring within the 54 | shadow: greyish billowy movements, one above another, and then two 55 | luminous disks—like eyes. Then something resembling a little grey 56 | snake, about the thickness of a walking stick, coiled up out of the 57 | writhing middle, and wriggled in the air towards me—and then another. 
58 | 59 | ```numberlist 60 | This is number one and the fun has just begun. 61 | 62 | This is number two and it is a longer piece of text so that it takes 63 | multiple output lines and demonstrates how paragraph wrapping works. 64 | 65 | This is number three. 66 | 67 | ``` 68 | 69 | I think everyone expected to see a man emerge—possibly something a 70 | little unlike us terrestrial men, but in all essentials a man. I know I 71 | did. But, looking, I presently saw something stirring within the 72 | shadow: greyish billowy movements, one above another, and then two 73 | luminous disks—like eyes. Then something resembling a little grey 74 | snake, about the thickness of a walking stick, coiled up out of the 75 | writhing middle, and wriggled in the air towards me—and then another. 76 | 77 | A letter follows: 78 | 79 | ```letter 80 | This is an inline piece of text that represents a letter, telegram, 81 | large piece of quotation or something similar. 82 | 83 | For test purposes there are now some more paragraphs of text. This is a 84 | slightly longer paragraph than the previous one. It turns out that 85 | writing filler text is a lot more work than one would imagine. 86 | 87 | And finally here is a short paragraph to end it all. 88 | ``` 89 | 90 | I think everyone expected to see a man emerge—possibly something a 91 | little unlike us terrestrial men, but in all essentials a man. I know I 92 | did. But, looking, I presently saw something stirring within the 93 | shadow: greyish billowy movements, one above another, and then two 94 | luminous disks—like eyes. Then something resembling a little grey 95 | snake, about the thickness of a walking stick, coiled up out of the 96 | writhing middle, and wriggled in the air towards me—and then another. 97 | -------------------------------------------------------------------------------- /testdoc/chapter2.bd: -------------------------------------------------------------------------------- 1 | # THE EVE OF THE WAR. 
2 | 3 | |The fifth cylinder| must have fallen right into the midst of the 4 | house we had first visited. The building had vanished, completely 5 | smashed, pulverised, and dispersed by the blow. The cylinder lay now 6 | far beneath the original foundations—deep in a hole, already vastly 7 | larger than the pit I had looked into at Woking. The earth all round 8 | it had splashed under that tremendous impact—“splashed” is the only 9 | word—and lay in heaped piles that hid the masses of the adjacent 10 | houses. It had behaved exactly like mud under the violent blow of a 11 | hammer. Our house had collapsed backward; the front portion, even on 12 | the ground floor, had been destroyed completely; by a chance the 13 | kitchen and scullery had escaped, and stood buried now under soil and 14 | ruins, closed in by tons of earth on every side save towards the 15 | cylinder. Over that aspect we hung now on the very edge of the great 16 | circular pit the Martians were engaged in making. The heavy beating 17 | sound was evidently just behind us, and ever and again a bright green 18 | vapour drove up like a veil across our peephole. 19 | 20 | “Henderson,” he called, “you saw that shooting star last night?” 21 | 22 | I think everyone expected to see a man emerge—possibly something a 23 | little unlike us terrestrial men, but in all essentials a man. I know I 24 | did. But, looking, I presently saw something stirring within the 25 | shadow: greyish billowy movements, one above another, and then two 26 | luminous disks—like eyes. Then something resembling a little grey 27 | snake, about the thickness of a walking stick, coiled up out of the 28 | writhing middle, and wriggled in the air towards me—and then another. 
29 | 30 | No one would have believed in the last years of the nineteenth century 31 | that this world was being watched keenly and closely by intelligences 32 | greater than man’s and yet as mortal as his own; that as men busied 33 | themselves about their various concerns they were scrutinised and 34 | studied, perhaps almost as narrowly as a man with a microscope might 35 | scrutinise the transient creatures that swarm and multiply in a drop of 36 | water. With infinite complacency men went to and fro over this globe 37 | about their little affairs, serene in their assurance of their empire 38 | over matter. It is possible that the infusoria under the microscope do 39 | the same. No one gave a thought to the older worlds of space as sources 40 | of human danger, or thought of them only to dismiss the idea of life 41 | upon them as impossible or improbable. It is curious to recall some of 42 | the mental habits of those departed days. At most terrestrial men 43 | fancied there might be other men upon Mars, perhaps inferior to 44 | themselves and ready to welcome a missionary enterprise. Yet across the 45 | gulf of space, minds that are to our minds as ours are to those of the 46 | beasts that perish, intellects vast and cool and unsympathetic, 47 | regarded this earth with envious eyes, and slowly and surely drew their 48 | plans against us. And early in the twentieth century came the great 49 | disillusionment. 50 | 51 | The planet Mars, I scarcely need remind the reader, revolves about the 52 | sun at a mean distance of 140,000,000 miles, and the light and heat it 53 | receives from the sun is barely half of that received by this world. It 54 | must be, if the nebular hypothesis has any truth, older than our world; 55 | and long before this earth ceased to be molten, life upon its surface 56 | must have begun its course. 
The fact that it is scarcely one seventh of 57 | the volume of the earth must have accelerated its cooling to the 58 | temperature at which life could begin. It has air and water and all 59 | that is necessary for the support of animated existence. 60 | 61 | Yet so vain is man, and so blinded by his vanity, that no writer, up to 62 | the very end of the nineteenth century, expressed any idea that 63 | intelligent life might have developed there far, or indeed at all, 64 | beyond its earthly level. Nor was it generally understood that since 65 | Mars is older than our earth, with scarcely a quarter of the 66 | superficial area and remoter from the sun, it necessarily follows that 67 | it is not only more distant from time’s beginning but nearer its end. 68 | 69 | The secular cooling that must someday overtake our planet has already 70 | gone far indeed with our neighbour. Its physical condition is still 71 | largely a mystery, but we know now that even in its equatorial region 72 | the midday temperature barely approaches that of our coldest winter. 73 | Its air is much more attenuated than ours, its oceans have shrunk until 74 | they cover but a third of its surface, and as its slow seasons change 75 | huge snowcaps gather and melt about either pole and periodically 76 | inundate its temperate zones. That last stage of exhaustion, which to 77 | us is still incredibly remote, has become a present-day problem for the 78 | inhabitants of Mars. The immediate pressure of necessity has brightened 79 | their intellects, enlarged their powers, and hardened their hearts. 
And 80 | looking across space with instruments, and intelligences such as we 81 | have scarcely dreamed of, they see, at its nearest distance only 82 | 35,000,000 of miles sunward of them, a morning star of hope, our own 83 | warmer planet, green with vegetation and grey with water, with a cloudy 84 | atmosphere eloquent of fertility, with glimpses through its drifting 85 | cloud wisps of broad stretches of populous country and narrow, 86 | navy-crowded seas. 87 | 88 | And we men, the creatures who inhabit this earth, must be to them at 89 | least as alien and lowly as are the monkeys and lemurs to us. The 90 | intellectual side of man already admits that life is an incessant 91 | struggle for existence, and it would seem that this too is the belief 92 | of the minds upon Mars. Their world is far gone in its cooling and this 93 | world is still crowded with life, but crowded only with what they 94 | regard as inferior animals. To carry warfare sunward is, indeed, their 95 | only escape from the destruction that, generation after generation, 96 | creeps upon them. 97 | 98 | And before we judge of them too harshly we must remember what ruthless 99 | and utter destruction our own species has wrought, not only upon 100 | animals, such as the vanished bison and the dodo, but upon its inferior 101 | races. The Tasmanians, in spite of their human likeness, were entirely 102 | swept out of existence in a war of extermination waged by European 103 | immigrants, in the space of fifty years. Are we such apostles of mercy 104 | as to complain if the Martians warred in the same spirit? 105 | 106 | The Martians seem to have calculated their descent with amazing 107 | subtlety—their mathematical learning is evidently far in excess of 108 | ours—and to have carried out their preparations with a well-nigh 109 | perfect unanimity. Had our instruments permitted it, we might have seen 110 | the gathering trouble far back in the nineteenth century. 
Men like 111 | Schiaparelli watched the red planet—it is odd, by-the-bye, that for 112 | countless centuries Mars has been the star of war—but failed to 113 | interpret the fluctuating appearances of the markings they mapped so 114 | well. All that time the Martians must have been getting ready. 115 | 116 | During the opposition of 1894 a great light was seen on the illuminated 117 | part of the disk, first at the Lick Observatory, then by Perrotin of 118 | Nice, and then by other observers. English readers heard of it first in 119 | the issue of /Nature/ dated August 2. I am inclined to think that this 120 | blaze may have been the casting of the huge gun, in the vast pit sunk 121 | into their planet, from which their shots were fired at us. Peculiar 122 | markings, as yet unexplained, were seen near the site of that outbreak 123 | during the next two oppositions. 124 | 125 | The storm burst upon us six years ago now. As Mars approached 126 | opposition, Lavelle of Java set the wires of the astronomical exchange 127 | palpitating with the amazing intelligence of a huge outbreak of 128 | incandescent gas upon the planet. It had occurred towards midnight of 129 | the twelfth; and the spectroscope, to which he had at once resorted, 130 | indicated a mass of flaming gas, chiefly hydrogen, moving with an 131 | enormous velocity towards this earth. This jet of fire had become 132 | invisible about a quarter past twelve. He compared it to a colossal 133 | puff of flame suddenly and violently squirted out of the planet, “as 134 | flaming gases rushed out of a gun.” 135 | 136 | A singularly appropriate phrase it proved. Yet the next day there was 137 | nothing of this in the papers except a little note in the /Daily 138 | Telegraph/, and the world went in ignorance of one of the gravest 139 | dangers that ever threatened the human race. I might not have heard of 140 | the eruption at all had I not met Ogilvy, the well-known astronomer, at 141 | Ottershaw. 
He was immensely excited at the news, and in the excess of 142 | his feelings invited me up to take a turn with him that night in a 143 | scrutiny of the red planet. 144 | 145 | In spite of all that has happened since, I still remember that vigil 146 | very distinctly: the black and silent observatory, the shadowed lantern 147 | throwing a feeble glow upon the floor in the corner, the steady ticking 148 | of the clockwork of the telescope, the little slit in the roof—an 149 | oblong profundity with the stardust streaked across it. Ogilvy moved 150 | about, invisible but audible. Looking through the telescope, one saw a 151 | circle of deep blue and the little round planet swimming in the field. 152 | It seemed such a little thing, so bright and small and still, faintly 153 | marked with transverse stripes, and slightly flattened from the perfect 154 | round. But so little it was, so silvery warm—a pin’s head of light! It 155 | was as if it quivered, but really this was the telescope vibrating with 156 | the activity of the clockwork that kept the planet in view. 157 | 158 | As I watched, the planet seemed to grow larger and smaller and to 159 | advance and recede, but that was simply that my eye was tired. Forty 160 | millions of miles it was from us—more than forty millions of miles of 161 | void. Few people realise the immensity of vacancy in which the dust of 162 | the material universe swims. 163 | 164 | Near it in the field, I remember, were three faint points of light, 165 | three telescopic stars infinitely remote, and all around it was the 166 | unfathomable darkness of empty space. You know how that blackness looks 167 | on a frosty starlight night. In a telescope it seems far profounder. 
168 | And invisible to me because it was so remote and small, flying swiftly 169 | and steadily towards me across that incredible distance, drawing nearer 170 | every minute by so many thousands of miles, came the Thing they were 171 | sending us, the Thing that was to bring so much struggle and calamity 172 | and death to the earth. I never dreamed of it then as I watched; no one 173 | on earth dreamed of that unerring missile. 174 | 175 | That night, too, there was another jetting out of gas from the distant 176 | planet. I saw it. A reddish flash at the edge, the slightest projection 177 | of the outline just as the chronometer struck midnight; and at that I 178 | told Ogilvy and he took my place. The night was warm and I was thirsty, 179 | and I went stretching my legs clumsily and feeling my way in the 180 | darkness, to the little table where the siphon stood, while Ogilvy 181 | exclaimed at the streamer of gas that came out towards us. 182 | 183 | That night another invisible missile started on its way to the earth 184 | from Mars, just a second or so under twenty-four hours after the first 185 | one. I remember how I sat on the table there in the blackness, with 186 | patches of green and crimson swimming before my eyes. I wished I had a 187 | light to smoke by, little suspecting the meaning of the minute gleam I 188 | had seen and all that it would presently bring me. Ogilvy watched till 189 | one, and then gave it up; and we lit the lantern and walked over to his 190 | house. Down below in the darkness were Ottershaw and Chertsey and all 191 | their hundreds of people, sleeping in peace. 192 | 193 | He was full of speculation that night about the condition of Mars, and 194 | scoffed at the vulgar idea of its having inhabitants who were 195 | signalling us. His idea was that meteorites might be falling in a heavy 196 | shower upon the planet, or that a huge volcanic explosion was in 197 | progress. 
He pointed out to me how unlikely it was that organic 198 | evolution had taken the same direction in the two adjacent planets. 199 | 200 | “The chances against anything manlike on Mars are a million to one,” he 201 | said. 202 | 203 | Hundreds of observers saw the flame that night and the night after 204 | about midnight, and again the night after; and so for ten nights, a 205 | flame each night. Why the shots ceased after the tenth no one on earth 206 | has attempted to explain. It may be the gases of the firing caused the 207 | Martians inconvenience. Dense clouds of smoke or dust, visible through 208 | a powerful telescope on earth as little grey, fluctuating patches, 209 | spread through the clearness of the planet’s atmosphere and obscured 210 | its more familiar features. 211 | 212 | Even the daily papers woke up to the disturbances at last, and popular 213 | notes appeared here, there, and everywhere concerning the volcanoes 214 | upon Mars. The seriocomic periodical /Punch/, I remember, made a happy 215 | use of it in the political cartoon. And, all unsuspected, those 216 | missiles the Martians had fired at us drew earthward, rushing now at a 217 | pace of many miles a second through the empty gulf of space, hour by 218 | hour and day by day, nearer and nearer. It seems to me now almost 219 | incredibly wonderful that, with that swift fate hanging over us, men 220 | could go about their petty concerns as they did. I remember how 221 | jubilant Markham was at securing a new photograph of the planet for the 222 | illustrated paper he edited in those days. People in these latter times 223 | scarcely realise the abundance and enterprise of our nineteenth-century 224 | papers. For my own part, I was much occupied in learning to ride the 225 | bicycle, and busy upon a series of papers discussing the probable 226 | developments of moral ideas as civilisation progressed. 
227 | 228 | One night (the first missile then could scarcely have been 10,000,000 229 | miles away) I went for a walk with my wife. It was starlight and I 230 | explained the Signs of the Zodiac to her, and pointed out Mars, a 231 | bright dot of light creeping zenithward, towards which so many 232 | telescopes were pointed. It was a warm night. Coming home, a party of 233 | excursionists from Chertsey or Isleworth passed us singing and playing 234 | music. There were lights in the upper windows of the houses as the 235 | people went to bed. From the railway station in the distance came the 236 | sound of shunting trains, ringing and rumbling, softened almost into 237 | melody by the distance. My wife pointed out to me the brightness of the 238 | red, green, and yellow signal lights hanging in a framework against the 239 | sky. It seemed so safe and tranquil. 240 | -------------------------------------------------------------------------------- /testdoc/colophon.txt: -------------------------------------------------------------------------------- 1 | This is text that is in the colophon. 2 | 3 | 4 | 5 | 6 | 7 | 8 | © Author Name 2222 9 | 10 | Publisher Name 11 | 12 | ISBN 13 | 14 | Printing location 15 | -------------------------------------------------------------------------------- /testdoc/credits.txt: -------------------------------------------------------------------------------- 1 | Thanks go to 2 | 3 | Graphics + Illustrator name 4 | Layout + DTP program worker's name 5 | Syntax + Not finalised yet 6 | A long key + Some value 7 | Thanks + Person one 8 | + Person Two 9 | + Arthur Negus 10 | + Fake Name 11 | -------------------------------------------------------------------------------- /testdoc/dedication.txt: -------------------------------------------------------------------------------- 1 | I dedicate this piece of code 2 | to someone for some reason. The source 3 | file has its own line separations but 4 | the final output does not follow them. 
5 | 6 | This is a second dedication paragraph. 7 | It should stand alone. 8 | -------------------------------------------------------------------------------- /testdoc/epub_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpakkane/chapterizer/ec384ca2c766a0a90babbaadac7e60a1b2cd31b9/testdoc/epub_cover.png -------------------------------------------------------------------------------- /testdoc/largesample.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "The Super Book of Stuff", 3 | "author": "Bob McBob", 4 | "language": "fi", 5 | "dedication_": "dedication.txt", 6 | "credits": "credits.txt", 7 | "postcredits": "postcredits.txt", 8 | "frontmatter": [ 9 | "firstpage", 10 | "empty", 11 | "signing.txt", 12 | "colophon.txt", 13 | "dedication.txt" 14 | ], 15 | "sources": [ 16 | "chapterlong.bd" 17 | ], 18 | "backmatter": [ 19 | "credits.txt" 20 | ], 21 | "draft2": { 22 | "surname": "Pakkanen", 23 | "email": "jpakkane@gmail.com", 24 | "phone": "040 111 1111" 25 | }, 26 | "pdf": { 27 | "filename": "large_bookout.pdf", 28 | "page": { 29 | "width": 130, 30 | "height": 210 31 | }, 32 | "margins": { 33 | "inner": 20, 34 | "outer": 15, 35 | "upper": 15, 36 | "lower": 20 37 | }, 38 | "bleed": 20, 39 | "styles": { 40 | "normal": { 41 | "line_height": 14, 42 | "indent": 5, 43 | "font": { 44 | "name": "Liberation Serif", 45 | "type": "regular", 46 | "pointsize": 10 47 | } 48 | }, 49 | "section": { 50 | "line_height": 16, 51 | "indent": 0, 52 | "font": { 53 | "name": "URW Bookman", 54 | "type": "bold", 55 | "pointsize": 120 56 | } 57 | }, 58 | "code": { 59 | "line_height": 12, 60 | "indent": 0, 61 | "font": { 62 | "name": "Liberation Mono", 63 | "type": "regular", 64 | "pointsize": 9 65 | } 66 | }, 67 | "title": { 68 | "line_height": 22, 69 | "indent": 0, 70 | "font": { 71 | "name": "Liberation Serif", 72 | "type": "bold", 73 | "pointsize": 16 74 | 
} 75 | }, 76 | "author": { 77 | "line_height": 16, 78 | "indent": 0, 79 | "font": { 80 | "name": "Liberation Serif", 81 | "type": "regular", 82 | "pointsize": 14 83 | } 84 | }, 85 | "footnote": { 86 | "line_height": 11, 87 | "indent": 5, 88 | "font": { 89 | "name": "Liberation Serif", 90 | "type": "regular", 91 | "pointsize": 9 92 | } 93 | }, 94 | "letter": { 95 | "line_height": 14, 96 | "indent": 5, 97 | "font": { 98 | "name": "Liberation Serif", 99 | "type": "italic", 100 | "pointsize": 10 101 | } 102 | }, 103 | "lists": { 104 | "line_height": 12, 105 | "indent": 0, 106 | "font": { 107 | "name": "Liberation Serif", 108 | "type": "regular", 109 | "pointsize": 10 110 | } 111 | }, 112 | "colophon": { 113 | "line_height": 11, 114 | "indent": 5, 115 | "font": { 116 | "name": "Liberation Serif", 117 | "type": "regular", 118 | "pointsize": 9 119 | } 120 | }, 121 | "dedication": { 122 | "line_height": 12, 123 | "indent": 0, 124 | "font": { 125 | "name": "Liberation Serif", 126 | "type": "italic", 127 | "pointsize": 10 128 | } 129 | } 130 | }, 131 | "spaces": { 132 | "above_section": 20, 133 | "below_section": 10, 134 | "different_paragraphs": 3, 135 | "codeblock_indent": 5, 136 | "letter_indent": 10, 137 | "footnote_separation": 2 138 | } 139 | }, 140 | "epub2": { 141 | "stylesheet": "book.css", 142 | "cover": "epub_cover.png", 143 | "filename": "large_bookout.epub", 144 | "ISBN": "123456789X", 145 | "file_as": "McBob, Bob" 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /testdoc/postcredits.txt: -------------------------------------------------------------------------------- 1 | Empty. 
2 | 3 | -------------------------------------------------------------------------------- /testdoc/sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "The Super Book of Stuff", 3 | "author": "Bob McBob", 4 | "language": "fi", 5 | "dedication_": "dedication.txt", 6 | "credits": "credits.txt", 7 | "postcredits": "postcredits.txt", 8 | "sources": [ 9 | "chapter1.bd", 10 | "chapter2.bd" 11 | ], 12 | "draft2": { 13 | "surname": "Pakkanen", 14 | "email": "jpakkane@gmail.com", 15 | "phone": "040 222 2222" 16 | }, 17 | "pdf": { 18 | "colophon": "colophon.txt", 19 | "filename": "bookout.pdf", 20 | "page": { 21 | "width": 130, 22 | "height": 210 23 | }, 24 | "margins": { 25 | "inner": 20, 26 | "outer": 15, 27 | "upper": 15, 28 | "lower": 20 29 | }, 30 | "bleed": 20, 31 | "styles": { 32 | "normal": { 33 | "line_height": 14, 34 | "indent": 5, 35 | "font": { 36 | "name": "Liberation Serif", 37 | "type": "regular", 38 | "pointsize": 10 39 | } 40 | }, 41 | "section": { 42 | "line_height": 16, 43 | "indent": 0, 44 | "font": { 45 | "name": "Liberation Sans", 46 | "type": "bold", 47 | "pointsize": 14 48 | } 49 | }, 50 | "code": { 51 | "line_height": 12, 52 | "indent": 0, 53 | "font": { 54 | "name": "Liberation Mono", 55 | "type": "regular", 56 | "pointsize": 9 57 | } 58 | }, 59 | "title": { 60 | "line_height": 22, 61 | "indent": 0, 62 | "font": { 63 | "name": "Liberation Serif", 64 | "type": "bold", 65 | "pointsize": 16 66 | } 67 | }, 68 | "author": { 69 | "line_height": 16, 70 | "indent": 0, 71 | "font": { 72 | "name": "Liberation Serif", 73 | "type": "regular", 74 | "pointsize": 14 75 | } 76 | }, 77 | "footnote": { 78 | "line_height": 11, 79 | "indent": 5, 80 | "font": { 81 | "name": "Liberation Serif", 82 | "type": "regular", 83 | "pointsize": 9 84 | } 85 | }, 86 | "letter": { 87 | "line_height": 14, 88 | "indent": 5, 89 | "font": { 90 | "name": "Liberation Serif", 91 | "type": "italic", 92 | "pointsize": 10 93 | } 94 | 
}, 95 | "lists": { 96 | "line_height": 12, 97 | "indent": 0, 98 | "font": { 99 | "name": "Liberation Serif", 100 | "type": "regular", 101 | "pointsize": 10 102 | } 103 | }, 104 | "colophon": { 105 | "line_height": 11, 106 | "indent": 5, 107 | "font": { 108 | "name": "Liberation Serif", 109 | "type": "regular", 110 | "pointsize": 9 111 | } 112 | }, 113 | "dedication": { 114 | "line_height": 12, 115 | "indent": 0, 116 | "font": { 117 | "name": "Liberation Serif", 118 | "type": "italic", 119 | "pointsize": 10 120 | } 121 | } 122 | }, 123 | "spaces": { 124 | "above_section": 20, 125 | "below_section": 10, 126 | "different_paragraphs": 3, 127 | "codeblock_indent": 5, 128 | "letter_indent": 10, 129 | "footnote_separation": 2 130 | } 131 | }, 132 | "epub": { 133 | "stylesheet": "book.css", 134 | "cover": "epub_cover.png", 135 | "filename": "bookout.epub", 136 | "ISBN": "123456789X", 137 | "file_as": "McBob, Bob" 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /testdoc/signing.txt: -------------------------------------------------------------------------------- 1 | This book is number 2 | 3 | ___ / 300 4 | 5 | It is signed by 6 | 7 | _______________ 8 | 9 | -------------------------------------------------------------------------------- /testdoc/testimage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpakkane/chapterizer/ec384ca2c766a0a90babbaadac7e60a1b2cd31b9/testdoc/testimage.png -------------------------------------------------------------------------------- /tests.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | 20 | #define CHECK(cond) \ 21 | if(!(cond)) { \ 22 | printf("Fail %s:%d\n", __PRETTY_FUNCTION__, __LINE__); \ 23 | std::abort(); \ 24 | } 25 | 26 | void test_hyphenation_simple() { 27 | WordHyphenator h; 28 | auto w = h.hyphenate("morning", Language::English); 29 | HyphenPoint expected{3, SplitType::Regular}; 30 | CHECK(w.size() == 1); 31 | CHECK(w.front() == expected); 32 | } 33 | 34 | void test_hyphenation_dash() { 35 | WordHyphenator h; 36 | auto w = h.hyphenate("hi-ho", Language::English); 37 | HyphenPoint expected{2, SplitType::NoHyphen}; 38 | CHECK(w.size() == 1); 39 | CHECK(w.front() == expected); 40 | } 41 | 42 | void test_hyphenation_emdash() { 43 | WordHyphenator h; 44 | auto w = h.hyphenate("us—more", Language::English); // Unicode 45 | HyphenPoint expected{4, SplitType::NoHyphen}; 46 | CHECK(w.size() == 1); 47 | CHECK(w.front() == expected); 48 | } 49 | 50 | void test_hyphenation_prefix() { 51 | WordHyphenator h; 52 | auto w = h.hyphenate("“morning", Language::English); 53 | HyphenPoint expected{6, 54 | SplitType::Regular}; // First character is Unicode and takes three bytes. 
55 | CHECK(w.size() == 1); 56 | CHECK(w.front() == expected); 57 | } 58 | 59 | void test_hyphenation_underscore() { 60 | WordHyphenator h; 61 | auto w = h.hyphenate("_Nature_", Language::English); 62 | HyphenPoint expected{2, SplitType::Regular}; 63 | CHECK(w.size() == 1); 64 | CHECK(w.front() == expected); 65 | } 66 | 67 | void test_utf8_splitting() { 68 | WordHyphenator h; 69 | const std::string text{"emerge—possibly"}; 70 | auto w = h.hyphenate(text, Language::English); // Note: has an em-dash. 71 | for(size_t i = 0; i < w.size(); ++i) { 72 | auto sub = text.substr(0, w[i].loc + 1); 73 | CHECK(g_utf8_validate(sub.c_str(), -1, nullptr)); 74 | } 75 | for(size_t i = 0; i < w.size(); ++i) { 76 | auto sub = text.substr(w[i].loc + 1); 77 | CHECK(g_utf8_validate(sub.c_str(), -1, nullptr)); 78 | } 79 | } 80 | 81 | void test_strange_combo() { 82 | WordHyphenator h; 83 | const std::string text{"impact—“splashed”"}; // impact—“splashed”"}; 84 | auto w = h.hyphenate(text, Language::English); 85 | HyphenPoint expected{1, SplitType::Regular}; 86 | HyphenPoint expected2{8, SplitType::NoHyphen}; 87 | CHECK(w.size() == 2); 88 | CHECK(w.front() == expected); 89 | CHECK(w.back() == expected2); 90 | } 91 | 92 | void test_dualhyphen() { 93 | // Not sure if correct, this could also be hyphenless. 
94 | WordHyphenator h; 95 | const std::string text{"maybe——"}; 96 | auto w = h.hyphenate(text, Language::English); 97 | HyphenPoint expected{7, SplitType::NoHyphen}; 98 | CHECK(w.size() == 1); 99 | CHECK(w.front() == expected); 100 | } 101 | 102 | void test_utf8_impl(const char *t, Language lang) { 103 | WordHyphenator h; 104 | const std::string text(t); 105 | const auto w = h.hyphenate(text, lang); 106 | for(const auto &hp : w) { 107 | std::string pre = text.substr(0, hp.loc + 1); 108 | CHECK(g_utf8_validate(pre.c_str(), pre.length(), nullptr)); 109 | std::string post = text.substr(hp.loc + 1, std::string::npos); 110 | CHECK(g_utf8_validate(post.c_str(), post.length(), nullptr)); 111 | } 112 | } 113 | 114 | void test_utf8() { 115 | test_utf8_impl("kansikuvapönöttäjästä", Language::Finnish); 116 | test_utf8_impl("päämajaksi", Language::Finnish); 117 | test_utf8_impl("silkkiäis", Language::Finnish); 118 | } 119 | 120 | void test_finhyphen() { 121 | WordHyphenator h; 122 | const std::string text("juna-UV"); 123 | const auto w = h.hyphenate(text, Language::Finnish); 124 | HyphenPoint expected{1, SplitType::Regular}; 125 | HyphenPoint expected2{4, SplitType::NoHyphen}; 126 | CHECK(w.size() == 2); 127 | CHECK(w.front() == expected); 128 | CHECK(w.back() == expected2); 129 | } 130 | 131 | void test_singleletter() { 132 | WordHyphenator h; 133 | const std::string text{"oliivi"}; 134 | HyphenPoint expected{3, SplitType::Regular}; 135 | const auto w = h.hyphenate(text, Language::Finnish); 136 | CHECK(w.size() == 1); 137 | CHECK(w.front() == expected); 138 | } 139 | 140 | void test_singleletter_end() { 141 | WordHyphenator h; 142 | const std::string text{"tarttua,"}; 143 | HyphenPoint expected{3, SplitType::Regular}; 144 | const auto w = h.hyphenate(text, Language::Finnish); 145 | CHECK(w.size() == 1); 146 | CHECK(w.front() == expected); 147 | } 148 | 149 | void test_singleletter_dash() { 150 | WordHyphenator h; 151 | const std::string text{"junaolio-oliivi"}; 152 | 
HyphenPoint expected0{1, SplitType::Regular}; 153 | HyphenPoint expected1{3, SplitType::Regular}; 154 | HyphenPoint expected2{4, SplitType::Regular}; 155 | HyphenPoint expected3{8, SplitType::NoHyphen}; 156 | HyphenPoint expected4{12, SplitType::Regular}; 157 | const auto w = h.hyphenate(text, Language::Finnish); 158 | CHECK(w.size() == 5); 159 | CHECK(w[0] == expected0); 160 | CHECK(w[1] == expected1); 161 | CHECK(w[2] == expected2); 162 | CHECK(w[3] == expected3); 163 | CHECK(w[4] == expected4); 164 | } 165 | 166 | void test_hyphenation() { 167 | test_hyphenation_simple(); 168 | test_hyphenation_dash(); 169 | test_hyphenation_emdash(); 170 | test_hyphenation_prefix(); 171 | test_hyphenation_underscore(); 172 | test_utf8_splitting(); 173 | test_strange_combo(); 174 | test_dualhyphen(); 175 | test_utf8(); 176 | test_finhyphen(); 177 | test_singleletter(); 178 | test_singleletter_end(); 179 | test_singleletter_dash(); 180 | } 181 | 182 | int main(int, char **) { 183 | printf("Running hyphenation tests.\n"); 184 | test_hyphenation(); 185 | } 186 | -------------------------------------------------------------------------------- /textstats.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "textstats.hpp" 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | namespace { 24 | 25 | const std::unordered_map overhang_right{ 26 | {'.', 0.8}, 27 | {',', 0.8}, 28 | {':', 0.8}, 29 | {';', 0.8}, 30 | {'!', 0.7}, 31 | {'?', 0.4}, 32 | // 33 | {'o', 0.2}, 34 | {'p', 0.2}, 35 | {'v', 0.2}, 36 | {'b', 0.2}, 37 | {'r', 0.2}, 38 | // 39 | {'\'', 0.5}, 40 | {'"', 0.5}, 41 | {0xbb, 0.5}, 42 | {0x201d, 0.5}, 43 | {0x2019, 0.3}, 44 | // 45 | {0x2013, 0.55}, 46 | {0x2014, 0.50}, 47 | {'-', 0.6}, 48 | }; 49 | 50 | } 51 | 52 | TextStats::TextStats() { 53 | surface = cairo_pdf_surface_create(nullptr, 595, 842); 54 | cr = cairo_create(surface); 55 | assert(cr); 56 | cairo_move_to(cr, 72, 72); 57 | layout = pango_cairo_create_layout(cr); 58 | PangoContext *context = pango_layout_get_context(layout); 59 | pango_context_set_round_glyph_positions(context, FALSE); 60 | } 61 | 62 | TextStats::~TextStats() { 63 | g_object_unref(G_OBJECT(layout)); 64 | cairo_surface_destroy(surface); 65 | cairo_destroy(cr); 66 | } 67 | 68 | void TextStats::set_pango_state(const char *utf8_text, 69 | const FontParameters &font, 70 | bool is_markup) const { 71 | auto *desc = pango_font_description_from_string(font.name.c_str()); 72 | assert(desc); 73 | pango_font_description_set_absolute_size(desc, font.size.pt() * PANGO_SCALE); 74 | if(font.type == FontStyle::Bold || font.type == FontStyle::BoldItalic) { 75 | pango_font_description_set_weight(desc, PANGO_WEIGHT_BOLD); 76 | } else { 77 | pango_font_description_set_weight(desc, PANGO_WEIGHT_NORMAL); 78 | } 79 | if(font.type == FontStyle::Italic || font.type == FontStyle::BoldItalic) { 80 | pango_font_description_set_style(desc, PANGO_STYLE_ITALIC); 81 | } else { 82 | pango_font_description_set_style(desc, PANGO_STYLE_NORMAL); 83 | } 84 | pango_layout_set_font_description(layout, desc); 85 | assert(g_utf8_validate(utf8_text, -1, nullptr)); 86 | pango_layout_set_attributes(layout, nullptr); 87 | if(is_markup) { 88 | 
pango_layout_set_markup(layout, utf8_text, -1); 89 | } else { 90 | pango_layout_set_text(layout, utf8_text, -1); 91 | } 92 | pango_font_description_free(desc); 93 | } 94 | 95 | Length TextStats::text_width(const char *utf8_text, const FontParameters &font) const { 96 | StyledPlainText k; 97 | k.text = utf8_text; 98 | k.font = font; 99 | auto f = plaintext_widths.find(k); 100 | if(f != plaintext_widths.end()) { 101 | return f->second; 102 | } 103 | set_pango_state(utf8_text, font, false); 104 | PangoRectangle ink_rect, logical_rect; 105 | pango_layout_get_extents(layout, &ink_rect, &logical_rect); 106 | // printf("Text width is %.2f mm\n", double(logical_rect.width) / PANGO_SCALE / 595 * 220); 107 | Length w = Length::from_pt(double(logical_rect.width) / PANGO_SCALE); 108 | plaintext_widths[k] = w; 109 | return w; 110 | } 111 | 112 | Length TextStats::markup_width(const char *utf8_text, const FontParameters &font) const { 113 | StyledMarkupText k; 114 | k.text = utf8_text; 115 | k.font = font; 116 | auto f = markup_widths.find(k); 117 | if(f != markup_widths.end()) { 118 | return f->second; 119 | } 120 | set_pango_state(utf8_text, font, true); 121 | PangoRectangle ink_rect, logical_rect; 122 | pango_layout_get_extents(layout, &ink_rect, &logical_rect); 123 | 124 | Length w = Length::from_pt(double(logical_rect.width) / PANGO_SCALE); 125 | markup_widths[k] = w; 126 | return w; 127 | } 128 | 129 | Length TextStats::codepoint_right_overhang(const uint32_t uchar, const FontParameters &font) const { 130 | auto it = overhang_right.find(uchar); 131 | if(it == overhang_right.end()) { 132 | return Length{}; 133 | } 134 | const double hang_fraction = it->second; 135 | char buf[10]; 136 | memset(buf, 0, 10); 137 | g_unichar_to_utf8(uchar, buf); 138 | assert(g_utf8_validate(buf, -1, nullptr)); 139 | const auto letter_width = text_width(buf, font); 140 | return hang_fraction * letter_width; 141 | } 142 | 
-------------------------------------------------------------------------------- /textstats.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | struct StyledPlainText { 27 | std::string text; 28 | FontParameters font; 29 | 30 | bool operator==(const StyledPlainText &o) const noexcept { 31 | return text == o.text && font == o.font; 32 | } 33 | }; 34 | 35 | struct StyledMarkupText { 36 | std::string text; 37 | FontParameters font; 38 | 39 | bool operator==(const StyledMarkupText &o) const noexcept { 40 | return text == o.text && font == o.font; 41 | } 42 | }; 43 | 44 | template<> struct std::hash { 45 | std::size_t operator()(StyledPlainText const &s) const noexcept { 46 | auto h1 = std::hash{}(s.text); 47 | auto h2 = std::hash{}(s.font); 48 | return ((h1 * 13) + h2); 49 | } 50 | }; 51 | 52 | template<> struct std::hash { 53 | std::size_t operator()(StyledMarkupText const &s) const noexcept { 54 | auto h1 = std::hash{}(s.text); 55 | auto h2 = std::hash{}(s.font); 56 | return ((h1 * 13) + h2); 57 | } 58 | }; 59 | 60 | class TextStats { 61 | public: 62 | TextStats(); 63 | ~TextStats(); 64 | 65 | Length text_width(const char *utf8_text, const FontParameters &font) const; 66 | 67 | Length 
text_width(const std::string &s, const FontParameters &font) const { 68 | return text_width(s.c_str(), font); 69 | }; 70 | 71 | Length markup_width(const char *utf8_text, const FontParameters &font) const; 72 | 73 | Length codepoint_right_overhang(const uint32_t uchar, const FontParameters &font) const; 74 | 75 | private: 76 | void set_pango_state(const char *utf8_text, 77 | const FontParameters &font, 78 | bool is_markup = false) const; 79 | 80 | cairo_t *cr; 81 | cairo_surface_t *surface; 82 | PangoLayout *layout; 83 | mutable std::unordered_map plaintext_widths; 84 | mutable std::unordered_map markup_widths; 85 | }; 86 | -------------------------------------------------------------------------------- /units.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | inline double mm2pt(const double x) { return x * 2.8346456693; } 22 | inline double pt2mm(const double x) { return x / 2.8346456693; } 23 | 24 | class Length { 25 | private: 26 | explicit Length(double d) : v_m(d) {}; 27 | 28 | public: 29 | double v_m = 0.0; 30 | 31 | static Length from_mm(double val) { return Length(val / 1000); } 32 | static Length from_pt(double val) { return Length::from_mm(pt2mm(val)); } 33 | 34 | Length() : v_m(0.0) {} 35 | Length(const Length &d) : v_m(d.v_m) {}; 36 | 37 | static Length zero() { return Length{0.0}; } 38 | 39 | Length operator-() const { return Length{-v_m}; } 40 | 41 | Length &operator=(const Length &p) { 42 | v_m = p.v_m; 43 | return *this; 44 | } 45 | 46 | Length &operator+=(const Length &p) { 47 | v_m += p.v_m; 48 | return *this; 49 | } 50 | 51 | Length operator+(const Length &o) const { return Length{v_m + o.v_m}; } 52 | 53 | Length operator-(const Length &o) const { return Length{v_m - o.v_m}; } 54 | 55 | Length operator*(const double o) const { return Length{v_m * o}; } 56 | 57 | Length operator/(const double o) const { return Length{v_m / o}; } 58 | 59 | Length &operator-=(const Length &o) { 60 | v_m -= o.v_m; 61 | return *this; 62 | } 63 | 64 | std::partial_ordering operator<=>(const Length &o) const { return v_m <=> o.v_m; } 65 | 66 | double pt() const { return mm2pt(mm()); } 67 | double mm() const { return 1000 * v_m; } 68 | }; 69 | 70 | inline Length operator*(const double d, const Length l) { return l * d; } 71 | -------------------------------------------------------------------------------- /utils.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include 29 | #include 30 | namespace { 31 | 32 | // Store formatting characters in unused ASCII codepoints during processing. 33 | // 34 | // All characters under 33 are taken except 0, 10 (linefeed). 35 | 36 | std::unordered_map to_internal{ 37 | {'/', 1}, {'*', 2}, {'|', 3}, {'`', 4}, {'#', 5}, {'\\', 6}, {'^', 7}, {'_', 8}}; 38 | std::unordered_map from_internal{ 39 | {1, '/'}, {2, '*'}, {3, '|'}, {4, '`'}, {5, '#'}, {6, '\\'}, {7, '^'}, {8, '_'}}; 40 | 41 | } // namespace 42 | 43 | std::vector split_to_lines(const std::string &in_text) { 44 | std::vector lines; 45 | std::string val; 46 | const char separator = '\n'; 47 | std::vector words; 48 | std::stringstream sstream(in_text); 49 | while(std::getline(sstream, val, separator)) { 50 | while(!val.empty() && val.back() == ' ') { 51 | val.pop_back(); 52 | } 53 | while(!val.empty() && val.front() == ' ') { 54 | val.erase(val.begin()); 55 | } 56 | if(!val.empty()) { 57 | words.push_back(val); 58 | } 59 | } 60 | return words; 61 | } 62 | 63 | std::vector split_to_words(std::string_view in_text) { 64 | std::string text; 65 | text.reserve(in_text.size()); 66 | for(size_t i = 0; i < in_text.size(); ++i) { 67 | if(in_text[i] == '\n') { 68 | text.push_back(' '); 69 | } else { 70 | text.push_back(in_text[i]); 71 | } 72 | } 73 | while(!text.empty() && text.back() == ' ') { 74 | text.pop_back(); 75 | } 76 | while(!text.empty() && 
text.front() == ' ') { 77 | text.erase(text.begin()); 78 | } 79 | std::string val; 80 | const char separator = ' '; 81 | std::vector words; 82 | std::stringstream sstream(text); 83 | while(std::getline(sstream, val, separator)) { 84 | if(!val.empty()) { 85 | words.push_back(val); 86 | } 87 | } 88 | return words; 89 | } 90 | 91 | MMapper::MMapper(const char *path) { 92 | bufsize = std::filesystem::file_size(path); 93 | int fd = open(path, O_RDONLY); 94 | assert(fd > 0); 95 | buf = static_cast(mmap(nullptr, bufsize, PROT_READ, MAP_PRIVATE, fd, 0)); 96 | assert(buf); 97 | close(fd); 98 | } 99 | 100 | MMapper::~MMapper() { munmap((void *)(buf), bufsize); } 101 | 102 | std::vector read_lines(const char *p) { 103 | std::vector lines; 104 | std::ifstream input(p); 105 | assert(!input.fail()); 106 | for(std::string line; std::getline(input, line);) { 107 | if(!g_utf8_validate(line.c_str(), line.length(), nullptr)) { 108 | printf("Invalid UTF-8 in %s.\n", p); 109 | std::abort(); 110 | } 111 | lines.push_back(line); 112 | } 113 | return lines; 114 | } 115 | 116 | std::vector read_paragraphs(const char *p) { 117 | std::vector paragraphs; 118 | std::string buf; 119 | for(const auto &line : read_lines(p)) { 120 | if(line.empty()) { 121 | if(!buf.empty()) { 122 | paragraphs.emplace_back(std::move(buf)); 123 | buf.clear(); 124 | } 125 | } else { 126 | if(buf.empty()) { 127 | buf = line; 128 | } else { 129 | buf += ' '; 130 | buf += line; 131 | } 132 | } 133 | } 134 | if(!buf.empty()) { 135 | paragraphs.emplace_back(std::move(buf)); 136 | } 137 | return paragraphs; 138 | } 139 | 140 | std::string current_date() { 141 | char buf[200]; 142 | time_t t; 143 | struct tm *tmp; 144 | t = time(NULL); 145 | tmp = localtime(&t); 146 | if(tmp == NULL) { 147 | std::abort(); 148 | } 149 | 150 | if(strftime(buf, 200, "%Y-%m-%d", tmp) == 0) { 151 | std::abort(); 152 | } 153 | return std::string{buf}; 154 | } 155 | 156 | char special2internal(char c) { 157 | auto it = to_internal.find(c); 158 | 
if(it == to_internal.end()) { 159 | return c; 160 | } 161 | return it->second; 162 | } 163 | 164 | char internal2special(char c) { 165 | auto it = from_internal.find(c); 166 | if(it == from_internal.end()) { 167 | return c; 168 | } 169 | return it->second; 170 | } 171 | 172 | void restore_special_chars(std::string &s) { 173 | for(size_t i = 0; i < s.size(); ++i) { 174 | s[i] = internal2special(s[i]); 175 | } 176 | } 177 | 178 | int words_in_file(const char *fname) { 179 | int num_words = 0; 180 | std::ifstream ifile(fname); 181 | assert(!ifile.fail()); 182 | std::string line; 183 | std::string word; 184 | while(std::getline(ifile, line)) { 185 | std::stringstream wordstream(line); 186 | while(std::getline(wordstream, word, ' ')) { 187 | ++num_words; 188 | } 189 | } 190 | return num_words; 191 | } 192 | -------------------------------------------------------------------------------- /utils.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | std::vector split_to_words(std::string_view in_text); 24 | std::vector split_to_lines(const std::string &in_text); 25 | 26 | class MMapper { 27 | public: 28 | explicit MMapper(const char *path); 29 | ~MMapper(); 30 | 31 | const char *data() const { return buf; } 32 | int64_t size() const { return bufsize; } 33 | 34 | std::string_view view() const { return std::string_view(buf, bufsize); } 35 | 36 | private: 37 | const char *buf; 38 | int64_t bufsize; 39 | }; 40 | 41 | std::vector read_lines(const char *p); 42 | 43 | std::vector read_paragraphs(const char *p); 44 | 45 | std::string current_date(); 46 | 47 | char special2internal(char c); 48 | char internal2special(char c); 49 | 50 | void restore_special_chars(std::string &s); 51 | 52 | int words_in_file(const char *fname); 53 | -------------------------------------------------------------------------------- /voikkotest.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | 20 | int main(int argc, char **argv) { 21 | if(argc != 2) { 22 | printf("%s \n", argv[0]); 23 | return 1; 24 | } 25 | const char *error; 26 | VoikkoHandle *vh = voikkoInit(&error, "fi", nullptr); 27 | if(!vh) { 28 | printf("Voikko init failed: %s\n", error); 29 | return 1; 30 | } 31 | char *hyphenation = voikkoHyphenateCstr(vh, argv[1]); 32 | printf("%s\n%s\n", argv[1], hyphenation); 33 | voikkoFreeCstr(hyphenation); 34 | voikkoTerminate(vh); 35 | return 0; 36 | } 37 | -------------------------------------------------------------------------------- /wordhyphenator.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "wordhyphenator.hpp" 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | struct WordPieces { 27 | std::string prefix; 28 | std::string core; 29 | std::string suffix; 30 | }; 31 | 32 | namespace { 33 | 34 | const std::array dash_codepoints{0x2d, 0x2012, 0x2014, 0x2212}; 35 | 36 | struct DashSplit { 37 | std::vector words; 38 | std::vector separators; 39 | }; 40 | 41 | bool is_dashlike(uint32_t uchar) { 42 | for(const auto c : dash_codepoints) { 43 | if(c == uchar) { 44 | return true; 45 | } 46 | } 47 | return false; 48 | } 49 | 50 | DashSplit split_at_dashes(const std::string &word) { 51 | DashSplit splits; 52 | std::unique_ptr bufd{new char[word.size() + 10]}; 53 | char *buf = bufd.get(); 54 | int chars_in_buf = 0; 55 | const char *in = word.c_str(); 56 | while(*in) { 57 | auto c = g_utf8_get_char(in); 58 | if(is_dashlike(c)) { 59 | buf[chars_in_buf] = '\0'; 60 | splits.words.emplace_back(buf); 61 | chars_in_buf = 0; 62 | splits.separators.push_back(c); 63 | } else { 64 | chars_in_buf += g_unichar_to_utf8(c, buf + chars_in_buf); 65 | } 66 | in = g_utf8_next_char(in); 67 | } 68 | assert(splits.words.size() == splits.separators.size()); 69 | buf[chars_in_buf] = '\0'; 70 | splits.words.emplace_back(buf); 71 | return splits; 72 | } 73 | 74 | std::string lowerword(const std::string &w) { 75 | std::string lw; 76 | lw.reserve(w.size()); 77 | for(const auto c : w) { 78 | lw.push_back(tolower(c)); 79 | } 80 | return lw; 81 | } 82 | 83 | const char letters[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; 84 | 85 | WordPieces tripartite(const std::string &word) { 86 | std::string_view letterview(letters); 87 | const auto p1 = word.find_first_of(letterview); 88 | if(p1 == std::string::npos) { 89 | return WordPieces{word, "", ""}; 90 | } 91 | const auto p2 = word.find_last_of(letterview); 92 | assert(p2 != std::string::npos); 93 | 94 | return WordPieces{word.substr(0, p1), word.substr(p1, p2 - 
p1 + 1), word.substr(p2 + 1)}; 95 | }; 96 | 97 | std::vector build_hyphenation_data(const std::string &word, 98 | const std::vector &hyphens, 99 | size_t prefix_length) { 100 | std::vector hyphen_points; 101 | for(size_t i = 0; i < word.size(); ++i) { 102 | if(hyphens[i] == char(-1)) { 103 | // The hyphenator library's output is a bit weird. 104 | // When we reach an item it has not touched (i.e. is still 255) 105 | // exit out. 106 | break; 107 | } 108 | if(hyphens[i] & 1) { 109 | hyphen_points.emplace_back(HyphenPoint{i + prefix_length, SplitType::Regular}); 110 | } 111 | } 112 | return hyphen_points; 113 | } 114 | 115 | void hyphenate_and_append(std::string &reconstructed_word, 116 | std::vector &hyphen_points, 117 | const std::string &word, 118 | std::optional separator, 119 | HyphenDict *dict) { 120 | const auto trips = tripartite(word); 121 | char buf[7] = {0, 0, 0, 0, 0, 0, 0}; 122 | if(trips.core.empty()) { 123 | reconstructed_word += word; 124 | if(separator) { 125 | g_unichar_to_utf8(*separator, buf); 126 | reconstructed_word += buf; 127 | } 128 | return; 129 | } 130 | reconstructed_word += trips.prefix; 131 | std::string lowercase_word = lowerword(trips.core); 132 | std::vector output(word.size() * 2 + 1, '\0'); 133 | std::vector hyphens(word.size() + 5, (char)-1); 134 | char **rep = nullptr; 135 | int *pos = nullptr; 136 | int *cut = nullptr; 137 | // NOTE: it seems that this function only works with ASCII 138 | // characters. At least feeding it some UTF-8 text produced 139 | // weird results. Consider checking for this if weird 140 | // errors pop up. 
141 | const auto rc = hnj_hyphen_hyphenate2(dict, 142 | lowercase_word.c_str(), 143 | (int)lowercase_word.size(), 144 | hyphens.data(), 145 | output.data(), 146 | &rep, 147 | &pos, 148 | &cut); 149 | assert(rc == 0); 150 | auto subhyphens = build_hyphenation_data(word, hyphens, reconstructed_word.length()); 151 | free(rep); 152 | free(pos); 153 | free(cut); 154 | hyphen_points.insert(hyphen_points.cend(), subhyphens.begin(), subhyphens.end()); 155 | reconstructed_word += trips.core; 156 | reconstructed_word += trips.suffix; 157 | if(separator) { 158 | g_unichar_to_utf8(*separator, buf); 159 | reconstructed_word += buf; 160 | hyphen_points.emplace_back(HyphenPoint{reconstructed_word.size() - 1, SplitType::NoHyphen}); 161 | } 162 | } 163 | 164 | char *discard_one_letter_syllables(char *hyphen_str) { 165 | const auto word_len = strlen(hyphen_str); 166 | 167 | if(word_len < 3) { 168 | return hyphen_str; 169 | } 170 | if(hyphen_str[1] == '-') { 171 | hyphen_str[1] = ' '; 172 | } 173 | if(hyphen_str[word_len - 1] == '-') { 174 | hyphen_str[word_len - 1] = ' '; 175 | } 176 | for(size_t i = 1; i < word_len - 3; ++i) { 177 | if(hyphen_str[i] == '=') { 178 | if(hyphen_str[i - 1] == '-') { 179 | hyphen_str[i - 1] = ' '; 180 | } 181 | if(hyphen_str[i + 2] == '-') { 182 | hyphen_str[i + 2] = ' '; 183 | } 184 | } 185 | } 186 | return hyphen_str; 187 | } 188 | 189 | } // namespace 190 | 191 | WordHyphenator::WordHyphenator() { 192 | dict = hnj_hyphen_load("/usr/share/hyphen/hyph_en.dic"); 193 | if(!dict) { 194 | dict = hnj_hyphen_load("/usr/share/hyphen/hyph_en_US.dic"); 195 | } 196 | if(!dict) { 197 | printf("Could not load english hyphenation data.\n"); 198 | std::abort(); 199 | } 200 | const char *error; 201 | voikko = voikkoInit(&error, "fi", nullptr); 202 | if(!voikko) { 203 | printf("Voikko init failed: %s\n", error); 204 | std::abort(); 205 | } 206 | } 207 | 208 | WordHyphenator::~WordHyphenator() { 209 | hnj_hyphen_free(dict); 210 | voikkoTerminate(voikko); 211 | } 212 | 
213 | std::vector WordHyphenator::hyphenate(const std::string &word, 214 | const Language lang) const { 215 | assert(word.find(' ') == std::string::npos); 216 | g_utf8_validate(word.c_str(), word.length(), nullptr); 217 | std::vector hyphen_points; 218 | if(lang == Language::Unset) { 219 | // FIXME, split at dashes. 220 | } else if(lang == Language::English) { 221 | std::string reconstructed_word; 222 | reconstructed_word.reserve(word.size()); 223 | // The hyphenation function only deals with lower case single words. 224 | // Attached punctuation, quotes, capital letters etc break it. 225 | // For words like spatio-temporal it splits the individual words but not the hyphen. 226 | std::string_view letterview(letters); 227 | const auto p1 = word.find_first_of(letterview); 228 | if(p1 == std::string::npos) { 229 | // Non-word such as a number or other weird character combinations. 230 | // FIXME to add dashelss hyphenation points for things like 231 | // 1,000,000. 232 | return {}; 233 | } 234 | const auto subwords = split_at_dashes(word); 235 | assert(subwords.words.size() == subwords.separators.size() + 1); 236 | for(size_t ind = 0; ind < subwords.words.size(); ++ind) { 237 | hyphenate_and_append(reconstructed_word, 238 | hyphen_points, 239 | subwords.words[ind], 240 | ind < subwords.separators.size() 241 | ? std::optional{subwords.separators[ind]} 242 | : std::optional{}, 243 | dict); 244 | } 245 | assert(reconstructed_word == word); 246 | } else if(lang == Language::Finnish) { 247 | // The return value is an ASCII string. It lists the hyphenation points 248 | // in characters, but we need them in bytes. 249 | std::string tmp{word}; 250 | size_t popped_chars = 0; 251 | while(!tmp.empty() && 252 | (tmp.back() == ',' || tmp.back() == '.' 
|| tmp.back() == ':' || tmp.back() == ';')) { 253 | tmp.pop_back(); 254 | ++popped_chars; 255 | } 256 | if(popped_chars > 0) { 257 | return hyphenate(tmp, lang); 258 | } 259 | char *hyphenation = discard_one_letter_syllables(voikkoHyphenateCstr(voikko, word.c_str())); 260 | 261 | const char *character_location = word.c_str(); 262 | for(size_t i = 0; hyphenation[i]; ++i) { 263 | auto byte_offset = size_t(character_location - word.c_str()); 264 | switch(hyphenation[i]) { 265 | case '-': 266 | // The weird offset comes from the fact that we need to mimic how the 267 | // english language hyphenator has chosen to set up its offsets. 268 | hyphen_points.emplace_back(HyphenPoint{byte_offset - 1, SplitType::Regular}); 269 | break; 270 | case '=': 271 | hyphen_points.emplace_back(HyphenPoint{byte_offset, SplitType::NoHyphen}); 272 | break; 273 | case ' ': 274 | break; 275 | default: 276 | printf("Unexpected output from hyphenator: %c\n", hyphenation[i]); 277 | std::abort(); 278 | } 279 | character_location = g_utf8_find_next_char(character_location, nullptr); 280 | } 281 | assert(character_location == word.c_str() + word.length()); 282 | voikkoFreeCstr(hyphenation); 283 | } else { 284 | printf("Unkown hyphenation language.\n"); 285 | std::abort(); 286 | } 287 | HyphenatedWord tmp; 288 | tmp.word = word; 289 | tmp.hyphen_points = hyphen_points; 290 | tmp.sanity_check(); 291 | return hyphen_points; 292 | } 293 | 294 | std::vector> 295 | WordHyphenator::hyphenate(const std::vector &words, const Language lang) const { 296 | std::vector> hyphs; 297 | hyphs.reserve(words.size()); 298 | for(const auto &w : words) { 299 | hyphs.emplace_back(hyphenate(w, lang)); 300 | } 301 | return hyphs; 302 | } 303 | 304 | std::string get_visual_string(const std::string &word, 305 | const std::vector hyphen_points) { 306 | std::string dashed_word; 307 | dashed_word.reserve(word.size() + hyphen_points.size()); 308 | size_t hyphen_index = 0; 309 | for(size_t i = 0; i < word.size(); ++i) { 310 | 
dashed_word += word[i]; 311 | if(hyphen_index < hyphen_points.size() && i == hyphen_points[hyphen_index].loc) { 312 | dashed_word += "⬧"; 313 | ++hyphen_index; 314 | } 315 | } 316 | return dashed_word; 317 | } 318 | 319 | std::string HyphenatedWord::get_visual_string() const { 320 | std::string dashed_word; 321 | dashed_word.reserve(word.size() + hyphen_points.size()); 322 | size_t hyphen_index = 0; 323 | for(size_t i = 0; i < word.size(); ++i) { 324 | dashed_word += word[i]; 325 | if(hyphen_index < hyphen_points.size() && i == hyphen_points[hyphen_index].loc) { 326 | dashed_word += "⬧"; 327 | ++hyphen_index; 328 | } 329 | } 330 | return dashed_word; 331 | } 332 | 333 | void HyphenatedWord::sanity_check() const { 334 | for(const auto &h : hyphen_points) { 335 | assert(h.loc < word.length()); 336 | } 337 | } 338 | -------------------------------------------------------------------------------- /wordhyphenator.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Jussi Pakkanen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | #include 25 | 26 | enum class SplitType : int { 27 | Regular, 28 | NoHyphen, 29 | }; 30 | 31 | struct HyphenPoint { 32 | size_t loc; 33 | SplitType type; 34 | 35 | bool operator==(const HyphenPoint &o) const { return loc == o.loc && type == o.type; } 36 | }; 37 | 38 | struct HyphenatedWord { 39 | std::vector hyphen_points; 40 | std::string word; 41 | 42 | std::string get_visual_string() const; 43 | 44 | void sanity_check() const; 45 | }; 46 | 47 | std::string get_visual_string(const std::string &word, 48 | const std::vector hyphen_points); 49 | 50 | class WordHyphenator { 51 | public: 52 | WordHyphenator(); 53 | ~WordHyphenator(); 54 | 55 | std::vector hyphenate(const std::string &word, const Language lang) const; 56 | std::vector> hyphenate(const std::vector &words, 57 | const Language lang) const; 58 | 59 | private: 60 | HyphenDict *dict; 61 | VoikkoHandle *voikko; 62 | }; 63 | --------------------------------------------------------------------------------