├── .github └── workflows │ └── maven-build.yml ├── .gitignore ├── .mvn └── wrapper │ ├── maven-wrapper.jar │ └── maven-wrapper.properties ├── LICENSE.txt ├── README.md ├── mvnw ├── mvnw.cmd ├── pom.xml └── src ├── main └── java │ └── ch │ └── x28 │ └── inscriptis │ ├── CssParse.java │ ├── CssProfile.java │ ├── HtmlElement.java │ ├── HtmlProperties.java │ ├── Inscriptis.java │ ├── Line.java │ ├── ParserConfig.java │ ├── Row.java │ ├── StringUtils.java │ ├── Table.java │ └── TableCell.java └── test ├── java └── ch │ └── x28 │ └── inscriptis │ ├── CssParseTest.java │ ├── HtmlElementTest.java │ ├── InscriptisTest.java │ ├── LineTest.java │ └── TableCellTest.java └── resources └── snippets ├── br-in-table.html ├── br-in-table.txt ├── br-in-table2.html ├── br-li.html ├── br-li.txt ├── br.html ├── br.txt ├── direct-enumeration.html ├── direct-enumeration.txt ├── empty-table.html ├── empty-table.txt ├── enumerations.html ├── enumerations.txt ├── invalid-table.html ├── invalid-table.txt ├── invalid-table2.html ├── invalid-table2.txt ├── invisible.html ├── invisible.txt ├── invisible2.html ├── invisible2.txt ├── nested-table.html ├── nested-table.txt ├── p-br.html ├── p-br.txt ├── pre.html ├── pre.txt ├── table-alignment.html ├── table-alignment.txt ├── table-in-table.html ├── table-in-table.txt ├── table-itemize.html ├── table-itemize.txt ├── table-pre.html ├── table-pre.txt ├── table.html ├── table.txt ├── td-only-table.html ├── td-only-table.txt ├── tr-only-table.html ├── tr-only-table.txt ├── whitespace.html ├── whitespace.txt ├── wikipedia-code.html ├── wikipedia-code.txt ├── wikipedia-enumeration.html ├── wikipedia-enumeration.txt ├── wikipedia-table.html └── wikipedia-table.txt /.github/workflows/maven-build.yml: -------------------------------------------------------------------------------- 1 | name: Maven Build 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v2 11 | - name: Set up JDK 8 12 | uses: 
actions/setup-java@v2 13 | with: 14 | java-version: '8' 15 | distribution: 'adopt' 16 | - name: Build with Maven 17 | run: mvn --batch-mode --update-snapshots verify 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.classpath 2 | /.project 3 | /.settings/ 4 | /target/ -------------------------------------------------------------------------------- /.mvn/wrapper/maven-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/x28/inscriptis-java/639e1661e353337a7c871f4ac4d4460c7317ac64/.mvn/wrapper/maven-wrapper.jar -------------------------------------------------------------------------------- /.mvn/wrapper/maven-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.6.3/apache-maven-3.6.3-bin.zip 2 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | https://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | https://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Maven Build](https://github.com/x28/inscriptis-java/actions/workflows/maven-build.yml/badge.svg)](https://github.com/x28/inscriptis-java/actions/workflows/maven-build.yml) 2 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/ch.x28.inscriptis/inscriptis/badge.svg)](https://maven-badges.herokuapp.com/maven-central/ch.x28.inscriptis/inscriptis) 3 | [![javadoc](https://javadoc.io/badge2/ch.x28.inscriptis/inscriptis/javadoc.svg)](https://javadoc.io/doc/ch.x28.inscriptis/inscriptis) 4 | 5 | # inscriptis - HTML to text conversion library for Java 6 | 7 | A Java-based HTML to text conversion library with support for nested tables and a subset of CSS. Please take a look at the [Rendering document](https://github.com/weblyzard/inscriptis/blob/master/RENDERING.md) for a demonstration of Inscriptis conversion quality. 8 | 9 | This is a Java port of [inscriptis for Python](https://github.com/weblyzard/inscriptis). 
10 | 11 | ## Getting Started 12 | 13 | Here is a quick teaser of an application using inscriptis for Java: 14 | 15 | ```java 16 | package example; 17 | 18 | import org.jsoup.Jsoup; 19 | import org.jsoup.helper.W3CDom; 20 | import org.w3c.dom.Document; 21 | 22 | import ch.x28.inscriptis.Inscriptis; 23 | 24 | public class Example { 25 | 26 | public static void main(String[] args) { 27 | 28 | String htmlContent = "<html><body><h1>Hello World!</h1></body></html>"; 29 | 30 | // use jsoup to parse HTML and convert it to W3C Document (https://jsoup.org) 31 | Document document = W3CDom.convert(Jsoup.parse(htmlContent)); 32 | 33 | Inscriptis inscriptis = new Inscriptis(document); 34 | String text = inscriptis.getText(); 35 | 36 | System.out.println(text); // Hello World! 37 | } 38 | } 39 | ``` 40 | 41 | ## Maven configuration 42 | 43 | Add the Maven dependency: 44 | 45 | ```xml 46 | <dependency> 47 | <groupId>ch.x28.inscriptis</groupId> 48 | <artifactId>inscriptis</artifactId> 49 | <version>1.0</version> 50 | </dependency> 51 | ``` 52 | 53 | ## HTML parser 54 | 55 | inscriptis requires a W3C document, so it's up to you which parser you choose. Here is a list of parsers that support a W3C document result. 56 | 57 | ### jsoup 58 | https://jsoup.org/ 59 | 60 | ### nu-validator HTML Parser 61 | https://mvnrepository.com/artifact/nu.validator/htmlparser 62 | 63 | ## License 64 | 65 | inscriptis for Java is an Open Source software released under the Apache License, Version 2.0 66 | -------------------------------------------------------------------------------- /mvnw: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # ---------------------------------------------------------------------------- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. 
See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # ---------------------------------------------------------------------------- 20 | 21 | # ---------------------------------------------------------------------------- 22 | # Maven Start Up Batch script 23 | # 24 | # Required ENV vars: 25 | # ------------------ 26 | # JAVA_HOME - location of a JDK home dir 27 | # 28 | # Optional ENV vars 29 | # ----------------- 30 | # M2_HOME - location of maven2's installed home dir 31 | # MAVEN_OPTS - parameters passed to the Java VM when running Maven 32 | # e.g. to debug Maven itself, use 33 | # set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 34 | # MAVEN_SKIP_RC - flag to disable loading of mavenrc files 35 | # ---------------------------------------------------------------------------- 36 | 37 | if [ -z "$MAVEN_SKIP_RC" ] ; then 38 | 39 | if [ -f /etc/mavenrc ] ; then 40 | . /etc/mavenrc 41 | fi 42 | 43 | if [ -f "$HOME/.mavenrc" ] ; then 44 | . "$HOME/.mavenrc" 45 | fi 46 | 47 | fi 48 | 49 | # OS specific support. $var _must_ be set to either true or false. 
50 | cygwin=false; 51 | darwin=false; 52 | mingw=false 53 | case "`uname`" in 54 | CYGWIN*) cygwin=true ;; 55 | MINGW*) mingw=true;; 56 | Darwin*) darwin=true 57 | # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home 58 | # See https://developer.apple.com/library/mac/qa/qa1170/_index.html 59 | if [ -z "$JAVA_HOME" ]; then 60 | if [ -x "/usr/libexec/java_home" ]; then 61 | export JAVA_HOME="`/usr/libexec/java_home`" 62 | else 63 | export JAVA_HOME="/Library/Java/Home" 64 | fi 65 | fi 66 | ;; 67 | esac 68 | 69 | if [ -z "$JAVA_HOME" ] ; then 70 | if [ -r /etc/gentoo-release ] ; then 71 | JAVA_HOME=`java-config --jre-home` 72 | fi 73 | fi 74 | 75 | if [ -z "$M2_HOME" ] ; then 76 | ## resolve links - $0 may be a link to maven's home 77 | PRG="$0" 78 | 79 | # need this for relative symlinks 80 | while [ -h "$PRG" ] ; do 81 | ls=`ls -ld "$PRG"` 82 | link=`expr "$ls" : '.*-> \(.*\)$'` 83 | if expr "$link" : '/.*' > /dev/null; then 84 | PRG="$link" 85 | else 86 | PRG="`dirname "$PRG"`/$link" 87 | fi 88 | done 89 | 90 | saveddir=`pwd` 91 | 92 | M2_HOME=`dirname "$PRG"`/.. 93 | 94 | # make it fully qualified 95 | M2_HOME=`cd "$M2_HOME" && pwd` 96 | 97 | cd "$saveddir" 98 | # echo Using m2 at $M2_HOME 99 | fi 100 | 101 | # For Cygwin, ensure paths are in UNIX format before anything is touched 102 | if $cygwin ; then 103 | [ -n "$M2_HOME" ] && 104 | M2_HOME=`cygpath --unix "$M2_HOME"` 105 | [ -n "$JAVA_HOME" ] && 106 | JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 107 | [ -n "$CLASSPATH" ] && 108 | CLASSPATH=`cygpath --path --unix "$CLASSPATH"` 109 | fi 110 | 111 | # For Mingw, ensure paths are in UNIX format before anything is touched 112 | if $mingw ; then 113 | [ -n "$M2_HOME" ] && 114 | M2_HOME="`(cd "$M2_HOME"; pwd)`" 115 | [ -n "$JAVA_HOME" ] && 116 | JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`" 117 | fi 118 | 119 | if [ -z "$JAVA_HOME" ]; then 120 | javaExecutable="`which javac`" 121 | if [ -n "$javaExecutable" ] && ! 
[ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then 122 | # readlink(1) is not available as standard on Solaris 10. 123 | readLink=`which readlink` 124 | if [ ! `expr "$readLink" : '\([^ ]*\)'` = "no" ]; then 125 | if $darwin ; then 126 | javaHome="`dirname \"$javaExecutable\"`" 127 | javaExecutable="`cd \"$javaHome\" && pwd -P`/javac" 128 | else 129 | javaExecutable="`readlink -f \"$javaExecutable\"`" 130 | fi 131 | javaHome="`dirname \"$javaExecutable\"`" 132 | javaHome=`expr "$javaHome" : '\(.*\)/bin'` 133 | JAVA_HOME="$javaHome" 134 | export JAVA_HOME 135 | fi 136 | fi 137 | fi 138 | 139 | if [ -z "$JAVACMD" ] ; then 140 | if [ -n "$JAVA_HOME" ] ; then 141 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 142 | # IBM's JDK on AIX uses strange locations for the executables 143 | JAVACMD="$JAVA_HOME/jre/sh/java" 144 | else 145 | JAVACMD="$JAVA_HOME/bin/java" 146 | fi 147 | else 148 | JAVACMD="`which java`" 149 | fi 150 | fi 151 | 152 | if [ ! -x "$JAVACMD" ] ; then 153 | echo "Error: JAVA_HOME is not defined correctly." >&2 154 | echo " We cannot execute $JAVACMD" >&2 155 | exit 1 156 | fi 157 | 158 | if [ -z "$JAVA_HOME" ] ; then 159 | echo "Warning: JAVA_HOME environment variable is not set." 
160 | fi 161 | 162 | CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher 163 | 164 | # traverses directory structure from process work directory to filesystem root 165 | # first directory with .mvn subdirectory is considered project base directory 166 | find_maven_basedir() { 167 | 168 | if [ -z "$1" ] 169 | then 170 | echo "Path not specified to find_maven_basedir" 171 | return 1 172 | fi 173 | 174 | basedir="$1" 175 | wdir="$1" 176 | while [ "$wdir" != '/' ] ; do 177 | if [ -d "$wdir"/.mvn ] ; then 178 | basedir=$wdir 179 | break 180 | fi 181 | # workaround for JBEAP-8937 (on Solaris 10/Sparc) 182 | if [ -d "${wdir}" ]; then 183 | wdir=`cd "$wdir/.."; pwd` 184 | fi 185 | # end of workaround 186 | done 187 | echo "${basedir}" 188 | } 189 | 190 | # concatenates all lines of a file 191 | concat_lines() { 192 | if [ -f "$1" ]; then 193 | echo "$(tr -s '\n' ' ' < "$1")" 194 | fi 195 | } 196 | 197 | BASE_DIR=`find_maven_basedir "$(pwd)"` 198 | if [ -z "$BASE_DIR" ]; then 199 | exit 1; 200 | fi 201 | 202 | ########################################################################################## 203 | # Extension to allow automatically downloading the maven-wrapper.jar from Maven-central 204 | # This allows using the maven wrapper in projects that prohibit checking in binary data. 205 | ########################################################################################## 206 | if [ -r "$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" ]; then 207 | if [ "$MVNW_VERBOSE" = true ]; then 208 | echo "Found .mvn/wrapper/maven-wrapper.jar" 209 | fi 210 | else 211 | if [ "$MVNW_VERBOSE" = true ]; then 212 | echo "Couldn't find .mvn/wrapper/maven-wrapper.jar, downloading it ..." 
213 | fi 214 | if [ -n "$MVNW_REPOURL" ]; then 215 | jarUrl="$MVNW_REPOURL/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" 216 | else 217 | jarUrl="https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" 218 | fi 219 | while IFS="=" read key value; do 220 | case "$key" in (wrapperUrl) jarUrl="$value"; break ;; 221 | esac 222 | done < "$BASE_DIR/.mvn/wrapper/maven-wrapper.properties" 223 | if [ "$MVNW_VERBOSE" = true ]; then 224 | echo "Downloading from: $jarUrl" 225 | fi 226 | wrapperJarPath="$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" 227 | if $cygwin; then 228 | wrapperJarPath=`cygpath --path --windows "$wrapperJarPath"` 229 | fi 230 | 231 | if command -v wget > /dev/null; then 232 | if [ "$MVNW_VERBOSE" = true ]; then 233 | echo "Found wget ... using wget" 234 | fi 235 | if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then 236 | wget "$jarUrl" -O "$wrapperJarPath" 237 | else 238 | wget --http-user=$MVNW_USERNAME --http-password=$MVNW_PASSWORD "$jarUrl" -O "$wrapperJarPath" 239 | fi 240 | elif command -v curl > /dev/null; then 241 | if [ "$MVNW_VERBOSE" = true ]; then 242 | echo "Found curl ... using curl" 243 | fi 244 | if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then 245 | curl -o "$wrapperJarPath" "$jarUrl" -f 246 | else 247 | curl --user $MVNW_USERNAME:$MVNW_PASSWORD -o "$wrapperJarPath" "$jarUrl" -f 248 | fi 249 | 250 | else 251 | if [ "$MVNW_VERBOSE" = true ]; then 252 | echo "Falling back to using Java to download" 253 | fi 254 | javaClass="$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.java" 255 | # For Cygwin, switch paths to Windows format before running javac 256 | if $cygwin; then 257 | javaClass=`cygpath --path --windows "$javaClass"` 258 | fi 259 | if [ -e "$javaClass" ]; then 260 | if [ ! -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then 261 | if [ "$MVNW_VERBOSE" = true ]; then 262 | echo " - Compiling MavenWrapperDownloader.java ..." 
263 | fi 264 | # Compiling the Java class 265 | ("$JAVA_HOME/bin/javac" "$javaClass") 266 | fi 267 | if [ -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then 268 | # Running the downloader 269 | if [ "$MVNW_VERBOSE" = true ]; then 270 | echo " - Running MavenWrapperDownloader.java ..." 271 | fi 272 | ("$JAVA_HOME/bin/java" -cp .mvn/wrapper MavenWrapperDownloader "$MAVEN_PROJECTBASEDIR") 273 | fi 274 | fi 275 | fi 276 | fi 277 | ########################################################################################## 278 | # End of extension 279 | ########################################################################################## 280 | 281 | export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"} 282 | if [ "$MVNW_VERBOSE" = true ]; then 283 | echo $MAVEN_PROJECTBASEDIR 284 | fi 285 | MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS" 286 | 287 | # For Cygwin, switch paths to Windows format before running java 288 | if $cygwin; then 289 | [ -n "$M2_HOME" ] && 290 | M2_HOME=`cygpath --path --windows "$M2_HOME"` 291 | [ -n "$JAVA_HOME" ] && 292 | JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` 293 | [ -n "$CLASSPATH" ] && 294 | CLASSPATH=`cygpath --path --windows "$CLASSPATH"` 295 | [ -n "$MAVEN_PROJECTBASEDIR" ] && 296 | MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"` 297 | fi 298 | 299 | # Provide a "standardized" way to retrieve the CLI args that will 300 | # work with both Windows and non-Windows executions. 
301 | MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $@" 302 | export MAVEN_CMD_LINE_ARGS 303 | 304 | WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain 305 | 306 | exec "$JAVACMD" \ 307 | $MAVEN_OPTS \ 308 | -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \ 309 | "-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \ 310 | ${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@" 311 | -------------------------------------------------------------------------------- /mvnw.cmd: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Licensed to the Apache Software Foundation (ASF) under one 3 | @REM or more contributor license agreements. See the NOTICE file 4 | @REM distributed with this work for additional information 5 | @REM regarding copyright ownership. The ASF licenses this file 6 | @REM to you under the Apache License, Version 2.0 (the 7 | @REM "License"); you may not use this file except in compliance 8 | @REM with the License. You may obtain a copy of the License at 9 | @REM 10 | @REM http://www.apache.org/licenses/LICENSE-2.0 11 | @REM 12 | @REM Unless required by applicable law or agreed to in writing, 13 | @REM software distributed under the License is distributed on an 14 | @REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | @REM KIND, either express or implied. See the License for the 16 | @REM specific language governing permissions and limitations 17 | @REM under the License. 
18 | @REM ---------------------------------------------------------------------------- 19 | 20 | @REM ---------------------------------------------------------------------------- 21 | @REM Maven Start Up Batch script 22 | @REM 23 | @REM Required ENV vars: 24 | @REM JAVA_HOME - location of a JDK home dir 25 | @REM 26 | @REM Optional ENV vars 27 | @REM M2_HOME - location of maven2's installed home dir 28 | @REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands 29 | @REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a keystroke before ending 30 | @REM MAVEN_OPTS - parameters passed to the Java VM when running Maven 31 | @REM e.g. to debug Maven itself, use 32 | @REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 33 | @REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files 34 | @REM ---------------------------------------------------------------------------- 35 | 36 | @REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on' 37 | @echo off 38 | @REM set title of command window 39 | title %0 40 | @REM enable echoing by setting MAVEN_BATCH_ECHO to 'on' 41 | @if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO% 42 | 43 | @REM set %HOME% to equivalent of $HOME 44 | if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%") 45 | 46 | @REM Execute a user defined script before this one 47 | if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre 48 | @REM check for pre script, once with legacy .bat ending and once with .cmd ending 49 | if exist "%HOME%\mavenrc_pre.bat" call "%HOME%\mavenrc_pre.bat" 50 | if exist "%HOME%\mavenrc_pre.cmd" call "%HOME%\mavenrc_pre.cmd" 51 | :skipRcPre 52 | 53 | @setlocal 54 | 55 | set ERROR_CODE=0 56 | 57 | @REM To isolate internal variables from possible post scripts, we use another setlocal 58 | @setlocal 59 | 60 | @REM ==== START VALIDATION ==== 61 | if not "%JAVA_HOME%" == "" goto OkJHome 62 | 63 | echo. 64 | echo Error: JAVA_HOME not found in your environment. 
>&2 65 | echo Please set the JAVA_HOME variable in your environment to match the >&2 66 | echo location of your Java installation. >&2 67 | echo. 68 | goto error 69 | 70 | :OkJHome 71 | if exist "%JAVA_HOME%\bin\java.exe" goto init 72 | 73 | echo. 74 | echo Error: JAVA_HOME is set to an invalid directory. >&2 75 | echo JAVA_HOME = "%JAVA_HOME%" >&2 76 | echo Please set the JAVA_HOME variable in your environment to match the >&2 77 | echo location of your Java installation. >&2 78 | echo. 79 | goto error 80 | 81 | @REM ==== END VALIDATION ==== 82 | 83 | :init 84 | 85 | @REM Find the project base dir, i.e. the directory that contains the folder ".mvn". 86 | @REM Fallback to current working directory if not found. 87 | 88 | set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR% 89 | IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir 90 | 91 | set EXEC_DIR=%CD% 92 | set WDIR=%EXEC_DIR% 93 | :findBaseDir 94 | IF EXIST "%WDIR%"\.mvn goto baseDirFound 95 | cd .. 96 | IF "%WDIR%"=="%CD%" goto baseDirNotFound 97 | set WDIR=%CD% 98 | goto findBaseDir 99 | 100 | :baseDirFound 101 | set MAVEN_PROJECTBASEDIR=%WDIR% 102 | cd "%EXEC_DIR%" 103 | goto endDetectBaseDir 104 | 105 | :baseDirNotFound 106 | set MAVEN_PROJECTBASEDIR=%EXEC_DIR% 107 | cd "%EXEC_DIR%" 108 | 109 | :endDetectBaseDir 110 | 111 | IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig 112 | 113 | @setlocal EnableExtensions EnableDelayedExpansion 114 | for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! 
%%a 115 | @endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS% 116 | 117 | :endReadAdditionalConfig 118 | 119 | SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe" 120 | set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar" 121 | set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain 122 | 123 | set DOWNLOAD_URL="https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" 124 | 125 | FOR /F "tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO ( 126 | IF "%%A"=="wrapperUrl" SET DOWNLOAD_URL=%%B 127 | ) 128 | 129 | @REM Extension to allow automatically downloading the maven-wrapper.jar from Maven-central 130 | @REM This allows using the maven wrapper in projects that prohibit checking in binary data. 131 | if exist %WRAPPER_JAR% ( 132 | if "%MVNW_VERBOSE%" == "true" ( 133 | echo Found %WRAPPER_JAR% 134 | ) 135 | ) else ( 136 | if not "%MVNW_REPOURL%" == "" ( 137 | SET DOWNLOAD_URL="%MVNW_REPOURL%/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" 138 | ) 139 | if "%MVNW_VERBOSE%" == "true" ( 140 | echo Couldn't find %WRAPPER_JAR%, downloading it ... 141 | echo Downloading from: %DOWNLOAD_URL% 142 | ) 143 | 144 | powershell -Command "&{"^ 145 | "$webclient = new-object System.Net.WebClient;"^ 146 | "if (-not ([string]::IsNullOrEmpty('%MVNW_USERNAME%') -and [string]::IsNullOrEmpty('%MVNW_PASSWORD%'))) {"^ 147 | "$webclient.Credentials = new-object System.Net.NetworkCredential('%MVNW_USERNAME%', '%MVNW_PASSWORD%');"^ 148 | "}"^ 149 | "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; $webclient.DownloadFile('%DOWNLOAD_URL%', '%WRAPPER_JAR%')"^ 150 | "}" 151 | if "%MVNW_VERBOSE%" == "true" ( 152 | echo Finished downloading %WRAPPER_JAR% 153 | ) 154 | ) 155 | @REM End of extension 156 | 157 | @REM Provide a "standardized" way to retrieve the CLI args that will 158 | @REM work with both Windows and non-Windows executions. 
159 | set MAVEN_CMD_LINE_ARGS=%* 160 | 161 | %MAVEN_JAVA_EXE% %JVM_CONFIG_MAVEN_PROPS% %MAVEN_OPTS% %MAVEN_DEBUG_OPTS% -classpath %WRAPPER_JAR% "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" %WRAPPER_LAUNCHER% %MAVEN_CONFIG% %* 162 | if ERRORLEVEL 1 goto error 163 | goto end 164 | 165 | :error 166 | set ERROR_CODE=1 167 | 168 | :end 169 | @endlocal & set ERROR_CODE=%ERROR_CODE% 170 | 171 | if not "%MAVEN_SKIP_RC%" == "" goto skipRcPost 172 | @REM check for post script, once with legacy .bat ending and once with .cmd ending 173 | if exist "%HOME%\mavenrc_post.bat" call "%HOME%\mavenrc_post.bat" 174 | if exist "%HOME%\mavenrc_post.cmd" call "%HOME%\mavenrc_post.cmd" 175 | :skipRcPost 176 | 177 | @REM pause the script if MAVEN_BATCH_PAUSE is set to 'on' 178 | if "%MAVEN_BATCH_PAUSE%" == "on" pause 179 | 180 | if "%MAVEN_TERMINATE_CMD%" == "on" exit %ERROR_CODE% 181 | 182 | exit /B %ERROR_CODE% 183 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | 4.0.0 5 | ch.x28.inscriptis 6 | inscriptis 7 | 1.1-SNAPSHOT 8 | jar 9 | 10 | inscriptis for Java 11 | A Java-based HTML to text conversion library with support for nested tables and a subset of CSS. 
12 | https://github.com/x28/inscriptis-java 13 | 14 | 15 | 16 | Apache License, Version 2.0 17 | https://www.apache.org/licenses/LICENSE-2.0 18 | repo 19 | 20 | 21 | 22 | 23 | x28 AG 24 | https://www.x28.ch 25 | 26 | 27 | 28 | 29 | sw 30 | Sascha Wolski 31 | sascha.wolski at x28.ch 32 | x28 AG 33 | https://www.x28.ch 34 | 35 | Project lead 36 | 37 | +1 38 | 39 | 40 | mh 41 | Matthias Hewelt 42 | matthias.hewelt at x28.ch 43 | x28 AG 44 | https://www.x28.ch 45 | 46 | Project lead 47 | 48 | +1 49 | 50 | 51 | 52 | 53 | https://github.com/x28/inscriptis-java 54 | scm:git:git://github.com/x28/inscriptis-java.git 55 | scm:git:ssh://github.com/x28/inscriptis-java.git 56 | 57 | 58 | 59 | UTF-8 60 | 1.8 61 | 1.8 62 | 63 | 64 | 65 | 66 | org.junit.jupiter 67 | junit-jupiter-api 68 | 5.7.0 69 | test 70 | 71 | 72 | 73 | org.assertj 74 | assertj-core 75 | 3.18.1 76 | test 77 | 78 | 79 | 80 | org.jsoup 81 | jsoup 82 | 1.14.2 83 | test 84 | 85 | 86 | 87 | 88 | 89 | ossrh 90 | https://oss.sonatype.org/content/repositories/snapshots 91 | 92 | 93 | 94 | 95 | 96 | 97 | org.apache.maven.plugins 98 | maven-compiler-plugin 99 | 3.8.0 100 | 101 | 102 | 103 | org.apache.maven.plugins 104 | maven-source-plugin 105 | 3.2.1 106 | 107 | 108 | create-source-jar 109 | 110 | jar-no-fork 111 | 112 | 113 | 114 | 115 | 116 | 117 | org.apache.maven.plugins 118 | maven-javadoc-plugin 119 | 3.1.1 120 | 121 | UTF-8 122 | true 123 | 124 | 125 | 126 | create-javadoc-jar 127 | 128 | jar 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | org.sonatype.plugins 137 | nexus-staging-maven-plugin 138 | 1.6.8 139 | true 140 | 141 | ossrh 142 | https://oss.sonatype.org/ 143 | true 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | release 152 | 153 | 154 | 155 | 156 | 157 | org.apache.maven.plugins 158 | maven-enforcer-plugin 159 | 1.4.1 160 | 161 | 162 | enforce-release-rules 163 | 164 | enforce 165 | 166 | 167 | 168 | 169 | [1.8,1.9) 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 
org.apache.maven.plugins 182 | maven-gpg-plugin 183 | 1.6 184 | 185 | 186 | sign-artifacts 187 | verify 188 | 189 | sign 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /src/main/java/ch/x28/inscriptis/CssParse.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 the original author or authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ch.x28.inscriptis; 17 | 18 | import java.util.Arrays; 19 | import java.util.List; 20 | import java.util.regex.Matcher; 21 | import java.util.regex.Pattern; 22 | 23 | import ch.x28.inscriptis.HtmlProperties.Display; 24 | import ch.x28.inscriptis.HtmlProperties.WhiteSpace; 25 | 26 | /** 27 | * Parses CSS specifications and translates them into the corresponding HtmlElements used by Inscriptis for rendering 28 | * HTML pages. 
29 | * 30 | * @author Sascha Wolski 31 | * @author Matthias Hewelt 32 | */ 33 | class CssParse { 34 | 35 | // used to separate value and unit from each other 36 | private static Pattern RE_UNIT = Pattern.compile("([\\-0-9\\.]+)(\\w+)"); 37 | 38 | // used to validate parsed size units 39 | private static List CSS_RELATIVE_UNITS = Arrays.asList("em", "qem", "rem"); 40 | 41 | // used to chose over whiteSpace values 42 | private static List WHITE_SPACE_NORMAL = Arrays.asList("normal", "nowrap"); 43 | private static List WHITE_SPACE_PRE = Arrays.asList("pre", "pre-line", "pre-wrap"); 44 | 45 | /** 46 | * @param styleAttribute the attribute value of the given style sheet. Example: display: none 47 | * @param htmlElement the HtmlElement to which the given style is applied. 48 | * 49 | * @return An HtmlElement that merges the given element with the style attributes specified. 50 | */ 51 | public static HtmlElement getStyleAttribute(String styleAttribute, HtmlElement htmlElement) { 52 | 53 | HtmlElement customHtmlElement = htmlElement.clone(); 54 | 55 | for (String styleDirective : styleAttribute.toLowerCase().split(";")) { 56 | if (!styleDirective.contains(":")) { 57 | continue; 58 | } 59 | 60 | String[] keyValuePair = StringUtils.split(styleDirective, ':', 1); 61 | if (keyValuePair.length < 2) { 62 | continue; 63 | } 64 | 65 | String key = keyValuePair[0].trim(); 66 | String value = keyValuePair[1].trim(); 67 | 68 | String fieldName = key.replace("-webkit-", ""); 69 | 70 | switch (fieldName) { 71 | case "display": 72 | attributeDisplay(value, customHtmlElement); 73 | break; 74 | case "margin-top": 75 | attributeMarginTop(value, customHtmlElement); 76 | break; 77 | case "margin-bottom": 78 | attributeMarginBottom(value, customHtmlElement); 79 | break; 80 | case "padding-left": 81 | attributePaddingLeft(value, customHtmlElement); 82 | break; 83 | case "white-space": 84 | attributeWhiteSpace(value, customHtmlElement); 85 | break; 86 | default: 87 | break; 88 | } 89 | } 90 | 
91 | return customHtmlElement; 92 | } 93 | 94 | /** 95 | * Set the display value. 96 | */ 97 | private static void attributeDisplay(String value, HtmlElement htmlElement) { 98 | 99 | if (htmlElement.getDisplay() == Display.NONE) 100 | return; 101 | 102 | switch (value) { 103 | case "block": 104 | htmlElement.setDisplay(Display.BLOCK); 105 | break; 106 | case "none": 107 | htmlElement.setDisplay(Display.NONE); 108 | break; 109 | default: 110 | htmlElement.setDisplay(Display.INLINE); 111 | } 112 | } 113 | 114 | /** 115 | * Sets the bottom margin for the given HTML element. 116 | */ 117 | private static void attributeMarginBottom(String value, HtmlElement htmlElement) { 118 | htmlElement.setMarginAfter(getEm(value)); 119 | } 120 | 121 | /** 122 | * Sets the top margin for the given HTML element. 123 | */ 124 | private static void attributeMarginTop(String value, HtmlElement htmlElement) { 125 | htmlElement.setMarginBefore(getEm(value)); 126 | } 127 | 128 | /** 129 | * Sets the left padding for the given HTML element. 130 | */ 131 | private static void attributePaddingLeft(String value, HtmlElement htmlElement) { 132 | htmlElement.setPadding(getEm(value)); 133 | } 134 | 135 | /** 136 | * Set the white-space value. 137 | */ 138 | private static void attributeWhiteSpace(String value, HtmlElement htmlElement) { 139 | 140 | if (WHITE_SPACE_NORMAL.contains(value)) { 141 | htmlElement.setWhitespace(WhiteSpace.NORMAL); 142 | } else if (WHITE_SPACE_PRE.contains(value)) { 143 | htmlElement.setWhitespace(WhiteSpace.PRE); 144 | } 145 | } 146 | 147 | /** 148 | * @param length the length (e.g. 2em, 2px, etc.) as specified in the CSS. 149 | * @return the length in em's. 
150 | */ 151 | private static int getEm(String length) { 152 | 153 | Matcher matcher = RE_UNIT.matcher(length); 154 | 155 | if (matcher.find()) { 156 | float value = Float.parseFloat(matcher.group(1)); 157 | String unit = matcher.group(2); 158 | 159 | if (!CSS_RELATIVE_UNITS.contains(unit)) { 160 | return Math.round(value / 8); 161 | } 162 | 163 | return Math.round(value); 164 | } 165 | 166 | return 0; 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /src/main/java/ch/x28/inscriptis/CssProfile.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 the original author or authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ch.x28.inscriptis; 17 | 18 | import java.util.HashMap; 19 | import java.util.Map; 20 | 21 | import ch.x28.inscriptis.HtmlProperties.Display; 22 | import ch.x28.inscriptis.HtmlProperties.WhiteSpace; 23 | 24 | /** 25 | * Standard CSS profiles shipped with Inscriptis. 
26 | * 27 | * @author Sascha Wolski 28 | * @author Matthias Hewelt 29 | */ 30 | public class CssProfile { 31 | 32 | /** 33 | * This profile corresponds to the defaults used by Firefox 34 | */ 35 | public static CssProfile STRICT; 36 | /** 37 | * This profile is more suited for text analytics, since it ensures that whitespaces are inserted between 38 | * {@code span} and {@code div} elements preventing cases where two words stick together. 39 | */ 40 | public static CssProfile RELAXED; 41 | 42 | static { 43 | Map strict = new HashMap<>(); 44 | strict.put("body", new HtmlElement("body", Display.INLINE, WhiteSpace.NORMAL)); 45 | strict.put("head", new HtmlElement("head", Display.NONE)); 46 | strict.put("link", new HtmlElement("link", Display.NONE)); 47 | strict.put("meta", new HtmlElement("meta", Display.NONE)); 48 | strict.put("script", new HtmlElement("script", Display.NONE)); 49 | strict.put("title", new HtmlElement("title", Display.NONE)); 50 | strict.put("style", new HtmlElement("style", Display.NONE)); 51 | 52 | strict.put("p", new HtmlElement("p", Display.BLOCK, 1, 1)); 53 | strict.put("figure", new HtmlElement("figure", Display.BLOCK, 1, 1)); 54 | strict.put("h1", new HtmlElement("h1", Display.BLOCK, 1, 1)); 55 | strict.put("h2", new HtmlElement("h2", Display.BLOCK, 1, 1)); 56 | strict.put("h3", new HtmlElement("h3", Display.BLOCK, 1, 1)); 57 | strict.put("h4", new HtmlElement("h4", Display.BLOCK, 1, 1)); 58 | strict.put("h5", new HtmlElement("h5", Display.BLOCK, 1, 1)); 59 | strict.put("h6", new HtmlElement("h6", Display.BLOCK, 1, 1)); 60 | 61 | strict.put("ul", new HtmlElement("ul", Display.BLOCK, 0, 0, 4)); 62 | strict.put("ol", new HtmlElement("ol", Display.BLOCK, 0, 0, 4)); 63 | strict.put("li", new HtmlElement("li", Display.BLOCK)); 64 | 65 | strict.put("address", new HtmlElement("address", Display.BLOCK)); 66 | strict.put("article", new HtmlElement("article", Display.BLOCK)); 67 | strict.put("aside", new HtmlElement("aside", Display.BLOCK)); 68 | 
strict.put("div", new HtmlElement("div", Display.BLOCK)); 69 | strict.put("footer", new HtmlElement("footer", Display.BLOCK)); 70 | strict.put("header", new HtmlElement("header", Display.BLOCK)); 71 | strict.put("hgroup", new HtmlElement("hgroup", Display.BLOCK)); 72 | strict.put("layer", new HtmlElement("layer", Display.BLOCK)); 73 | strict.put("main", new HtmlElement("main", Display.BLOCK)); 74 | strict.put("nav", new HtmlElement("nav", Display.BLOCK)); 75 | strict.put("figcaption", new HtmlElement("figcaption", Display.BLOCK)); 76 | strict.put("blockquote", new HtmlElement("blockquote", Display.BLOCK)); 77 | 78 | strict.put("q", new HtmlElement("q", "\"", "\"")); 79 | 80 | // Handling of
 81 | 		strict.put("pre", new HtmlElement("pre", Display.BLOCK, WhiteSpace.PRE));
 82 | 		strict.put("xmp", new HtmlElement("xmp", Display.BLOCK, WhiteSpace.PRE));
 83 | 		strict.put("listing", new HtmlElement("listing", Display.BLOCK, WhiteSpace.PRE));
 84 | 		strict.put("plaintext", new HtmlElement("plaintext", Display.BLOCK, WhiteSpace.PRE));
 85 | 
 86 | 		Map relaxed = new HashMap<>(strict);
 87 | 		relaxed.put("div", new HtmlElement("div", Display.BLOCK, 2));
 88 | 		relaxed.put("span", new HtmlElement("span", Display.INLINE, " ", " ", true));
 89 | 
 90 | 		STRICT = new CssProfile(strict);
 91 | 		RELAXED = new CssProfile(relaxed);
 92 | 	}
 93 | 
 94 | 	private Map settings;
 95 | 
 96 | 	private CssProfile(Map settings) {
 97 | 		this.settings = settings;
 98 | 	}
 99 | 
100 | 	public HtmlElement get(String tag) {
101 | 		return settings.get(tag);
102 | 	}
103 | 
104 | 	public HtmlElement getOrDefault(String tag, HtmlElement defaultElement) {
105 | 
106 | 		HtmlElement htmlElement = settings.get(tag);
107 | 		if (htmlElement != null) {
108 | 			return htmlElement;
109 | 		}
110 | 
111 | 		return defaultElement;
112 | 	}
113 | }
114 | 


--------------------------------------------------------------------------------
/src/main/java/ch/x28/inscriptis/HtmlElement.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2020 the original author or authors.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *      https://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | package ch.x28.inscriptis;
 17 | 
 18 | import ch.x28.inscriptis.HtmlProperties.Display;
 19 | import ch.x28.inscriptis.HtmlProperties.WhiteSpace;
 20 | 
 21 | /**
 22 |  * The HtmlElement class stores the CSS properties.
 23 |  *
 24 |  * @author Sascha Wolski
 25 |  * @author Matthias Hewelt
 26 |  */
 27 | class HtmlElement {
 28 | 
 29 | 	/**
 30 | 	 * Name of the given HtmlElement
 31 | 	 */
 32 | 	private String tag = "/";
 33 | 	/**
 34 | 	 * Specifies a prefix that to insert before the tag's content.
 35 | 	 */
 36 | 	private String prefix = "";
 37 | 	/**
 38 | 	 * A suffix to append after the tag's content.
 39 | 	 */
 40 | 	private String suffix = "";
 41 | 	/**
 42 | 	 * {@link Display} strategy used for the content.
 43 | 	 */
 44 | 	private Display display = null;
 45 | 	/**
 46 | 	 * Vertical margin before the tag's content.
 47 | 	 */
 48 | 	private int marginBefore = 0;
 49 | 	/**
 50 | 	 * Vertical margin after the tag's content.
 51 | 	 */
 52 | 	private int marginAfter = 0;
 53 | 	/**
 54 | 	 * Horizontal padding before the tag's content.
 55 | 	 */
 56 | 	private int padding = 0;
 57 | 	/**
 58 | 	 * {@link WhiteSpace} handling strategy.
 59 | 	 */
 60 | 	private WhiteSpace whitespace = null;
 61 | 	/**
 62 | 	 * Limit printing of whitespace affixes to elements with `normal` whitepsace handling.
 63 | 	 */
 64 | 	private boolean limitWhitespaceAffixes = false;
 65 | 
 66 | 	public HtmlElement() {
 67 | 	}
 68 | 
 69 | 	public HtmlElement(String tag) {
 70 | 		this.tag = tag;
 71 | 	}
 72 | 
 73 | 	public HtmlElement(String tag, Display display) {
 74 | 		this.tag = tag;
 75 | 		this.display = display;
 76 | 	}
 77 | 
 78 | 	public HtmlElement(String tag, Display display, int padding) {
 79 | 
 80 | 		this.tag = tag;
 81 | 		this.display = display;
 82 | 		this.padding = padding;
 83 | 	}
 84 | 
 85 | 	public HtmlElement(String tag, Display display, int marginBefore, int marginAfter) {
 86 | 		this.tag = tag;
 87 | 		this.display = display;
 88 | 		this.marginBefore = marginBefore;
 89 | 		this.marginAfter = marginAfter;
 90 | 	}
 91 | 
 92 | 	public HtmlElement(String tag, Display display, int marginBefore, int marginAfter, int padding) {
 93 | 		this.tag = tag;
 94 | 		this.display = display;
 95 | 		this.marginBefore = marginBefore;
 96 | 		this.marginAfter = marginAfter;
 97 | 		this.padding = padding;
 98 | 	}
 99 | 
100 | 	public HtmlElement(String tag, Display display, String prefix, String suffix, boolean limitWhitespaceAffixes) {
101 | 
102 | 		this.tag = tag;
103 | 		this.prefix = prefix;
104 | 		this.suffix = suffix;
105 | 		this.display = display;
106 | 		this.limitWhitespaceAffixes = limitWhitespaceAffixes;
107 | 	}
108 | 
109 | 	public HtmlElement(String tag, Display display, WhiteSpace whitespace) {
110 | 		this.tag = tag;
111 | 		this.display = display;
112 | 		this.whitespace = whitespace;
113 | 	}
114 | 
115 | 	public HtmlElement(
116 | 		String tag,
117 | 		Display display,
118 | 		WhiteSpace whitespace,
119 | 		String prefix,
120 | 		String suffix,
121 | 		int marginBefore,
122 | 		int marginAfter,
123 | 		int padding,
124 | 		boolean limitWhitespaceAffixes) {
125 | 
126 | 		this.tag = tag;
127 | 		this.prefix = prefix;
128 | 		this.suffix = suffix;
129 | 		this.display = display;
130 | 		this.marginBefore = marginBefore;
131 | 		this.marginAfter = marginAfter;
132 | 		this.padding = padding;
133 | 		this.whitespace = whitespace;
134 | 		this.limitWhitespaceAffixes = limitWhitespaceAffixes;
135 | 	}
136 | 
137 | 	public HtmlElement(String tag, String prefix, String suffix) {
138 | 
139 | 		this.tag = tag;
140 | 		this.prefix = prefix;
141 | 		this.suffix = suffix;
142 | 	}
143 | 
144 | 	/**
145 | 	 * @return a clone of the current HtmlElement
146 | 	 */
147 | 	@Override
148 | 	public HtmlElement clone() {
149 | 
150 | 		return new HtmlElement(
151 | 			tag,
152 | 			display,
153 | 			whitespace,
154 | 			prefix,
155 | 			suffix,
156 | 			marginBefore,
157 | 			marginAfter,
158 | 			padding,
159 | 			limitWhitespaceAffixes);
160 | 	}
161 | 
162 | 	public Display getDisplay() {
163 | 		return display;
164 | 	}
165 | 
166 | 	public int getMarginAfter() {
167 | 		return marginAfter;
168 | 	}
169 | 
170 | 	public int getMarginBefore() {
171 | 		return marginBefore;
172 | 	}
173 | 
174 | 	public int getPadding() {
175 | 		return padding;
176 | 	}
177 | 
178 | 	public String getPrefix() {
179 | 		return prefix;
180 | 	}
181 | 
182 | 	/**
183 | 	 * @param htmlElement the new HtmlElement to be applied to the current context.
184 | 	 * @return the refined element with the context applied.
185 | 	 */
186 | 	public HtmlElement getRefinedHtmlElement(HtmlElement htmlElement) {
187 | 
188 | 		Display display = this.display == Display.NONE
189 | 			? Display.NONE
190 | 			: htmlElement.getDisplay();
191 | 
192 | 		WhiteSpace whiteSpace = null;
193 | 		if (htmlElement.getWhitespace() != null) {
194 | 			whiteSpace = htmlElement.getWhitespace();
195 | 		} else if (this.getWhitespace() != null) {
196 | 			whiteSpace = this.whitespace;
197 | 		}
198 | 
199 | 		// do not display whitespace only affixes in Whitespace.pre areas
200 | 		// if `limit_whitespace_affixes` is set.
201 | 		String prefix = htmlElement.getPrefix();
202 | 		String suffix = htmlElement.getSuffix();
203 | 
204 | 		if (htmlElement.isLimitWhitespaceAffixes() && whiteSpace == WhiteSpace.PRE) {
205 | 			if (StringUtils.isBlank(prefix)) {
206 | 				prefix = "";
207 | 			}
208 | 
209 | 			if (StringUtils.isBlank(suffix)) {
210 | 				suffix = "";
211 | 			}
212 | 		}
213 | 
214 | 		return new HtmlElement(
215 | 			htmlElement.getTag(),
216 | 			display,
217 | 			whiteSpace,
218 | 			prefix,
219 | 			suffix,
220 | 			htmlElement.getMarginBefore(),
221 | 			htmlElement.getMarginAfter(),
222 | 			htmlElement.getPadding(),
223 | 			false);
224 | 	}
225 | 
226 | 	public String getSuffix() {
227 | 		return suffix;
228 | 	}
229 | 
230 | 	public String getTag() {
231 | 		return tag;
232 | 	}
233 | 
234 | 	public WhiteSpace getWhitespace() {
235 | 		return whitespace;
236 | 	}
237 | 
238 | 	public boolean isLimitWhitespaceAffixes() {
239 | 		return limitWhitespaceAffixes;
240 | 	}
241 | 
242 | 	public void setDisplay(Display display) {
243 | 		this.display = display;
244 | 	}
245 | 
246 | 	public void setLimitWhitespaceAffixes(boolean limitWhitespaceAffixes) {
247 | 		this.limitWhitespaceAffixes = limitWhitespaceAffixes;
248 | 	}
249 | 
250 | 	public void setMarginAfter(int marginAfter) {
251 | 		this.marginAfter = marginAfter;
252 | 	}
253 | 
254 | 	public void setMarginBefore(int marginBefore) {
255 | 		this.marginBefore = marginBefore;
256 | 	}
257 | 
258 | 	public void setPadding(int padding) {
259 | 		this.padding = padding;
260 | 	}
261 | 
262 | 	public void setPrefix(String prefix) {
263 | 		this.prefix = prefix;
264 | 	}
265 | 
266 | 	public void setSuffix(String suffix) {
267 | 		this.suffix = suffix;
268 | 	}
269 | 
270 | 	public void setTag(String tag) {
271 | 		this.tag = tag;
272 | 	}
273 | 
274 | 	public void setWhitespace(WhiteSpace whitespace) {
275 | 		this.whitespace = whitespace;
276 | 	}
277 | 
278 | 	@Override
279 | 	public String toString() {
280 | 		return "HtmlElement [tag=" + tag +
281 | 			", display=" + display +
282 | 			", whitespace=" + whitespace +
283 | 			", prefix=" + prefix +
284 | 			", suffix=" + suffix +
285 | 			", marginBefore=" + marginBefore +
286 | 			", marginAfter=" + marginAfter +
287 | 			", padding=" + padding +
288 | 			", limitWhitespaceAffixes=" + limitWhitespaceAffixes + "]";
289 | 	}
290 | 
291 | }
292 | 


--------------------------------------------------------------------------------
/src/main/java/ch/x28/inscriptis/HtmlProperties.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2020 the original author or authors.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *      https://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | package ch.x28.inscriptis;
17 | 
18 | /**
19 |  * @author Sascha Wolski
20 |  * @author Matthias Hewelt
21 |  */
class HtmlProperties {

	/**
	 * Rendering mode of an element's content: inline, block or none (i.e. not rendered). Every constant carries a
	 * numeric code that {@link #getValue()} returns.
	 */
	public enum Display {
		INLINE(1),
		BLOCK(2),
		NONE(3);

		private final int code;

		Display(int code) {
			this.code = code;
		}

		/**
		 * @return the numeric code of this display mode
		 */
		public int getValue() {
			return code;
		}
	}

	/**
	 * Horizontal alignment of content. Every constant carries a single-character code that {@link #getValue()}
	 * returns.
	 */
	public enum HorizontalAlignment {
		LEFT('<'),
		RIGHT('>'),
		CENTER('^');

		private final char symbol;

		HorizontalAlignment(char symbol) {
			this.symbol = symbol;
		}

		/**
		 * @return the character code of this alignment
		 */
		public char getValue() {
			return symbol;
		}
	}

	/**
	 * Whitespace handling used for an HTML element as outlined in the Cascading Style Sheets specification.
	 * <ul>
	 * <li>{@code NORMAL}: sequences of whitespaces will be collapsed into a single one.</li>
	 * <li>{@code PRE}: sequences of whitespaces will be preserved.</li>
	 * </ul>
	 */
	public enum WhiteSpace {
		NORMAL(1),
		PRE(3);

		private final int code;

		WhiteSpace(int code) {
			this.code = code;
		}

		/**
		 * @return the numeric code of this whitespace mode
		 */
		public int getValue() {
			return code;
		}
	}

}
85 | 


--------------------------------------------------------------------------------
/src/main/java/ch/x28/inscriptis/Inscriptis.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2020 the original author or authors.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *      https://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | package ch.x28.inscriptis;
 17 | 
 18 | import java.util.ArrayList;
 19 | import java.util.List;
 20 | import java.util.Stack;
 21 | import java.util.stream.Collectors;
 22 | 
 23 | import org.w3c.dom.Document;
 24 | import org.w3c.dom.NamedNodeMap;
 25 | import org.w3c.dom.Node;
 26 | import org.w3c.dom.NodeList;
 27 | 
 28 | import ch.x28.inscriptis.HtmlProperties.Display;
 29 | import ch.x28.inscriptis.HtmlProperties.WhiteSpace;
 30 | 
 31 | /**
 32 |  * The Inscriptis class translates a W3C document to its corresponding text representation.
 33 |  * 

34 | * Example 35 | * 36 | *

 37 |  * 
 38 |  * Document document = <W3C document>;
 39 |  * Inscriptis inscriptis = new Inscriptis(document);
 40 |  * String text = inscriptis.getText();
 41 |  * 
 42 |  * 
43 | * 44 | * @author Sascha Wolski 45 | * @author Matthias Hewelt 46 | */ 47 | public class Inscriptis { 48 | 49 | private static final String[] UL_COUNTER = { "* ", "+ ", "o ", "- " }; 50 | private static final HtmlElement DEFAULT_ELEMENT = new HtmlElement(); 51 | 52 | private final ParserConfig config; 53 | 54 | private final Stack currentTag; 55 | private final Stack currentLine; 56 | private final Stack nextLine; 57 | /** 58 | * The canvases used for displaying text. cleanTextLines[0] refers to the root canvas; tables write into child 59 | * canvases that are created for every table line and merged with the root canvas at the end of a table. 60 | */ 61 | private Stack> cleanTextLines; 62 | 63 | private Stack currentTable; 64 | private Stack liCounter; 65 | private int liLevel = 0; 66 | private String lastCaption; 67 | private String linkTarget; 68 | 69 | /** 70 | * Translates the given W3C document to its corresponding text representation by using the default 71 | * {@link ParserConfig} with {@link CssProfile#RELAXED}. 72 | * 73 | * @param document the W3C document to convert 74 | */ 75 | public Inscriptis(Document document) { 76 | this(document, new ParserConfig()); 77 | } 78 | 79 | /** 80 | * Translates the given W3C document to its corresponding text representation by using the specified 81 | * {@link ParserConfig}. 
82 | * 83 | * @param document the W3C document to convert 84 | * @param config an optional ParserConfig configuration object 85 | */ 86 | public Inscriptis(Document document, ParserConfig config) { 87 | 88 | this.config = config; 89 | 90 | currentTag = new Stack<>(); 91 | currentLine = new Stack<>(); 92 | nextLine = new Stack<>(); 93 | 94 | currentTag.push(this.config.getCss().get("body")); 95 | currentLine.push(new Line()); 96 | nextLine.push(new Line()); 97 | 98 | // The canvases used for displaying text 99 | // cleanTextLines[0] refers to the root canvas; tables write into child 100 | // canvases that are created for every table line and merged with the 101 | // root canvas at the end of a table 102 | cleanTextLines = new Stack<>(); 103 | cleanTextLines.push(new ArrayList<>()); 104 | 105 | currentTable = new Stack<>(); 106 | liCounter = new Stack<>(); 107 | liLevel = 0; 108 | lastCaption = null; 109 | 110 | // Used if ParserConfig#displayLinks is enabled 111 | linkTarget = ""; 112 | 113 | parseHtmlTree(document); 114 | 115 | if (currentLine.peek() != null) { 116 | writeLine(false); 117 | } 118 | } 119 | 120 | /** 121 | * Returns the text representation of the HTML content. 
122 | * 123 | * @return the text representation of the HTML content 124 | */ 125 | public String getText() { 126 | 127 | String text = cleanTextLines.stream() 128 | .flatMap(lines -> lines.stream()) 129 | .collect(Collectors.joining("\n")); 130 | 131 | return StringUtils.stripTrailing(text); 132 | } 133 | 134 | private void endA() { 135 | 136 | if (!linkTarget.isEmpty()) { 137 | currentLine.peek().addContent(String.format("](%s)", linkTarget)); 138 | } 139 | } 140 | 141 | private void endOl() { 142 | 143 | liLevel -= 1; 144 | liCounter.pop(); 145 | } 146 | 147 | private void endTable() { 148 | 149 | if (!currentTable.isEmpty() && currentTable.peek().isTdOpen()) { 150 | endTd(); 151 | } 152 | 153 | writeLine(false); 154 | 155 | Table table = currentTable.pop(); 156 | writeLineVerbatim(table.getText()); 157 | } 158 | 159 | private void endTd() { 160 | 161 | if (!currentTable.isEmpty() && currentTable.peek().isTdOpen()) { 162 | currentTable.peek().setTdOpen(false); 163 | writeLine(true); 164 | cleanTextLines.pop(); 165 | currentLine.pop(); 166 | nextLine.pop(); 167 | } 168 | } 169 | 170 | private void endUl() { 171 | 172 | liLevel -= 1; 173 | liCounter.pop(); 174 | } 175 | 176 | /** 177 | * @return The bullet that corresponds to the given index. 178 | */ 179 | private String getBullet(int index) { 180 | return UL_COUNTER[index % UL_COUNTER.length]; 181 | } 182 | 183 | /** 184 | * Handels text belonging to HTML tags. 185 | * 186 | * @param data the text to process. 
187 | */ 188 | private void handleData(String data) { 189 | 190 | HtmlElement curTag = currentTag.peek(); 191 | if (curTag.getDisplay() == Display.NONE) { 192 | return; 193 | } 194 | 195 | // protect pre areas 196 | if (curTag.getWhitespace() == WhiteSpace.PRE) { 197 | data = "\0" + data + "\0"; 198 | } 199 | 200 | // add prefix, if present 201 | data = curTag.getPrefix() + data + curTag.getSuffix(); 202 | 203 | // determine whether to add this content to a table column or to a standard line 204 | currentLine.peek().addContent(data); 205 | } 206 | 207 | /** 208 | * Handels HTML end tags. 209 | * 210 | * @param node the HTML end tag to process. 211 | */ 212 | private void handleEndTag(Node node) { 213 | 214 | HtmlElement curTag = currentTag.pop(); 215 | nextLine.peek().setPadding(currentLine.peek().getPadding() - curTag.getPadding()); 216 | currentLine.peek().setMarginAfter(Math.max(currentLine.peek().getMarginAfter(), curTag.getMarginAfter())); 217 | 218 | // flush text after display:block elements 219 | if (curTag.getDisplay() == Display.BLOCK) { 220 | // propagate the new padding to the current line, if nothing has been written 221 | if (!writeLine(false)) { 222 | currentLine.peek().setPadding(nextLine.peek().getPadding()); 223 | } 224 | } 225 | 226 | String tag = node.getNodeName(); 227 | 228 | switch (tag) { 229 | case "table": 230 | endTable(); 231 | break; 232 | case "ul": 233 | endUl(); 234 | break; 235 | case "ol": 236 | endOl(); 237 | break; 238 | case "th": 239 | case "td": 240 | endTd(); 241 | break; 242 | case "a": 243 | if (config.isDisplayAnchors() || config.isDisplayLinks()) { 244 | endA(); 245 | } 246 | break; 247 | } 248 | } 249 | 250 | private void handleStartTag(Node node) { 251 | 252 | String tag = node.getNodeName(); 253 | NamedNodeMap attrs = node.getAttributes(); 254 | 255 | // use the css to handle tags known to it 256 | HtmlElement curTag = currentTag.peek().getRefinedHtmlElement( 257 | config.getCss().getOrDefault(tag, 
Inscriptis.DEFAULT_ELEMENT)); 258 | 259 | Node attrStyle = attrs.getNamedItem("style"); 260 | if (attrStyle != null) { 261 | curTag = CssParse.getStyleAttribute(attrStyle.getNodeValue(), curTag); 262 | } 263 | 264 | currentTag.push(curTag); 265 | 266 | nextLine.peek().setPadding(currentLine.peek().getPadding() + curTag.getPadding()); 267 | 268 | // flush text before display: block elements 269 | if (curTag.getDisplay() == Display.BLOCK) { 270 | if (!writeLine(false)) { 271 | int marginBefore = cleanTextLines.get(0).isEmpty() 272 | ? 0 273 | : Math.max(currentLine.peek().getMarginBefore(), curTag.getMarginBefore()); 274 | 275 | currentLine.peek().setMarginBefore(marginBefore); 276 | currentLine.peek().setPadding(nextLine.peek().getPadding()); 277 | } else { 278 | currentLine.peek().setMarginAfter(Math.max(currentLine.peek().getMarginAfter(), curTag.getMarginAfter())); 279 | } 280 | } 281 | 282 | switch (tag) { 283 | case "table": 284 | startTable(); 285 | break; 286 | case "tr": 287 | startTr(); 288 | break; 289 | case "th": 290 | case "td": 291 | startTd(); 292 | break; 293 | case "ul": 294 | startUl(); 295 | break; 296 | case "ol": 297 | startOl(); 298 | break; 299 | case "li": 300 | startLi(); 301 | break; 302 | case "br": 303 | newline(); 304 | break; 305 | case "a": 306 | if (config.isDisplayAnchors() || config.isDisplayLinks()) { 307 | startA(node.getAttributes()); 308 | } 309 | break; 310 | case "img": 311 | if (config.isDisplayImages()) { 312 | startImg(node.getAttributes()); 313 | } 314 | break; 315 | } 316 | } 317 | 318 | private void newline() { 319 | writeLine(true); 320 | } 321 | 322 | /** 323 | * Parses the HTML tree. 
324 | * 325 | * @param document the W3C document 326 | */ 327 | private void parseHtmlTree(Node node) { 328 | 329 | if (node.getNodeType() != Node.DOCUMENT_NODE && 330 | node.getNodeType() != Node.ELEMENT_NODE && 331 | node.getNodeType() != Node.TEXT_NODE) { 332 | return; 333 | } 334 | 335 | if (node.getNodeType() == Node.ELEMENT_NODE) { 336 | handleStartTag(node); 337 | } 338 | 339 | if (node.getNodeType() == Node.TEXT_NODE) { 340 | String text = node.getNodeValue(); 341 | if (text != null && !text.isEmpty()) { 342 | handleData(text); 343 | } 344 | } 345 | 346 | NodeList children = node.getChildNodes(); 347 | for (int i = 0; i < children.getLength(); i++) { 348 | parseHtmlTree(children.item(i)); 349 | } 350 | 351 | if (node.getNodeType() == Node.ELEMENT_NODE) { 352 | handleEndTag(node); 353 | } 354 | } 355 | 356 | private void startA(NamedNodeMap attributes) { 357 | 358 | linkTarget = ""; 359 | 360 | if (config.isDisplayLinks()) { 361 | Node hrefAttribute = attributes.getNamedItem("href"); 362 | linkTarget = hrefAttribute != null 363 | ? hrefAttribute.getNodeValue() 364 | : ""; 365 | } 366 | 367 | if (config.isDisplayAnchors() && linkTarget.isEmpty()) { 368 | Node nameAttribute = attributes.getNamedItem("name"); 369 | linkTarget = nameAttribute != null 370 | ? 
nameAttribute.getNodeValue() 371 | : ""; 372 | } 373 | 374 | if (!linkTarget.isEmpty()) { 375 | currentLine.peek().addContent("["); 376 | } 377 | } 378 | 379 | private void startImg(NamedNodeMap attributes) { 380 | 381 | String imageText = ""; 382 | 383 | Node altNode = attributes.getNamedItem("alt"); 384 | if (altNode != null) { 385 | imageText = altNode.getNodeValue(); 386 | } else { 387 | Node titleNode = attributes.getNamedItem("title"); 388 | if (titleNode != null) { 389 | imageText = titleNode.getNodeValue(); 390 | } 391 | } 392 | 393 | if (!imageText.isEmpty() && !(config.isDeduplicateCaptions() && imageText.equals(lastCaption))) { 394 | currentLine.peek().addContent(String.format("[%s]", imageText)); 395 | lastCaption = imageText; 396 | } 397 | } 398 | 399 | private void startLi() { 400 | 401 | writeLine(false); 402 | 403 | Object bullet; 404 | if (liLevel > 0) { 405 | bullet = liCounter.peek(); 406 | } else { 407 | bullet = "* "; 408 | } 409 | 410 | if (bullet instanceof Integer) { 411 | int bulletNumber = (int) liCounter.pop(); 412 | liCounter.push(bulletNumber + 1); 413 | currentLine.peek().setListBullet(String.format("%s. ", bulletNumber)); 414 | } else { 415 | currentLine.peek().setListBullet(bullet.toString()); 416 | } 417 | } 418 | 419 | private void startOl() { 420 | liCounter.push(1); 421 | liLevel += 1; 422 | } 423 | 424 | private void startTable() { 425 | currentTable.push(new Table()); 426 | } 427 | 428 | private void startTd() { 429 | 430 | if (currentTable.isEmpty()) { 431 | return; 432 | } 433 | 434 | Table curTable = currentTable.peek(); 435 | 436 | // check whether we need to cleanup a 3 | first
tag that has not been closed yet 437 | if (curTable.isTdOpen()) { 438 | endTd(); 439 | } 440 | 441 | // open td tag 442 | cleanTextLines.push(new ArrayList<>()); 443 | currentLine.push(new Line()); 444 | nextLine.push(new Line()); 445 | curTable.addCell(cleanTextLines.peek()); 446 | curTable.setTdOpen(true); 447 | } 448 | 449 | private void startTr() { 450 | 451 | if (currentTable.isEmpty()) { 452 | return; 453 | } 454 | 455 | Table curTable = currentTable.peek(); 456 | 457 | // check whether we need to cleanup a tag that has not been closed yet 458 | if (curTable.isTdOpen()) { 459 | endTd(); 460 | } 461 | 462 | curTable.addRow(); 463 | } 464 | 465 | private void startUl() { 466 | 467 | liLevel += 1; 468 | liCounter.push(getBullet(liLevel - 1)); 469 | } 470 | 471 | /** 472 | * Writes the current line to the buffer, provided that there is any data to write. 473 | * 474 | * @param force if true, data will be written even if it's empty. 475 | * @return {@code true}, if a line has been written, otherwise {@code false}. 476 | */ 477 | private boolean writeLine(boolean force) { 478 | 479 | // only write the line if it contains relevant content 480 | if (!force && StringUtils.isBlank(currentLine.peek().getContent())) { 481 | currentLine.peek().setMarginBefore(Math.max(currentLine.peek().getMarginBefore(), currentTag.peek().getMarginBefore())); 482 | return false; 483 | } 484 | 485 | String line = currentLine.peek().getText(); 486 | cleanTextLines.peek().add(line); 487 | 488 | currentLine.pop(); 489 | currentLine.push(nextLine.pop()); 490 | nextLine.push(new Line()); 491 | 492 | return true; 493 | } 494 | 495 | /** 496 | * Writes the current buffer without any modifications. 497 | * 498 | * @param text the text to write. 
499 | */ 500 | private void writeLineVerbatim(String text) { 501 | cleanTextLines.peek().add(text); 502 | } 503 | } 504 | -------------------------------------------------------------------------------- /src/main/java/ch/x28/inscriptis/Line.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 the original author or authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ch.x28.inscriptis; 17 | 18 | import java.util.ArrayList; 19 | import java.util.Arrays; 20 | import java.util.List; 21 | import java.util.stream.Collectors; 22 | import java.util.stream.Stream; 23 | 24 | /** 25 | * This class represents a line to render. 
26 | * 27 | * @author Sascha Wolski 28 | * @author Matthias Hewelt 29 | */ 30 | class Line { 31 | 32 | private int marginBefore = 0; 33 | private int marginAfter = 0; 34 | private String prefix = ""; 35 | private String suffix = ""; 36 | private String content = ""; 37 | private String listBullet = ""; 38 | private int padding = 0; 39 | 40 | public void addContent(String content) { 41 | this.content += content; 42 | } 43 | 44 | public String getContent() { 45 | return content; 46 | } 47 | 48 | public String getListBullet() { 49 | return listBullet; 50 | } 51 | 52 | public int getMarginAfter() { 53 | return marginAfter; 54 | } 55 | 56 | public int getMarginBefore() { 57 | return marginBefore; 58 | } 59 | 60 | public int getPadding() { 61 | return padding; 62 | } 63 | 64 | public String getPrefix() { 65 | return prefix; 66 | } 67 | 68 | public String getSuffix() { 69 | return suffix; 70 | } 71 | 72 | /** 73 | * @return the text representation of the current line. 74 | */ 75 | public String getText() { 76 | 77 | List text = new ArrayList<>(); 78 | 79 | if (!content.contains("\0")) { 80 | // standard text without any `WhiteSpace#PRE` formatted text. 81 | text.addAll(Arrays.asList(content.trim().split("\\s+"))); 82 | } else { 83 | // content containing `WhiteSpace#PRE` formatted text 84 | content = content.replace("\0\0", ""); 85 | 86 | String basePadding = StringUtils.repeat(" ", padding); 87 | 88 | int i = 0; 89 | for (String data : content.split("\0")) { 90 | if (i++ % 2 == 0) { 91 | // handle standard content 92 | // python extend filters empty elements 93 | List d = Stream.of(data.trim().split("\\s+")) 94 | .filter(str -> !str.isEmpty()) 95 | .collect(Collectors.toList()); 96 | 97 | text.addAll(d); 98 | } else { 99 | // handle `WhiteSpace#PRE` formatted content. 
100 | text.add(data.replaceAll("\n", "\n" + basePadding)); 101 | } 102 | } 103 | } 104 | 105 | StringBuilder result = new StringBuilder() 106 | .append(StringUtils.repeat("\n", marginBefore)) 107 | .append(StringUtils.repeat(" ", Math.max(0, padding - listBullet.length()))) 108 | .append(listBullet) 109 | .append(prefix) 110 | .append(String.join(" ", text)) 111 | .append(suffix) 112 | .append(StringUtils.repeat("\n", marginAfter)); 113 | 114 | return result.toString(); 115 | } 116 | 117 | /** 118 | * Set the String that will be used as a bullet symbol in a list. 119 | * 120 | * @param listBullet the bullet to be used in a list. 121 | */ 122 | public void setListBullet(String listBullet) { 123 | this.listBullet = listBullet; 124 | } 125 | 126 | /** 127 | * Set the amount of empty lines that will be added after the lines content. 128 | * 129 | * @param marginAfter the number of empty lines 130 | */ 131 | public void setMarginAfter(int marginAfter) { 132 | this.marginAfter = marginAfter; 133 | } 134 | 135 | /** 136 | * Set the amount of empty lines that will be added before the lines content. 137 | * 138 | * @param marginBefore the number of empty lines. 139 | */ 140 | public void setMarginBefore(int marginBefore) { 141 | this.marginBefore = marginBefore; 142 | } 143 | 144 | /** 145 | * Set the amount of horizontal padding (spaces) that will be used to intend the lines content. If a list bullet is 146 | * used, the actual padding will be reduced by the amount of characters of this list bullet. This means the list 147 | * bullet is handled as part of the padding. 148 | * 149 | * @param padding the amount of spaces to be added. 150 | */ 151 | public void setPadding(int padding) { 152 | this.padding = padding; 153 | } 154 | 155 | /** 156 | * Set the String value that will be added in front of the lines content. 157 | * 158 | * @param prefix the string value to be added. 
159 | */ 160 | public void setPrefix(String prefix) { 161 | this.prefix = prefix; 162 | } 163 | 164 | /** 165 | * Set the String value that will be added behind the lines content. 166 | * 167 | * @param suffix the string value to be added. 168 | */ 169 | public void setSuffix(String suffix) { 170 | this.suffix = suffix; 171 | } 172 | 173 | } 174 | -------------------------------------------------------------------------------- /src/main/java/ch/x28/inscriptis/ParserConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 the original author or authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ch.x28.inscriptis; 17 | 18 | /** 19 | * The ParserConfig object encapsulates configuration options and custom CSS definitions used by inscriptis for 20 | * translating HTML to text. 21 | * 22 | * @author Sascha Wolski 23 | * @author Matthias Hewelt 24 | */ 25 | public class ParserConfig { 26 | 27 | private final CssProfile css; 28 | private boolean displayImages = false; 29 | private boolean deduplicateCaptions = false; 30 | private boolean displayLinks = false; 31 | private boolean displayAnchors = false; 32 | 33 | /** 34 | * Creates a new parser configuration with {@link CssProfile#RELAXED}. 
35 | */ 36 | public ParserConfig() { 37 | css = CssProfile.RELAXED; 38 | } 39 | 40 | /** 41 | * Creates a new parser configuration with the given {@link CssProfile}. 42 | * 43 | * @param cssProfile an custom CSS definition, otherwise {@link CssProfile#RELAXED}. 44 | */ 45 | public ParserConfig(CssProfile cssProfile) { 46 | css = cssProfile; 47 | } 48 | 49 | /** 50 | * Returns the configured {@link CssProfile}. 51 | * 52 | * @return the configured {@link CssProfile}. 53 | */ 54 | public CssProfile getCss() { 55 | return css; 56 | } 57 | 58 | /** 59 | * Whether to deduplicate captions such as image titles (many newspaper include images and video previews with 60 | * identifical titles). 61 | * 62 | * @return {@code true} of deduplicate captions, otherwise {@code false}. 63 | */ 64 | public boolean isDeduplicateCaptions() { 65 | return deduplicateCaptions; 66 | } 67 | 68 | /** 69 | * Whether to display anchors (e.g. [here](#here)). 70 | * 71 | * @return {@code true} of display anchors, otherwise {@code false}. 72 | */ 73 | public boolean isDisplayAnchors() { 74 | return displayAnchors; 75 | } 76 | 77 | /** 78 | * Whether to include images alt or title attribute values as text. If an image has both 79 | * alt and title attribute the alt value will be used. 80 | * 81 | * @return {@code true} to include images, otherwise {@code false}. 82 | */ 83 | public boolean isDisplayImages() { 84 | return displayImages; 85 | } 86 | 87 | /** 88 | * Whether to display link targets (e.g. [Python](https://www.python.org)). 89 | * 90 | * @return {@code true} to display links, otherwise {@code false} 91 | */ 92 | public boolean isDisplayLinks() { 93 | return displayLinks; 94 | } 95 | 96 | /** 97 | * Whether to deduplicate captions such as image titles (many newspaper include images and video previews with 98 | * identifical titles). 99 | * 100 | * @param deduplicateCaptions if set to true, successive caption duplicates won't be rendered. 
101 | */ 102 | public void setDeduplicateCaptions(boolean deduplicateCaptions) { 103 | this.deduplicateCaptions = deduplicateCaptions; 104 | } 105 | 106 | /** 107 | * Whether to display anchors (e.g. [here](#here)). 108 | * 109 | * @param displayAnchors if true, anchors will be rendered. 110 | */ 111 | public void setDisplayAnchors(boolean displayAnchors) { 112 | this.displayAnchors = displayAnchors; 113 | } 114 | 115 | /** 116 | * Whether to include images alt or title attribute values as text. If an image has both 117 | * alt and title attribute the alt value will be used. 118 | * 119 | * @param displayImages when true, images alt or title will be rendered. Otherwise no 120 | * information about images will be rendered. 121 | */ 122 | public void setDisplayImages(boolean displayImages) { 123 | this.displayImages = displayImages; 124 | } 125 | 126 | /** 127 | * Whether to display link targets (e.g. [Python](https://www.python.org)). 128 | * 129 | * @param displayLinks if true, link targets will be rendered. 130 | */ 131 | public void setDisplayLinks(boolean displayLinks) { 132 | this.displayLinks = displayLinks; 133 | } 134 | 135 | } 136 | -------------------------------------------------------------------------------- /src/main/java/ch/x28/inscriptis/Row.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 the original author or authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ch.x28.inscriptis; 17 | 18 | import java.util.ArrayList; 19 | import java.util.List; 20 | 21 | /** 22 | * A single row within a table. 23 | * 24 | * @author Sascha Wolski 25 | * @author Matthias Hewelt 26 | */ 27 | class Row { 28 | 29 | private final List columns = new ArrayList<>(); 30 | 31 | private static List> zipLongest(List> lists, String fillValue) { 32 | 33 | // determine longest list 34 | int maxListLength = 0; 35 | for (List list : lists) { 36 | maxListLength = Math.max(maxListLength, list.size()); 37 | } 38 | 39 | List> resultLists = new ArrayList<>(); 40 | 41 | for (int listElementIndex = 0; listElementIndex < maxListLength; listElementIndex++) { 42 | List subList = new ArrayList<>(); 43 | for (List list : lists) { 44 | String element = list.size() > listElementIndex 45 | ? list.get(listElementIndex) 46 | : fillValue; 47 | 48 | subList.add(element); 49 | } 50 | resultLists.add(subList); 51 | } 52 | return resultLists; 53 | } 54 | 55 | /** 56 | * Computes the list of lines in the cell specified by the column_idx. 57 | * 58 | * @param columnIndex The column index of the cell. 59 | * @return The list of lines in the cell specified by the column_idx or an empty list if the column does not exist. 60 | */ 61 | public List getCellLines(int columnIndex) { 62 | 63 | if (columnIndex >= columns.size()) { 64 | return new ArrayList(0); 65 | } 66 | 67 | return columns.get(columnIndex).getCellLines(); 68 | } 69 | 70 | public List getColumns() { 71 | return columns; 72 | } 73 | 74 | /** 75 | * @return A rendered string representation of the given row. 
76 | */ 77 | public String getText() { 78 | 79 | List> lines = new ArrayList<>(); 80 | 81 | for (TableCell column : columns) { 82 | lines.add(column.getCellLines()); 83 | } 84 | 85 | List> longestZip = zipLongest(lines, " "); 86 | 87 | List rowLines = new ArrayList<>(); 88 | for (List list : longestZip) { 89 | rowLines.add(String.join(" ", list)); 90 | } 91 | 92 | return String.join("\n", rowLines); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/main/java/ch/x28/inscriptis/StringUtils.java: -------------------------------------------------------------------------------- 1 | package ch.x28.inscriptis; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | class StringUtils { 7 | 8 | /** 9 | * Check if a string is null, has length zero or consists of whitespace characters only. 10 | * 11 | * @param str the string to be checked 12 | * @return true if str is null or of length zero or consists of whitespace characters only. 13 | */ 14 | public static boolean isBlank(String str) { 15 | 16 | if (str == null || str.length() == 0) { 17 | return true; 18 | } 19 | 20 | for (char c : str.toCharArray()) { 21 | if (!Character.isWhitespace(c)) { 22 | return false; 23 | } 24 | } 25 | 26 | return true; 27 | } 28 | 29 | /** 30 | * Adds as many spaces equally distributed to the front and end of the string as needed to increase its length to 31 | * the provided target length. If the amount of spaces that will be added is an odd number, the left over space will 32 | * be added to the end of the string. 33 | * 34 | * @param str the string to be padded. 35 | * @param targetLength the final length of the string. 36 | * @return a string surrounded with as many space characters as required. Will return null if the providede string 37 | * is null. Will return the original string, if the providede target length isn't greater then the original 38 | * strings length. 
/**
 * String helpers used by the renderer (blank check, padding, repetition, limited split and
 * stripping of trailing whitespace).
 */
class StringUtils {

    /**
     * Check if a string is null, has length zero or consists of whitespace characters only.
     *
     * @param str the string to be checked
     * @return true if str is null or of length zero or consists of whitespace characters only.
     */
    public static boolean isBlank(String str) {

        if (str == null) {
            return true;
        }

        for (int i = 0; i < str.length(); i++) {
            if (!Character.isWhitespace(str.charAt(i))) {
                return false;
            }
        }

        return true;
    }

    /**
     * Adds as many spaces equally distributed to the front and end of the string as needed to increase its length to
     * the provided target length. If the number of spaces to add is odd, the left over space is added to the end of
     * the string.
     *
     * @param str the string to be padded.
     * @param targetLength the final length of the string.
     * @return a string surrounded with as many space characters as required. Returns null if the provided string is
     *         null. Returns the original string if the provided target length isn't greater than its length.
     */
    public static String padCenter(String str, int targetLength) {

        if (str == null) {
            return null;
        }

        int spacesNeeded = targetLength - str.length();
        if (spacesNeeded <= 0) {
            return str;
        }

        int frontSpaces = spacesNeeded / 2;
        int endSpaces = spacesNeeded - frontSpaces;

        return repeat(" ", frontSpaces) + str + repeat(" ", endSpaces);
    }

    /**
     * Adds as many spaces in front of the string as needed to increase its length to the provided target length.
     *
     * @param str the string to be padded.
     * @param targetLength the final length of the string.
     * @return a string prefixed with as many space characters as required. Returns null if the provided string is
     *         null. Returns the original string if the provided target length isn't greater than its length.
     */
    public static String padLeft(String str, int targetLength) {

        if (str == null) {
            return null;
        }

        int spacesNeeded = targetLength - str.length();
        if (spacesNeeded <= 0) {
            return str;
        }

        return repeat(" ", spacesNeeded) + str;
    }

    /**
     * Adds as many spaces to the end of the string as needed to increase its length to the provided target length.
     *
     * @param str the string to be padded.
     * @param targetLength the final length of the string.
     * @return a string suffixed with as many space characters as required. Returns null if the provided string is
     *         null. Returns the original string if the provided target length isn't greater than its length.
     */
    public static String padRight(String str, int targetLength) {

        if (str == null) {
            return null;
        }

        int spacesNeeded = targetLength - str.length();
        if (spacesNeeded <= 0) {
            return str;
        }

        return str + repeat(" ", spacesNeeded);
    }

    /**
     * Repeat a string several times.
     *
     * @param str the string value to be repeated several times.
     * @param repetitions how many times the string should be repeated.
     * @return A new string consisting of <code>repetitions</code> times the value of <code>str</code>. Returns null
     *         if the provided string is null. Returns an empty string if the provided string is empty or
     *         <code>repetitions</code> is less than 1.
     */
    public static String repeat(String str, int repetitions) {

        if (str == null) {
            return null;
        }

        if (str.isEmpty() || repetitions < 1) {
            return "";
        }

        // local buffer, no synchronisation needed
        StringBuilder result = new StringBuilder();
        for (int i = 0; i < repetitions; i++) {
            result.append(str);
        }

        return result.toString();
    }

    /**
     * Split a string by a separator char, performing at most <code>maxSplits</code> splits. The remainder of the
     * string (which may still contain separators) becomes the last element.<br>
     * Example:
     *
     * <pre>
     * split("hello:world", ':', 1)  // ["hello", "world"]
     * split("hello:world", ':', 2)  // ["hello", "world"]
     * split(":helloworld", ':', 1)  // ["", "helloworld"]
     * split(":hello:world", ':', 1) // ["", "hello:world"]
     * split(":hello:world", ':', 2) // ["", "hello", "world"]
     * split("helloworld:", ':', 1)  // ["helloworld", ""]
     * </pre>
     *
     * @param str the string to be split.
     * @param separator the separator char.
     * @param maxSplits the maximum number of splits being performed.
     * @return a string array containing the splits. Returns null if the provided string is null. Returns a string
     *         array with the original string as its only element when the separator is not found or the number of
     *         max splits is lower than 1.
     */
    public static String[] split(String str, char separator, int maxSplits) {

        if (str == null) {
            return null;
        }

        List<String> result = new ArrayList<>();
        String remaining = str;
        int splits = 0;

        while (splits < maxSplits && !remaining.isEmpty()) {

            int separatorIndex = remaining.indexOf(separator);
            if (separatorIndex == -1) {
                break;
            }

            result.add(remaining.substring(0, separatorIndex));
            // substring(length) yields "", which covers a separator at the very end
            remaining = remaining.substring(separatorIndex + 1);

            // count the split, otherwise the limit would never be reached
            splits++;
        }

        result.add(remaining);

        return result.toArray(new String[0]);
    }

    /**
     * Removes any trailing whitespace characters.
     *
     * @param str the string to be processed.
     * @return the string without any trailing whitespace characters. Returns null if the provided string is null.
     */
    public static String stripTrailing(String str) {

        if (str == null) {
            return null;
        }

        int end = str.length();
        while (end > 0 && Character.isWhitespace(str.charAt(end - 1))) {
            end--;
        }

        return str.substring(0, end);
    }
}
36 | */ 37 | public void addCell(List canvas) { 38 | addCell(canvas, HorizontalAlignment.LEFT); 39 | } 40 | 41 | /** 42 | * Adds a new TableCell to the table's last row. If no row exists yet, a new row is created. 43 | */ 44 | public void addCell(List canvas, HorizontalAlignment alignment) { 45 | 46 | if (rows.isEmpty()) { 47 | rows.add(new Row()); 48 | } 49 | 50 | Row last = rows.get(rows.size() - 1); 51 | last.getColumns().add(new TableCell(canvas, alignment, null, null)); 52 | } 53 | 54 | /** 55 | * Adds an empty Row to the table. 56 | */ 57 | public void addRow() { 58 | rows.add(new Row()); 59 | } 60 | 61 | /** 62 | * Compute and set the column width and height for all colls in the table. 63 | */ 64 | public void computeColumnWidthAnHeight() { 65 | 66 | // skip tables with no row 67 | if (rows.isEmpty()) 68 | return; 69 | 70 | // determine row height 71 | for (Row row : rows) { 72 | int maxRowHeight = 1; 73 | for (TableCell col : row.getColumns()) { 74 | maxRowHeight = Math.max(maxRowHeight, col.getCellLines().size()); 75 | } 76 | for (TableCell col : row.getColumns()) { 77 | col.setHeight(maxRowHeight); 78 | } 79 | } 80 | 81 | // determine maximum number of columns 82 | int maxColumns = 0; 83 | for (Row row : rows) { 84 | maxColumns = Math.max(maxColumns, row.getColumns().size()); 85 | } 86 | 87 | for (int columnIndex = 0; columnIndex < maxColumns; columnIndex++) { 88 | // determine max column width by longest cell line per row 89 | int maxColumnWidth = 0; 90 | for (Row row : rows) { 91 | for (String cellLine : row.getCellLines(columnIndex)) { 92 | maxColumnWidth = Math.max(maxColumnWidth, cellLine.length()); 93 | } 94 | } 95 | 96 | // set column width in all rows 97 | for (Row row : rows) { 98 | if (row.getColumns().size() > columnIndex) { 99 | row.getColumns().get(columnIndex).setWidth(maxColumnWidth); 100 | } 101 | } 102 | } 103 | } 104 | 105 | /** 106 | * Get a rendered string representation of this table. 
107 | */ 108 | public String getText() { 109 | 110 | computeColumnWidthAnHeight(); 111 | 112 | List rowContents = new ArrayList<>(); 113 | for (Row row : rows) { 114 | rowContents.add(row.getText()); 115 | } 116 | 117 | return String.join("\n", rowContents); 118 | } 119 | 120 | public boolean isTdOpen() { 121 | return tdOpen; 122 | } 123 | 124 | public void setTdOpen(boolean tdOpen) { 125 | this.tdOpen = tdOpen; 126 | } 127 | 128 | } 129 | -------------------------------------------------------------------------------- /src/main/java/ch/x28/inscriptis/TableCell.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 the original author or authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ch.x28.inscriptis; 17 | 18 | import java.util.ArrayList; 19 | import java.util.Arrays; 20 | import java.util.List; 21 | import java.util.stream.Collectors; 22 | 23 | import ch.x28.inscriptis.HtmlProperties.HorizontalAlignment; 24 | 25 | /** 26 | * A single Table Cell. 
27 | * 28 | * @author Sascha Wolski 29 | * @author Matthias Hewelt 30 | */ 31 | class TableCell { 32 | 33 | private List<String> canvas; 34 | private HorizontalAlignment horizontalAlignment; 35 | private Integer width; 36 | private Integer height; 37 | 38 | /** 39 | * Create a new table cell with the given properties; width and height may be null while they are not known yet. 40 | */ 41 | public TableCell(List<String> canvas, HorizontalAlignment horizontalAlignment, Integer width, Integer height) { 42 | this.canvas = canvas; 43 | this.horizontalAlignment = horizontalAlignment; 44 | this.width = width; 45 | this.height = height; 46 | } 47 | 48 | public List<String> getCanvas() { 49 | return canvas; 50 | } 51 | 52 | /** 53 | * @return a list of all the lines stored within the table cell: every canvas entry split at newlines, padded with empty lines up to the height (if set) and aligned to the width (if set and positive). As a side effect the canvas is rewritten with the split lines. 54 | */ 55 | public List<String> getCellLines() { 56 | 57 | List<String> lines = new ArrayList<>(); 58 | for (String str : this.canvas) { 59 | String[] split = str.split("\n"); 60 | lines.addAll(Arrays.asList(split)); 61 | } 62 | 63 | this.canvas.clear(); 64 | this.canvas.addAll(lines); 65 | 66 | if (this.height != null) { 67 | for (int i = 0; i < this.height - this.canvas.size(); i++) { 68 | lines.add(""); 69 | } 70 | } 71 | 72 | if (this.width != null && this.width > 0) { 73 | lines = lines.stream() 74 | .map(this::alignString) 75 | .collect(Collectors.toList()); 76 | } 77 | 78 | return lines; 79 | } 80 | 81 | public int getHeight() { 82 | return height != null ? height : 0; // an unset (null) height is reported as 0 instead of failing on unboxing 83 | } 84 | 85 | public HorizontalAlignment getHorizontalAlignment() { 86 | return horizontalAlignment; 87 | } 88 | 89 | public int getWidth() { 90 | return width != null ? width : 0; // an unset (null) width is reported as 0 instead of failing on unboxing 91 | } 92 | 93 | /** 94 | * Set the text lines contained in this table cell. 95 | */ 96 | public void setCanvas(List<String> canvas) { 97 | this.canvas = canvas; 98 | } 99 | 100 | /** 101 | * Set the height (amount of lines) of this table cell. 102 | */ 103 | public void setHeight(Integer height) { 104 | this.height = height; 105 | } 106 | 107 | /** 108 | * Set the horizontal alignment of this table cell. 
109 | * 110 | * @param horizontalAlignment one of CENTER, LEFT or RIGHT 111 | */ 112 | public void setHorizontalAlignment(HorizontalAlignment horizontalAlignment) { 113 | this.horizontalAlignment = horizontalAlignment; 114 | } 115 | 116 | /** 117 | * Set the width of the lines in this table cell. 118 | */ 119 | public void setWidth(Integer width) { 120 | this.width = width; 121 | } 122 | 123 | private String alignString(String str) { 124 | // pad str to the cell width according to the horizontal alignment; only called from getCellLines once width is known to be set 125 | switch (horizontalAlignment) { 126 | case CENTER: 127 | return StringUtils.padCenter(str, width); 128 | case LEFT: 129 | return StringUtils.padRight(str, width); 130 | case RIGHT: 131 | return StringUtils.padLeft(str, width); 132 | default: 133 | return StringUtils.padCenter(str, width); 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/test/java/ch/x28/inscriptis/CssParseTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 the original author or authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package ch.x28.inscriptis; 17 | 18 | import static org.assertj.core.api.Assertions.*; 19 | 20 | import org.junit.jupiter.api.Test; 21 | 22 | import ch.x28.inscriptis.HtmlProperties.Display; 23 | 24 | /** Tests for CssParse.getStyleAttribute. 25 | * @author Sascha Wolski 26 | * @author Matthias Hewelt 27 | */ 28 | public class CssParseTest { 29 | 30 | @Test 31 | public void testDisplayBlockWithPadding() { 32 | 33 | // given 34 | CssProfile css = CssProfile.STRICT; 35 | 36 | // when 37 | HtmlElement htmlElement = CssParse.getStyleAttribute("padding-left: 8px; display: block", css.get("div")); 38 | // then: a left padding of 8px is expected to yield a padding of 1 and a block display 39 | assertThat(htmlElement.getPadding()).isEqualTo(1); 40 | assertThat(htmlElement.getDisplay()).isEqualTo(Display.BLOCK); 41 | } 42 | 43 | @Test 44 | public void testDisplayInlineWithMarginBefore() { 45 | 46 | // given 47 | CssProfile css = CssProfile.STRICT; 48 | 49 | // when 50 | HtmlElement htmlElement = CssParse.getStyleAttribute("margin-top: 8em; display: inline", css.get("div")); 51 | 52 | // then: a top margin of 8em is expected to yield a margin-before of 8 and an inline display 53 | assertThat(htmlElement.getMarginBefore()).isEqualTo(8); 54 | assertThat(htmlElement.getDisplay()).isEqualTo(Display.INLINE); 55 | } 56 | 57 | @Test 58 | public void testStyleUnitParsing() { 59 | 60 | // given 61 | // when 62 | HtmlElement htmlElement = CssParse.getStyleAttribute("margin-top:2.666666667em;margin-bottom: 2.666666667em", new HtmlElement()); 63 | 64 | // then: fractional em values are expected to come out as whole numbers (2.67em -> 3, presumably rounded) 65 | assertThat(htmlElement.getMarginBefore()).isEqualTo(3); 66 | assertThat(htmlElement.getMarginAfter()).isEqualTo(3); 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/test/java/ch/x28/inscriptis/HtmlElementTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 the original author or authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ch.x28.inscriptis; 17 | 18 | import static org.assertj.core.api.Assertions.*; 19 | 20 | import org.junit.jupiter.api.Test; 21 | 22 | import ch.x28.inscriptis.HtmlProperties.Display; 23 | import ch.x28.inscriptis.HtmlProperties.WhiteSpace; 24 | 25 | /** 26 | * @author Sascha Wolski 27 | * @author Matthias Hewelt 28 | */ 29 | public class HtmlElementTest { 30 | 31 | @Test 32 | public void testRefinement() { 33 | 34 | HtmlElement span = new HtmlElement("span", Display.INLINE, null, " ", " ", 0, 0, 0, true); 35 | HtmlElement pre = new HtmlElement("pre", Display.BLOCK, WhiteSpace.PRE); 36 | HtmlElement code = new HtmlElement("code"); 37 | 38 | // refinement with pre and whitespaces 39 | HtmlElement refined = pre.getRefinedHtmlElement(span); 40 | assertThat(refined.getPrefix()).isEqualTo(""); 41 | assertThat(refined.getSuffix()).isEqualTo(""); 42 | 43 | // refinement with code and whitespaces 44 | refined = code.getRefinedHtmlElement(span); 45 | assertThat(refined.getPrefix()).isEqualTo(" "); 46 | assertThat(refined.getSuffix()).isEqualTo(" "); 47 | 48 | // refinement with pre and non-whitespaces 49 | span.setPrefix(" 1. "); 50 | span.setSuffix("<"); 51 | refined = pre.getRefinedHtmlElement(span); 52 | assertThat(refined.getPrefix()).isEqualTo(" 1. "); 53 | assertThat(refined.getSuffix()).isEqualTo("<"); 54 | 55 | // refinement with code and non-whitespaces 56 | refined = code.getRefinedHtmlElement(span); 57 | assertThat(refined.getPrefix()).isEqualTo(" 1. 
"); 58 | assertThat(refined.getSuffix()).isEqualTo("<"); 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/test/java/ch/x28/inscriptis/InscriptisTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 the original author or authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ch.x28.inscriptis; 17 | 18 | import static org.assertj.core.api.Assertions.*; 19 | 20 | import java.io.IOException; 21 | import java.net.URISyntaxException; 22 | import java.nio.charset.StandardCharsets; 23 | import java.nio.file.Files; 24 | import java.nio.file.Path; 25 | import java.nio.file.Paths; 26 | import java.util.Set; 27 | import java.util.stream.Collectors; 28 | import java.util.stream.Stream; 29 | 30 | import org.jsoup.Jsoup; 31 | import org.jsoup.helper.W3CDom; 32 | import org.junit.jupiter.api.Test; 33 | import org.w3c.dom.Document; 34 | 35 | /** 36 | * @author Sascha Wolski 37 | * @author Matthias Hewelt 38 | */ 39 | public class InscriptisTest { 40 | 41 | /** 42 | * Converts an HTML string to text, optionally including and deduplicating image captions, displaying link targets 43 | * and using either the standard or extended indentation strategy. 44 | * 45 | * @param htmlContent the HTML string to be converted to text. 46 | * @return The text representation of the HTML content. 
47 | */ 48 | private static String getText(String htmlContent) { 49 | return getText(htmlContent, new ParserConfig()); 50 | } 51 | 52 | /** 53 | * Converts an HTML string to text, optionally including and deduplicating image captions, displaying link targets 54 | * and using either the standard or extended indentation strategy. 55 | * 56 | * @param htmlContent the HTML string to be converted to text. 57 | * @param config an optional ParserConfig object. 58 | * @return The text representation of the HTML content. 59 | */ 60 | private static String getText(String htmlContent, ParserConfig config) { 61 | 62 | if (StringUtils.isBlank(htmlContent)) { 63 | return ""; 64 | } 65 | 66 | htmlContent = htmlContent.trim(); 67 | 68 | Document document = W3CDom.convert(Jsoup.parse(htmlContent)); 69 | Inscriptis inscriptis = new Inscriptis(document, config); 70 | 71 | return inscriptis.getText(); 72 | 73 | } 74 | 75 | @Test 76 | public void testBr() { 77 | 78 | //given 79 | String html = "
" 80 | + "first

"; 81 | 82 | // when 83 | // then 84 | assertThat(getText(html)).isEqualTo("\nfirst"); 85 | } 86 | 87 | @Test 88 | public void testContent() { 89 | 90 | // given 91 | // when 92 | // then 93 | assertThat(getText("first")).isEqualTo("first"); 94 | } 95 | 96 | @Test 97 | public void testDisplayAnchors() { 98 | 99 | // given 100 | String html = "\n" 101 | + " \n" 102 | + " first\n" 103 | + " second\n" 104 | + " \n" 105 | + ""; 106 | 107 | // when 108 | ParserConfig config = new ParserConfig(); 109 | config.setDisplayAnchors(true); 110 | 111 | String text = getText(html, config); 112 | 113 | //then 114 | assertThat(text).isEqualTo("[first](first) second"); 115 | } 116 | 117 | @Test 118 | public void testDisplayImages() { 119 | 120 | // given 121 | String html = "\n" 122 | + " \n" 123 | + " \"Ein\n" 124 | + " \"Ein\n" 125 | + " \"Ein\n" 126 | + " \n" 127 | + ""; 128 | 129 | // when 130 | ParserConfig config = new ParserConfig(); 131 | config.setDisplayImages(true); 132 | 133 | String text = getText(html, config); 134 | 135 | //then 136 | assertThat(text).isEqualTo("[Ein Test Bild] [Ein Test Bild] [Ein zweites Bild]"); 137 | } 138 | 139 | @Test 140 | public void testDisplayImagesDeduplicated() { 141 | 142 | // given 143 | String html = "\n" 144 | + " \n" 145 | + " \"Ein\n" 146 | + " \"Ein\n" 147 | + " \"Ein\n" 148 | + " \n" 149 | + ""; 150 | 151 | // when 152 | ParserConfig config = new ParserConfig(); 153 | config.setDisplayImages(true); 154 | config.setDeduplicateCaptions(true); 155 | 156 | String text = getText(html, config); 157 | 158 | //then 159 | assertThat(text).isEqualTo("[Ein Test Bild] [Ein zweites Bild]"); 160 | } 161 | 162 | @Test 163 | public void testDisplayLinks() { 164 | 165 | // given 166 | String html = "\n" 167 | + " \n" 168 | + " first\n" 169 | + " second\n" 170 | + " third\n" 171 | + " \n" 172 | + ""; 173 | 174 | // when 175 | ParserConfig config = new ParserConfig(); 176 | config.setDisplayLinks(true); 177 | 178 | String text = getText(html, 
config); 179 | 180 | //then 181 | assertThat(text).isEqualTo("[first](first) [second](second) third"); 182 | } 183 | 184 | @Test 185 | public void testDisplayLinksAndAnchors() { 186 | 187 | // given 188 | String html = "\n" 189 | + " \n" 190 | + " first\n" 191 | + " second\n" 192 | + " third\n" 193 | + " \n" 194 | + ""; 195 | 196 | // when 197 | ParserConfig config = new ParserConfig(); 198 | config.setDisplayLinks(true); 199 | config.setDisplayAnchors(true); 200 | 201 | String text = getText(html, config); 202 | 203 | //then 204 | assertThat(text).isEqualTo("[first](first) [second](second) [third](third)"); 205 | } 206 | 207 | @Test 208 | public void testDivs() { 209 | // given 210 | ParserConfig config = new ParserConfig(CssProfile.STRICT); 211 | 212 | // when 213 | // then 214 | assertThat(getText("Thomas
Anton
Maria", config)).isEqualTo("Thomas\nAnton\nMaria"); 215 | assertThat(getText("Thomas
Anna läuft weit weg.
", config)).isEqualTo("Thomas\nAnna läuft weit weg."); 216 | assertThat(getText("Thomas
  • Anton
    Maria
", config)).isEqualTo("Thomas\n * Anton\n Maria"); 217 | assertThat(getText("Thomas
  • Anton
    Maria
", config)).isEqualTo("Thomas\n * Anton\n Maria"); 218 | assertThat(getText("Thomas
  • a
    Anton
    Maria
", config)).isEqualTo("Thomas\n * a\n Anton\n Maria"); 219 | } 220 | 221 | @Test 222 | public void testEmptyAndCorrupt() { 223 | 224 | // given 225 | // when 226 | // then 227 | assertThat(getText("test")).isEqualTo("test"); 228 | assertThat(getText(" ")).isEqualTo(""); 229 | assertThat(getText("")).isEqualTo(""); 230 | assertThat(getText("<<<")).isEqualTo("<<<"); // not equal to python version 231 | } 232 | 233 | @Test 234 | public void testForgottenTdCloseTagOneLine() { 235 | 236 | // given 237 | String html = ("hallo
12
echo"); 238 | 239 | // when 240 | // then 241 | assertThat(getText(html)).isEqualTo("hallo\n1 2\necho"); 242 | } 243 | 244 | @Test 245 | public void testForgottenTdCloseTagTwoLines() { 246 | 247 | // given 248 | String html = ("hallo
12
34
echo"); 249 | 250 | // when 251 | // then 252 | assertThat(getText(html)).isEqualTo("hallo\n1 2\n3 4\necho"); 253 | } 254 | 255 | @Test 256 | public void testHtmlSnippets() throws IOException, URISyntaxException { 257 | 258 | // given 259 | Path path = Paths.get(getClass().getClassLoader().getResource("snippets").toURI()); 260 | 261 | Set textFiles; 262 | try (Stream stream = Files.walk(path)) { 263 | textFiles = stream 264 | .filter(file -> !Files.isDirectory(file)) 265 | .filter(file -> file.getFileName().toString().endsWith(".txt")) 266 | .collect(Collectors.toSet()); 267 | } 268 | 269 | for (Path textFile : textFiles) { 270 | String text = new String(Files.readAllBytes(textFile), StandardCharsets.UTF_8); 271 | String html = new String(Files.readAllBytes(Paths.get(textFile.toString().replace(".txt", ".html"))), StandardCharsets.UTF_8); 272 | 273 | text = StringUtils.stripTrailing(text); 274 | html = "" + html + ""; 275 | 276 | // when 277 | ParserConfig config = new ParserConfig(CssProfile.STRICT); 278 | String result = getText(html, config); 279 | 280 | // then 281 | assertThat(result) 282 | .as(textFile.getFileName().toString()) 283 | .isEqualTo(text); 284 | } 285 | } 286 | 287 | @Test 288 | public void testLimitWhitespaceAffixes() { 289 | 290 | // given 291 | String html = "\n" 292 | + " \n" 293 | + " halloecho\n" 294 | + "
\n"
295 | 			+ "def hallo():\n"
296 | 			+ "   print(\"echo\")\n"
297 | 			+ "    
\n" 298 | + " \n" 299 | + ""; 300 | 301 | // when 302 | String text = getText(html); 303 | 304 | //then 305 | assertThat(text).isEqualTo("hallo echo\ndef hallo():\n print(\"echo\")"); // not equal to python version 306 | } 307 | 308 | @Test 309 | public void testMarginBefore() { 310 | 311 | // given 312 | // when 313 | // then 314 | assertThat(getText("

first

")).isEqualTo("first"); 315 | } 316 | 317 | @Test 318 | public void testMarginBeforeWithLinebreak() { 319 | 320 | // given 321 | String html = "first

" 322 | + "second

"; 323 | 324 | // when 325 | // then 326 | assertThat(getText(html)).isEqualTo("first\nsecond"); 327 | } 328 | 329 | /** 330 | * Ensures that two successive <a>text</a> contain a space between each other, if there is 331 | * a linebreak or space between the tags. 332 | */ 333 | @Test 334 | public void testSuccessiveA() { 335 | 336 | // given 337 | String htmlNoNewLine = "firstsecond"; 338 | String htmlWithNewLine = "first\nsecond"; 339 | 340 | // when 341 | // then 342 | assertThat(getText(htmlNoNewLine)).isEqualTo("firstsecond"); 343 | assertThat(getText(htmlWithNewLine)).isEqualTo("first second"); 344 | } 345 | 346 | @Test 347 | public void testWhiteSpace() { 348 | 349 | // given 350 | ParserConfig config = new ParserConfig(CssProfile.STRICT); 351 | 352 | // when 353 | // then 354 | assertThat(getText("12\n3", config)).isEqualTo("12 3"); 355 | assertThat(getText("12\n3", config)).isEqualTo("12 3"); 356 | assertThat(getText("12\n3", config)).isEqualTo("12\n3"); 357 | assertThat(getText("12\n3", config)).isEqualTo("12\n3"); 358 | assertThat(getText("12\n3", config)).isEqualTo("12\n3"); 359 | } 360 | 361 | /** 362 | * Ensures that xml declaration are correctly stripped. 363 | */ 364 | @Test 365 | public void testXmlDeclaration() { 366 | 367 | // given 368 | // when 369 | // then 370 | assertThat(getText(" Hallo?>")).isEqualTo("Hallo?>"); 371 | } 372 | } 373 | -------------------------------------------------------------------------------- /src/test/java/ch/x28/inscriptis/LineTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 the original author or authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ch.x28.inscriptis; 17 | 18 | import static org.assertj.core.api.Assertions.*; 19 | 20 | import org.junit.jupiter.api.Test; 21 | 22 | /** 23 | * @author Sascha Wolski 24 | * @author Matthias Hewelt 25 | */ 26 | public class LineTest { 27 | 28 | @Test 29 | public void testGetText() { 30 | // given 31 | Line line = new Line(); 32 | line.addContent("Ehre sei Gott!"); 33 | 34 | // when 35 | // then 36 | assertThat(line.getText()).isEqualTo("Ehre sei Gott!"); 37 | 38 | } 39 | 40 | @Test 41 | public void testListBulletWithoutPadding() { 42 | // given 43 | Line line = new Line(); 44 | line.addContent("Ehre sei Gott!"); 45 | line.setListBullet("* "); 46 | 47 | // when 48 | // then 49 | assertThat(line.getText()).isEqualTo("* Ehre sei Gott!"); 50 | } 51 | 52 | @Test 53 | public void testListBulletWithPadding() { 54 | // given 55 | Line line = new Line(); 56 | line.addContent("Ehre sei Gott!"); 57 | line.setListBullet("* "); 58 | line.setPadding(3); 59 | 60 | // when 61 | // then 62 | assertThat(line.getText()).isEqualTo(" * Ehre sei Gott!"); 63 | } 64 | 65 | @Test 66 | public void testMargin() { 67 | // given 68 | Line line = new Line(); 69 | line.addContent("Ehre sei Gott!"); 70 | line.setMarginBefore(1); 71 | line.setMarginAfter(2); 72 | 73 | // when 74 | // then 75 | assertThat(line.getText()).isEqualTo("\nEhre sei Gott!\n\n"); 76 | } 77 | 78 | @Test 79 | public void testPrefixAndSuffix() { 80 | // given 81 | Line line = new Line(); 82 | line.addContent("Ehre sei Gott!"); 83 | line.setPrefix(">>"); 84 | 
line.setSuffix("<<"); 85 | 86 | // when 87 | // then 88 | assertThat(line.getText()).isEqualTo(">>Ehre sei Gott!<<"); 89 | } 90 | 91 | } 92 | -------------------------------------------------------------------------------- /src/test/java/ch/x28/inscriptis/TableCellTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 the original author or authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ch.x28.inscriptis; 17 | 18 | import static org.assertj.core.api.Assertions.*; 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | import org.junit.jupiter.api.Test; 24 | 25 | import ch.x28.inscriptis.HtmlProperties.HorizontalAlignment; 26 | 27 | /** 28 | * @author Sascha Wolski 29 | * @author Matthias Hewelt 30 | */ 31 | public class TableCellTest { 32 | 33 | @Test 34 | public void testLeftAlign() { 35 | 36 | //given 37 | List canvas = new ArrayList<>(); 38 | TableCell cell = new TableCell(canvas, HorizontalAlignment.LEFT, 16, null); 39 | 40 | // when 41 | canvas.add("Ehre sei Gott!"); 42 | 43 | // then 44 | assertThat(cell.getCellLines()) 45 | .first() 46 | .isEqualTo("Ehre sei Gott! 
"); 47 | } 48 | 49 | @Test 50 | public void testRightAlign() { 51 | 52 | //given 53 | List canvas = new ArrayList<>(); 54 | TableCell cell = new TableCell(canvas, HorizontalAlignment.RIGHT, 16, null); 55 | 56 | // when 57 | canvas.add("Ehre sei Gott!"); 58 | 59 | // then 60 | assertThat(cell.getCellLines()) 61 | .first() 62 | .isEqualTo(" Ehre sei Gott!"); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/test/resources/snippets/br-in-table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
First
a special line
SecondThird
abc
11 | -------------------------------------------------------------------------------- /src/test/resources/snippets/br-in-table.txt: -------------------------------------------------------------------------------- 1 | First Second Third 2 | a special line 3 | a b c 4 | -------------------------------------------------------------------------------- /src/test/resources/snippets/br-in-table2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 |
First
a special line
SecondThird
abc
a2c2
  • first
  • second
  • third
c3
last1last2last3
23 | -------------------------------------------------------------------------------- /src/test/resources/snippets/br-li.html: -------------------------------------------------------------------------------- 1 | List 2 |
    3 |
  • first line
    4 | second line 5 |
  • third line 6 |
7 | -------------------------------------------------------------------------------- /src/test/resources/snippets/br-li.txt: -------------------------------------------------------------------------------- 1 | List 2 | * first line 3 | second line 4 | * third line 5 | -------------------------------------------------------------------------------- /src/test/resources/snippets/br.html: -------------------------------------------------------------------------------- 1 | First line
2 | second line 3 | -------------------------------------------------------------------------------- /src/test/resources/snippets/br.txt: -------------------------------------------------------------------------------- 1 | First line 2 | second line 3 | -------------------------------------------------------------------------------- /src/test/resources/snippets/direct-enumeration.html: -------------------------------------------------------------------------------- 1 |
    2 |
  1. First 3 |
  2. Second 4 |
      5 |
    1. Sec, First 6 |
    2. Sec, Second 7 |
        8 |
      • item 9 |
      • item2 10 |
      11 |
    12 |
  3. Third 13 |
14 | -------------------------------------------------------------------------------- /src/test/resources/snippets/direct-enumeration.txt: -------------------------------------------------------------------------------- 1 | 1. First 2 | 2. Second 3 | 1. Sec, First 4 | 2. Sec, Second 5 | o item 6 | o item2 7 | 3. Third 8 | -------------------------------------------------------------------------------- /src/test/resources/snippets/empty-table.html: -------------------------------------------------------------------------------- 1 | 2 | 1 3 |
4 | -------------------------------------------------------------------------------- /src/test/resources/snippets/empty-table.txt: -------------------------------------------------------------------------------- 1 | 1 2 | -------------------------------------------------------------------------------- /src/test/resources/snippets/enumerations.html: -------------------------------------------------------------------------------- 1 | Hallo 2 |
    3 |
  1. First 4 |
  2. Second 5 |
      6 |
    1. Second, First 7 |
    2. Second, Second 8 |
        9 |
      • item 10 |
      • item2 11 |
      12 |
    13 |
  3. Third 14 |
15 | -------------------------------------------------------------------------------- /src/test/resources/snippets/enumerations.txt: -------------------------------------------------------------------------------- 1 | Hallo 2 | 1. First 3 | 2. Second 4 | 1. Second, First 5 | 2. Second, Second 6 | o item 7 | o item2 8 | 3. Third 9 | -------------------------------------------------------------------------------- /src/test/resources/snippets/invalid-table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
FirstSecondThird
anybetacharly
long timeshort timemedium time
15 | -------------------------------------------------------------------------------- /src/test/resources/snippets/invalid-table.txt: -------------------------------------------------------------------------------- 1 | First Second Third 2 | any beta charly 3 | long time short time medium time 4 | -------------------------------------------------------------------------------- /src/test/resources/snippets/invalid-table2.html: -------------------------------------------------------------------------------- 1 | Good day 2 |
second third 4 |
5 | forth 6 | 7 | beta
alpha 8 | epsilon 9 |
gamma 10 |
11 | -------------------------------------------------------------------------------- /src/test/resources/snippets/invalid-table2.txt: -------------------------------------------------------------------------------- 1 | Good day first second third 2 | forth beta 3 | alpha epsilon 4 | gamma 5 | -------------------------------------------------------------------------------- /src/test/resources/snippets/invisible.html: -------------------------------------------------------------------------------- 1 | <ul>hallo 2 | 3 |

Title

4 | noch mehr text 5 | -------------------------------------------------------------------------------- /src/test/resources/snippets/invisible.txt: -------------------------------------------------------------------------------- 1 | Title 2 | -------------------------------------------------------------------------------- /src/test/resources/snippets/invisible2.html: -------------------------------------------------------------------------------- 1 |

Leertest

2 | halloecho 3 | -------------------------------------------------------------------------------- /src/test/resources/snippets/invisible2.txt: -------------------------------------------------------------------------------- 1 | Leertest 2 | -------------------------------------------------------------------------------- /src/test/resources/snippets/nested-table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |
column with nested tablecolumn 2column 3
7 | 8 | 9 | 10 |
nestedtable
12
11 |
TomJoe
lastline
20 | -------------------------------------------------------------------------------- /src/test/resources/snippets/nested-table.txt: -------------------------------------------------------------------------------- 1 | column with nested table column 2 column 3 2 | nested table Tom Joe 3 | 1 2 4 | 5 | last line 6 | -------------------------------------------------------------------------------- /src/test/resources/snippets/p-br.html: -------------------------------------------------------------------------------- 1 | L


2 | B 3 |

4 | Line 5 |
6 |

Another line
7 | Third line

8 |
9 | Forth line 10 | -------------------------------------------------------------------------------- /src/test/resources/snippets/p-br.txt: -------------------------------------------------------------------------------- 1 | L 2 | 3 | 4 | B 5 | 6 | Line 7 | 8 | Another line 9 | Third line 10 | 11 | Forth line 12 | -------------------------------------------------------------------------------- /src/test/resources/snippets/pre.html: -------------------------------------------------------------------------------- 1 |

Pre elements

2 | 3 |
 4 | b = 1
 5 | for a in range(10):
 6 |    print(a)
 7 |    b *= a
 8 |    print(b)
 9 | 
10 | 11 |

A pre block within an enumeration

12 | 13 |
    14 |
  • Hallo
  • 15 |
  • b = 1
    16 | for a in range(10):
    17 |    print(a)
    18 |    b *= a
    19 |    print(b)
    20 | 
  • 21 |
  • Echo
  • 22 | 23 | -------------------------------------------------------------------------------- /src/test/resources/snippets/pre.txt: -------------------------------------------------------------------------------- 1 | Pre elements 2 | 3 | b = 1 4 | for a in range(10): 5 | print(a) 6 | b *= a 7 | print(b) 8 | 9 | 10 | A pre block within an enumeration 11 | 12 | * Hallo 13 | * b = 1 14 | for a in range(10): 15 | print(a) 16 | b *= a 17 | print(b) 18 | 19 | * Echo 20 | -------------------------------------------------------------------------------- /src/test/resources/snippets/table-alignment.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |
    TitelBeschreibungLänge
    123
    Der Prinz von ÄgyptenBasierend auf dem Buch Exodus99 min
    Leo Lausemaus Der Läusebub99 min
    8 | 9 | -------------------------------------------------------------------------------- /src/test/resources/snippets/table-alignment.txt: -------------------------------------------------------------------------------- 1 | Titel Beschreibung Länge 2 | 1 2 3 3 | Der Prinz von Ägypten Basierend auf dem Buch Exodus 99 min 4 | Leo Lausemaus Der Läusebub 99 min 5 | -------------------------------------------------------------------------------- /src/test/resources/snippets/table-in-table.html: -------------------------------------------------------------------------------- 1 |

    Single

    2 | 3 |

    First

    4 | 5 | 6 |
    redgreen
    blue
    redgreen
    7 | 8 |

    Second

    9 | 10 | 11 |
    blue
    redgreen
    blue
    12 | 13 |

    Nested

    14 | 15 | 16 | 19 | 22 | 25 | 26 | 27 | 30 | 33 | 36 | 37 | 38 | 41 | 44 | 47 | 48 |
    17 | 18 |
    redgreen
    blue
    redgreen
    20 | 21 |
    blue
    redgreen
    blue
    23 | 24 |
    blue
    redgreen
    blue
    28 | 29 |
    blue
    redgreen
    blue
    31 | 32 |
    redgreen
    blue
    redgreen
    34 | 35 |
    blue
    redgreen
    blue
    39 | 40 |
    redgreen
    blue
    redgreen
    42 | 43 |
    blue
    redgreen
    blue
    45 | 46 |
    blue
    redgreen
    blue
    49 | -------------------------------------------------------------------------------- /src/test/resources/snippets/table-in-table.txt: -------------------------------------------------------------------------------- 1 | Single 2 | 3 | 4 | First 5 | 6 | red green 7 | blue 8 | red green 9 | 10 | Second 11 | 12 | blue 13 | red green 14 | blue 15 | 16 | Nested 17 | 18 | red green blue blue 19 | blue red green red green 20 | red green blue blue 21 | 22 | blue red green blue 23 | red green blue red green 24 | blue red green blue 25 | 26 | red green blue blue 27 | blue red green red green 28 | red green blue blue 29 | -------------------------------------------------------------------------------- /src/test/resources/snippets/table-itemize.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 |

    An der Gewerbeausstellung vom 1.-3.September sind wir nicht persönlich anwesend. 5 |

    6 |
    8 | -------------------------------------------------------------------------------- /src/test/resources/snippets/table-itemize.txt: -------------------------------------------------------------------------------- 1 | * aktuell An der Gewerbeausstellung vom 1.-3.September sind wir nicht persönlich anwesend. 2 | * projekte 3 | * zu verkaufen 4 | * offene stelle 5 | -------------------------------------------------------------------------------- /src/test/resources/snippets/table-pre.html: -------------------------------------------------------------------------------- 1 |

    Pre elements that have been nested in a table.

    2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 18 | 28 | 31 | 34 | 35 |
    PythonJava
    10 |
    11 | b = 1
    12 | for a in range(10):
    13 |    print(a)
    14 |    b *= a
    15 |    print(b)
    16 | 
    17 |
    19 |
    20 | int b = 1;
    21 | for (int a=0; a<10; a++) {
    22 |    System.out.println(a);
    23 |    b = b * a;
    24 |    System.out.println(b);
    25 | }
    26 | 
    27 |
    29 | 3.8 30 | 32 | 14 33 |
    36 | 37 | -------------------------------------------------------------------------------- /src/test/resources/snippets/table-pre.txt: -------------------------------------------------------------------------------- 1 | Pre elements that have been nested in a table. 2 | 3 | Python Java 4 | b = 1 int b = 1; 5 | for a in range(10): for (int a=0; a<10; a++) { 6 | print(a) System.out.println(a); 7 | b *= a b = b * a; 8 | print(b) System.out.println(b); 9 | } 10 | 11 | 3.8 14 12 | -------------------------------------------------------------------------------- /src/test/resources/snippets/table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
    FirstSecondThird
    abc
    11 | -------------------------------------------------------------------------------- /src/test/resources/snippets/table.txt: -------------------------------------------------------------------------------- 1 | First Second Third 2 | a b c 3 | -------------------------------------------------------------------------------- /src/test/resources/snippets/td-only-table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
    123
    6 | -------------------------------------------------------------------------------- /src/test/resources/snippets/td-only-table.txt: -------------------------------------------------------------------------------- 1 | 1 2 3 2 | -------------------------------------------------------------------------------- /src/test/resources/snippets/tr-only-table.html: -------------------------------------------------------------------------------- 1 | 2 | 1 3 | 2 4 | 3 5 |
    6 | -------------------------------------------------------------------------------- /src/test/resources/snippets/tr-only-table.txt: -------------------------------------------------------------------------------- 1 | 1 2 3 2 | 3 | 4 | -------------------------------------------------------------------------------- /src/test/resources/snippets/whitespace.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Das 5 | ist 6 | interessant 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /src/test/resources/snippets/whitespace.txt: -------------------------------------------------------------------------------- 1 | 2 | Das 3 | ist 4 | interessant 5 | -------------------------------------------------------------------------------- /src/test/resources/snippets/wikipedia-code.html: -------------------------------------------------------------------------------- 1 |

    Pythons Schlüsselwort lambda könnte manche Anhänger der funktionalen Programmierung fehlleiten. Solche lambda-Blöcke in Python können nur Ausdrücke enthalten, aber keine Anweisungen. Damit werden solche Anweisungen generell nicht verwendet, um eine Funktion zurückzugeben. Die übliche Vorgehensweise ist stattdessen, den Namen einer lokalen Funktion zurückzugeben. Das folgende Beispiel zeigt dies anhand einer einfachen Funktion nach den Ideen von Haskell Brooks Curry: 2 |

    3 |
    def add_and_print_maker(x):
     4 |     def temp(y):
     5 |         print("{} + {} = {}".format(x, y, x + y))
     6 | 
     7 |     return temp
     8 | 
    9 |

    Damit ist auch Currying auf einfache Art möglich, um generische Funktionsobjekte auf problemspezifische herunterzubrechen. Hier ein einfaches Beispiel: 10 |

    11 |
    def curry(func, known_argument):
    12 |     return lambda unknown_argument: func(unknown_argument, known_argument)
    13 | 
    14 |

    Wird die curry-Funktion aufgerufen, erwartet diese eine Funktion mit zwei notwendigen Parametern sowie die Parameterbelegung für den zweiten Parameter dieser Funktion. Der Rückgabewert von curry ist eine Funktion, die das Gleiche tut wie func, aber nur noch einen Parameter benötigt. 15 | -------------------------------------------------------------------------------- /src/test/resources/snippets/wikipedia-code.txt: -------------------------------------------------------------------------------- 1 | Pythons Schlüsselwort lambda könnte manche Anhänger der funktionalen Programmierung fehlleiten. Solche lambda-Blöcke in Python können nur Ausdrücke enthalten, aber keine Anweisungen. Damit werden solche Anweisungen generell nicht verwendet, um eine Funktion zurückzugeben. Die übliche Vorgehensweise ist stattdessen, den Namen einer lokalen Funktion zurückzugeben. Das folgende Beispiel zeigt dies anhand einer einfachen Funktion nach den Ideen von Haskell Brooks Curry: 2 | 3 | def add_and_print_maker(x): 4 | def temp(y): 5 | print("{} + {} = {}".format(x, y, x + y)) 6 | 7 | return temp 8 | 9 | 10 | Damit ist auch Currying auf einfache Art möglich, um generische Funktionsobjekte auf problemspezifische herunterzubrechen. Hier ein einfaches Beispiel: 11 | 12 | def curry(func, known_argument): 13 | return lambda unknown_argument: func(unknown_argument, known_argument) 14 | 15 | 16 | Wird die curry-Funktion aufgerufen, erwartet diese eine Funktion mit zwei notwendigen Parametern sowie die Parameterbelegung für den zweiten Parameter dieser Funktion. Der Rückgabewert von curry ist eine Funktion, die das Gleiche tut wie func, aber nur noch einen Parameter benötigt. 17 | -------------------------------------------------------------------------------- /src/test/resources/snippets/wikipedia-enumeration.html: -------------------------------------------------------------------------------- 1 |

    2 | Inhaltsverzeichnis 3 |
    4 | 62 | -------------------------------------------------------------------------------- /src/test/resources/snippets/wikipedia-enumeration.txt: -------------------------------------------------------------------------------- 1 | Inhaltsverzeichnis 2 | * 1 Name und Aussprache 3 | * 2 Geographie 4 | + 2.1 Stadtquartiere 5 | + 2.2 Klima 6 | * 3 Geschichte 7 | + 3.1 Vorrömische Zeit 8 | + 3.2 Antike 9 | + 3.3 Mittelalter 10 | + 3.4 Wende zur Neuzeit 11 | + 3.5 Reformation und Dreissigjähriger Krieg 12 | + 3.6 19. Jahrhundert 13 | + 3.7 Moderne und Gegenwart 14 | * 4 Bevölkerung 15 | + 4.1 Sprachen 16 | + 4.2 Religionen 17 | * 5 Wappen 18 | * 6 Politik 19 | + 6.1 Stadtpräsidenten 20 | + 6.2 Partnerstädte 21 | * 7 Wirtschaft und Infrastruktur 22 | + 7.1 Wirtschaft 23 | + 7.2 Land- und Alpwirtschaft 24 | + 7.3 Verkehr 25 | + 7.4 Bildung 26 | + 7.5 Medien 27 | + 7.6 Kultur 28 | + 7.7 Justiz 29 | + 7.8 Friedhöfe 30 | + 7.9 Sportvereine 31 | * 8 Sehenswürdigkeiten und Tourismus 32 | + 8.1 Tourismus 33 | * 9 Besonderes 34 | * 10 Galerie 35 | * 11 Persönlichkeiten 36 | * 12 Siehe auch 37 | * 13 Literatur 38 | * 14 Weblinks 39 | * 15 Einzelnachweise 40 | -------------------------------------------------------------------------------- /src/test/resources/snippets/wikipedia-table.html: -------------------------------------------------------------------------------- 1 |

    Bevölkerung[Bearbeiten]

    2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 |
    Bevölkerungsentwicklung[6]
    Jahr150018601900195019702000200520112012
    Einwohnerca. 1500399011'53219'38231'19332'98932'40936'69037'036
    31 | -------------------------------------------------------------------------------- /src/test/resources/snippets/wikipedia-table.txt: -------------------------------------------------------------------------------- 1 | Bevölkerung[Bearbeiten] 2 | 3 | Bevölkerungsentwicklung[6] 4 | Jahr 1500 1860 1900 1950 1970 2000 2005 2011 2012 5 | Einwohner ca. 1500 3990 11'532 19'382 31'193 32'989 32'409 36'690 37'036 6 | --------------------------------------------------------------------------------