├── .github
└── workflows
│ └── maven-build.yml
├── .gitignore
├── .mvn
└── wrapper
│ ├── maven-wrapper.jar
│ └── maven-wrapper.properties
├── LICENSE.txt
├── README.md
├── mvnw
├── mvnw.cmd
├── pom.xml
└── src
├── main
└── java
│ └── ch
│ └── x28
│ └── inscriptis
│ ├── CssParse.java
│ ├── CssProfile.java
│ ├── HtmlElement.java
│ ├── HtmlProperties.java
│ ├── Inscriptis.java
│ ├── Line.java
│ ├── ParserConfig.java
│ ├── Row.java
│ ├── StringUtils.java
│ ├── Table.java
│ └── TableCell.java
└── test
├── java
└── ch
│ └── x28
│ └── inscriptis
│ ├── CssParseTest.java
│ ├── HtmlElementTest.java
│ ├── InscriptisTest.java
│ ├── LineTest.java
│ └── TableCellTest.java
└── resources
└── snippets
├── br-in-table.html
├── br-in-table.txt
├── br-in-table2.html
├── br-li.html
├── br-li.txt
├── br.html
├── br.txt
├── direct-enumeration.html
├── direct-enumeration.txt
├── empty-table.html
├── empty-table.txt
├── enumerations.html
├── enumerations.txt
├── invalid-table.html
├── invalid-table.txt
├── invalid-table2.html
├── invalid-table2.txt
├── invisible.html
├── invisible.txt
├── invisible2.html
├── invisible2.txt
├── nested-table.html
├── nested-table.txt
├── p-br.html
├── p-br.txt
├── pre.html
├── pre.txt
├── table-alignment.html
├── table-alignment.txt
├── table-in-table.html
├── table-in-table.txt
├── table-itemize.html
├── table-itemize.txt
├── table-pre.html
├── table-pre.txt
├── table.html
├── table.txt
├── td-only-table.html
├── td-only-table.txt
├── tr-only-table.html
├── tr-only-table.txt
├── whitespace.html
├── whitespace.txt
├── wikipedia-code.html
├── wikipedia-code.txt
├── wikipedia-enumeration.html
├── wikipedia-enumeration.txt
├── wikipedia-table.html
└── wikipedia-table.txt
/.github/workflows/maven-build.yml:
--------------------------------------------------------------------------------
1 | name: Maven Build
2 |
3 | on: [push]
4 |
5 | jobs:
6 | build:
7 | runs-on: ubuntu-latest
8 |
9 | steps:
10 | - uses: actions/checkout@v2
11 | - name: Set up JDK 8
12 | uses: actions/setup-java@v2
13 | with:
14 | java-version: '8'
15 | distribution: 'adopt'
16 | - name: Build with Maven
17 | run: mvn --batch-mode --update-snapshots verify
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.classpath
2 | /.project
3 | /.settings/
4 | /target/
--------------------------------------------------------------------------------
/.mvn/wrapper/maven-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/x28/inscriptis-java/639e1661e353337a7c871f4ac4d4460c7317ac64/.mvn/wrapper/maven-wrapper.jar
--------------------------------------------------------------------------------
/.mvn/wrapper/maven-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.6.3/apache-maven-3.6.3-bin.zip
2 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | https://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | https://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
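1 | [![Maven Build](https://github.com/x28/inscriptis-java/actions/workflows/maven-build.yml/badge.svg)](https://github.com/x28/inscriptis-java/actions/workflows/maven-build.yml)
2 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/ch.x28.inscriptis/inscriptis/badge.svg)](https://maven-badges.herokuapp.com/maven-central/ch.x28.inscriptis/inscriptis)
3 | [![Javadoc](https://javadoc.io/badge2/ch.x28.inscriptis/inscriptis/javadoc.svg)](https://javadoc.io/doc/ch.x28.inscriptis/inscriptis)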
4 |
5 | # inscriptis - HTML to text conversion library for Java
6 |
7 | A Java-based HTML to text conversion library with support for nested tables and a subset of CSS. Please take a look at the [Rendering document](https://github.com/weblyzard/inscriptis/blob/master/RENDERING.md) for a demonstration of Inscriptis conversion quality.
8 |
9 | This is a Java port of [inscriptis for Python](https://github.com/weblyzard/inscriptis).
10 |
11 | ## Getting Started
12 |
13 | Here is a quick teaser of an application using inscriptis for Java:
14 |
15 | ```java
16 | package example;
17 |
18 | import org.jsoup.Jsoup;
19 | import org.jsoup.helper.W3CDom;
20 | import org.w3c.dom.Document;
21 |
22 | import ch.x28.inscriptis.Inscriptis;
23 |
24 | public class Example {
25 |
26 | public static void main(String[] args) {
27 |
28 | String htmlContent = "<h1>Hello World!</h1>";
29 |
30 | // use jsoup to parse HTML and convert it to W3C Document (https://jsoup.org)
31 | Document document = W3CDom.convert(Jsoup.parse(htmlContent));
32 |
33 | Inscriptis inscriptis = new Inscriptis(document);
34 | String text = inscriptis.getText();
35 |
36 | System.out.println(text); // Hello World!
37 | }
38 | }
39 | ```
40 |
41 | ## Maven configuration
42 |
43 | Add the Maven dependency:
44 |
45 | ```xml
46 | <dependency>
47 |     <groupId>ch.x28.inscriptis</groupId>
48 |     <artifactId>inscriptis</artifactId>
49 |     <version>1.0</version>
50 | </dependency>
51 | ```
52 |
53 | ## HTML parser
54 |
55 | inscriptis operates on a W3C document (`org.w3c.dom.Document`), so you are free to choose any HTML parser that can produce one. The following parsers can:
56 |
57 | ### jsoup
58 | https://jsoup.org/
59 |
60 | ### nu-validator HTML Parser
61 | https://mvnrepository.com/artifact/nu.validator/htmlparser
62 |
63 | ## License
64 |
65 | inscriptis for Java is open source software released under the Apache License, Version 2.0.
66 |
--------------------------------------------------------------------------------
/mvnw:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # ----------------------------------------------------------------------------
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | # ----------------------------------------------------------------------------
20 |
21 | # ----------------------------------------------------------------------------
22 | # Maven Start Up Batch script
23 | #
24 | # Required ENV vars:
25 | # ------------------
26 | # JAVA_HOME - location of a JDK home dir
27 | #
28 | # Optional ENV vars
29 | # -----------------
30 | # M2_HOME - location of maven2's installed home dir
31 | # MAVEN_OPTS - parameters passed to the Java VM when running Maven
32 | # e.g. to debug Maven itself, use
33 | # set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
34 | # MAVEN_SKIP_RC - flag to disable loading of mavenrc files
35 | # ----------------------------------------------------------------------------
36 |
37 | if [ -z "$MAVEN_SKIP_RC" ] ; then
38 |
39 | if [ -f /etc/mavenrc ] ; then
40 | . /etc/mavenrc
41 | fi
42 |
43 | if [ -f "$HOME/.mavenrc" ] ; then
44 | . "$HOME/.mavenrc"
45 | fi
46 |
47 | fi
48 |
49 | # OS specific support. $var _must_ be set to either true or false.
50 | cygwin=false;
51 | darwin=false;
52 | mingw=false
53 | case "`uname`" in
54 | CYGWIN*) cygwin=true ;;
55 | MINGW*) mingw=true;;
56 | Darwin*) darwin=true
57 | # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home
58 | # See https://developer.apple.com/library/mac/qa/qa1170/_index.html
59 | if [ -z "$JAVA_HOME" ]; then
60 | if [ -x "/usr/libexec/java_home" ]; then
61 | export JAVA_HOME="`/usr/libexec/java_home`"
62 | else
63 | export JAVA_HOME="/Library/Java/Home"
64 | fi
65 | fi
66 | ;;
67 | esac
68 |
69 | if [ -z "$JAVA_HOME" ] ; then
70 | if [ -r /etc/gentoo-release ] ; then
71 | JAVA_HOME=`java-config --jre-home`
72 | fi
73 | fi
74 |
75 | if [ -z "$M2_HOME" ] ; then
76 | ## resolve links - $0 may be a link to maven's home
77 | PRG="$0"
78 |
79 | # need this for relative symlinks
80 | while [ -h "$PRG" ] ; do
81 | ls=`ls -ld "$PRG"`
82 | link=`expr "$ls" : '.*-> \(.*\)$'`
83 | if expr "$link" : '/.*' > /dev/null; then
84 | PRG="$link"
85 | else
86 | PRG="`dirname "$PRG"`/$link"
87 | fi
88 | done
89 |
90 | saveddir=`pwd`
91 |
92 | M2_HOME=`dirname "$PRG"`/..
93 |
94 | # make it fully qualified
95 | M2_HOME=`cd "$M2_HOME" && pwd`
96 |
97 | cd "$saveddir"
98 | # echo Using m2 at $M2_HOME
99 | fi
100 |
101 | # For Cygwin, ensure paths are in UNIX format before anything is touched
102 | if $cygwin ; then
103 | [ -n "$M2_HOME" ] &&
104 | M2_HOME=`cygpath --unix "$M2_HOME"`
105 | [ -n "$JAVA_HOME" ] &&
106 | JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
107 | [ -n "$CLASSPATH" ] &&
108 | CLASSPATH=`cygpath --path --unix "$CLASSPATH"`
109 | fi
110 |
111 | # For Mingw, ensure paths are in UNIX format before anything is touched
112 | if $mingw ; then
113 | [ -n "$M2_HOME" ] &&
114 | M2_HOME="`(cd "$M2_HOME"; pwd)`"
115 | [ -n "$JAVA_HOME" ] &&
116 | JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`"
117 | fi
118 |
119 | if [ -z "$JAVA_HOME" ]; then
120 | javaExecutable="`which javac`"
121 | if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then
122 | # readlink(1) is not available as standard on Solaris 10.
123 | readLink=`which readlink`
124 | if [ ! `expr "$readLink" : '\([^ ]*\)'` = "no" ]; then
125 | if $darwin ; then
126 | javaHome="`dirname \"$javaExecutable\"`"
127 | javaExecutable="`cd \"$javaHome\" && pwd -P`/javac"
128 | else
129 | javaExecutable="`readlink -f \"$javaExecutable\"`"
130 | fi
131 | javaHome="`dirname \"$javaExecutable\"`"
132 | javaHome=`expr "$javaHome" : '\(.*\)/bin'`
133 | JAVA_HOME="$javaHome"
134 | export JAVA_HOME
135 | fi
136 | fi
137 | fi
138 |
139 | if [ -z "$JAVACMD" ] ; then
140 | if [ -n "$JAVA_HOME" ] ; then
141 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
142 | # IBM's JDK on AIX uses strange locations for the executables
143 | JAVACMD="$JAVA_HOME/jre/sh/java"
144 | else
145 | JAVACMD="$JAVA_HOME/bin/java"
146 | fi
147 | else
148 | JAVACMD="`which java`"
149 | fi
150 | fi
151 |
152 | if [ ! -x "$JAVACMD" ] ; then
153 | echo "Error: JAVA_HOME is not defined correctly." >&2
154 | echo " We cannot execute $JAVACMD" >&2
155 | exit 1
156 | fi
157 |
158 | if [ -z "$JAVA_HOME" ] ; then
159 | echo "Warning: JAVA_HOME environment variable is not set."
160 | fi
161 |
162 | CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher
163 |
164 | # Walks upward from the directory given as $1 towards the filesystem root ('/') and
165 | # prints the first directory that contains a .mvn subdirectory; prints $1 itself if none is found.
166 | find_maven_basedir() {
167 |
168 | if [ -z "$1" ]
169 | then
170 | echo "Path not specified to find_maven_basedir"  # note: written to stdout, not stderr
171 | return 1
172 | fi
173 |
174 | basedir="$1"  # fallback result when no ancestor contains .mvn
175 | wdir="$1"  # directory currently being inspected
176 | while [ "$wdir" != '/' ] ; do  # '/' itself is never inspected
177 | if [ -d "$wdir"/.mvn ] ; then
178 | basedir=$wdir
179 | break
180 | fi
181 | # workaround for JBEAP-8937 (on Solaris 10/Sparc): only step up when $wdir is a directory
182 | if [ -d "${wdir}" ]; then
183 | wdir=`cd "$wdir/.."; pwd`  # parent directory, as reported by pwd
184 | fi
185 | # end of workaround
186 | done
187 | echo "${basedir}"  # the result is returned to the caller via stdout
188 | }
189 |
190 | # Prints the content of the file $1 as a single line; prints nothing if $1 is not a regular file.
191 | concat_lines() {
192 | if [ -f "$1" ]; then
193 | echo "$(tr -s '\n' ' ' < "$1")"  # newlines become spaces; repeated spaces are squeezed into one
194 | fi
195 | }
196 |
197 | BASE_DIR=`find_maven_basedir "$(pwd)"`
198 | if [ -z "$BASE_DIR" ]; then
199 | exit 1;
200 | fi
201 |
202 | ##########################################################################################
203 | # Extension to allow automatically downloading the maven-wrapper.jar from Maven-central
204 | # This allows using the maven wrapper in projects that prohibit checking in binary data.
205 | ##########################################################################################
206 | if [ -r "$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" ]; then  # wrapper jar is readable: nothing to download
207 | if [ "$MVNW_VERBOSE" = true ]; then
208 | echo "Found .mvn/wrapper/maven-wrapper.jar"
209 | fi
210 | else
211 | if [ "$MVNW_VERBOSE" = true ]; then
212 | echo "Couldn't find .mvn/wrapper/maven-wrapper.jar, downloading it ..."
213 | fi
214 | if [ -n "$MVNW_REPOURL" ]; then  # a custom repository overrides Maven Central
215 | jarUrl="$MVNW_REPOURL/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar"
216 | else
217 | jarUrl="https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar"
218 | fi
219 | while IFS="=" read key value; do  # a wrapperUrl entry in the properties file overrides both defaults
220 | case "$key" in (wrapperUrl) jarUrl="$value"; break ;;
221 | esac
222 | done < "$BASE_DIR/.mvn/wrapper/maven-wrapper.properties"
223 | if [ "$MVNW_VERBOSE" = true ]; then
224 | echo "Downloading from: $jarUrl"
225 | fi
226 | wrapperJarPath="$BASE_DIR/.mvn/wrapper/maven-wrapper.jar"
227 | if $cygwin; then
228 | wrapperJarPath=`cygpath --path --windows "$wrapperJarPath"`
229 | fi
230 |
231 | if command -v wget > /dev/null; then
232 | if [ "$MVNW_VERBOSE" = true ]; then
233 | echo "Found wget ... using wget"
234 | fi
235 | if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then
236 | wget "$jarUrl" -O "$wrapperJarPath"
237 | else
238 | wget --http-user="$MVNW_USERNAME" --http-password="$MVNW_PASSWORD" "$jarUrl" -O "$wrapperJarPath"  # quoted: credentials may contain spaces or glob characters
239 | fi
240 | elif command -v curl > /dev/null; then
241 | if [ "$MVNW_VERBOSE" = true ]; then
242 | echo "Found curl ... using curl"
243 | fi
244 | if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then
245 | curl -o "$wrapperJarPath" "$jarUrl" -f
246 | else
247 | curl --user "$MVNW_USERNAME:$MVNW_PASSWORD" -o "$wrapperJarPath" "$jarUrl" -f  # quoted: credentials may contain spaces or glob characters
248 | fi
249 |
250 | else
251 | if [ "$MVNW_VERBOSE" = true ]; then
252 | echo "Falling back to using Java to download"
253 | fi
254 | javaClass="$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.java"
255 | # For Cygwin, switch paths to Windows format before running javac
256 | if $cygwin; then
257 | javaClass=`cygpath --path --windows "$javaClass"`
258 | fi
259 | if [ -e "$javaClass" ]; then
260 | if [ ! -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then
261 | if [ "$MVNW_VERBOSE" = true ]; then
262 | echo " - Compiling MavenWrapperDownloader.java ..."
263 | fi
264 | # Compiling the Java class
265 | ("$JAVA_HOME/bin/javac" "$javaClass")
266 | fi
267 | if [ -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then
268 | # Running the downloader
269 | if [ "$MVNW_VERBOSE" = true ]; then
270 | echo " - Running MavenWrapperDownloader.java ..."
271 | fi
272 | ("$JAVA_HOME/bin/java" -cp .mvn/wrapper MavenWrapperDownloader "${MAVEN_BASEDIR:-$BASE_DIR}")  # MAVEN_PROJECTBASEDIR is not assigned yet at this point; pass the value it is given later
273 | fi
274 | fi
275 | fi
276 | fi
277 | ##########################################################################################
278 | # End of extension
279 | ##########################################################################################
280 |
281 | export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"}
282 | if [ "$MVNW_VERBOSE" = true ]; then
283 | echo $MAVEN_PROJECTBASEDIR
284 | fi
285 | MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS"
286 |
287 | # For Cygwin, switch paths to Windows format before running java
288 | if $cygwin; then
289 | [ -n "$M2_HOME" ] &&
290 | M2_HOME=`cygpath --path --windows "$M2_HOME"`
291 | [ -n "$JAVA_HOME" ] &&
292 | JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"`
293 | [ -n "$CLASSPATH" ] &&
294 | CLASSPATH=`cygpath --path --windows "$CLASSPATH"`
295 | [ -n "$MAVEN_PROJECTBASEDIR" ] &&
296 | MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"`
297 | fi
298 |
299 | # Provide a "standardized" way to retrieve the CLI args that will
300 | # work with both Windows and non-Windows executions.
301 | MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $@"
302 | export MAVEN_CMD_LINE_ARGS
303 |
304 | WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
305 |
306 | exec "$JAVACMD" \
307 | $MAVEN_OPTS \
308 | -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \
309 | "-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \
310 | ${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@"
311 |
--------------------------------------------------------------------------------
/mvnw.cmd:
--------------------------------------------------------------------------------
1 | @REM ----------------------------------------------------------------------------
2 | @REM Licensed to the Apache Software Foundation (ASF) under one
3 | @REM or more contributor license agreements. See the NOTICE file
4 | @REM distributed with this work for additional information
5 | @REM regarding copyright ownership. The ASF licenses this file
6 | @REM to you under the Apache License, Version 2.0 (the
7 | @REM "License"); you may not use this file except in compliance
8 | @REM with the License. You may obtain a copy of the License at
9 | @REM
10 | @REM http://www.apache.org/licenses/LICENSE-2.0
11 | @REM
12 | @REM Unless required by applicable law or agreed to in writing,
13 | @REM software distributed under the License is distributed on an
14 | @REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | @REM KIND, either express or implied. See the License for the
16 | @REM specific language governing permissions and limitations
17 | @REM under the License.
18 | @REM ----------------------------------------------------------------------------
19 |
20 | @REM ----------------------------------------------------------------------------
21 | @REM Maven Start Up Batch script
22 | @REM
23 | @REM Required ENV vars:
24 | @REM JAVA_HOME - location of a JDK home dir
25 | @REM
26 | @REM Optional ENV vars
27 | @REM M2_HOME - location of maven2's installed home dir
28 | @REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands
29 | @REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a keystroke before ending
30 | @REM MAVEN_OPTS - parameters passed to the Java VM when running Maven
31 | @REM e.g. to debug Maven itself, use
32 | @REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
33 | @REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files
34 | @REM ----------------------------------------------------------------------------
35 |
36 | @REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on'
37 | @echo off
38 | @REM set title of command window
39 | title %0
40 | @REM enable echoing by setting MAVEN_BATCH_ECHO to 'on'
41 | @if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO%
42 |
43 | @REM set %HOME% to equivalent of $HOME
44 | if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%")
45 |
46 | @REM Execute a user defined script before this one
47 | if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre
48 | @REM check for pre script, once with legacy .bat ending and once with .cmd ending
49 | if exist "%HOME%\mavenrc_pre.bat" call "%HOME%\mavenrc_pre.bat"
50 | if exist "%HOME%\mavenrc_pre.cmd" call "%HOME%\mavenrc_pre.cmd"
51 | :skipRcPre
52 |
53 | @setlocal
54 |
55 | set ERROR_CODE=0
56 |
57 | @REM To isolate internal variables from possible post scripts, we use another setlocal
58 | @setlocal
59 |
60 | @REM ==== START VALIDATION ====
61 | if not "%JAVA_HOME%" == "" goto OkJHome
62 |
63 | echo.
64 | echo Error: JAVA_HOME not found in your environment. >&2
65 | echo Please set the JAVA_HOME variable in your environment to match the >&2
66 | echo location of your Java installation. >&2
67 | echo.
68 | goto error
69 |
70 | :OkJHome
71 | if exist "%JAVA_HOME%\bin\java.exe" goto init
72 |
73 | echo.
74 | echo Error: JAVA_HOME is set to an invalid directory. >&2
75 | echo JAVA_HOME = "%JAVA_HOME%" >&2
76 | echo Please set the JAVA_HOME variable in your environment to match the >&2
77 | echo location of your Java installation. >&2
78 | echo.
79 | goto error
80 |
81 | @REM ==== END VALIDATION ====
82 |
83 | :init
84 |
85 | @REM Find the project base dir, i.e. the directory that contains the folder ".mvn".
86 | @REM Fallback to current working directory if not found.
87 |
88 | set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR%
89 | IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir
90 |
91 | set EXEC_DIR=%CD%
92 | set WDIR=%EXEC_DIR%
93 | :findBaseDir
94 | IF EXIST "%WDIR%"\.mvn goto baseDirFound
95 | cd ..
96 | IF "%WDIR%"=="%CD%" goto baseDirNotFound
97 | set WDIR=%CD%
98 | goto findBaseDir
99 |
100 | :baseDirFound
101 | set MAVEN_PROJECTBASEDIR=%WDIR%
102 | cd "%EXEC_DIR%"
103 | goto endDetectBaseDir
104 |
105 | :baseDirNotFound
106 | set MAVEN_PROJECTBASEDIR=%EXEC_DIR%
107 | cd "%EXEC_DIR%"
108 |
109 | :endDetectBaseDir
110 |
111 | IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig
112 |
113 | @setlocal EnableExtensions EnableDelayedExpansion
114 | for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a
115 | @endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS%
116 |
117 | :endReadAdditionalConfig
118 |
119 | SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe"
120 | set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar"
121 | set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
122 |
123 | set DOWNLOAD_URL="https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar"
124 | @REM usebackq makes FOR /F read the quoted path as a file; without it the path is parsed as a literal string and wrapperUrl is never picked up.
125 | FOR /F "usebackq tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO (
126 |     IF "%%A"=="wrapperUrl" SET DOWNLOAD_URL=%%B
127 | )
128 |
129 | @REM Extension to allow automatically downloading the maven-wrapper.jar from Maven-central
130 | @REM This allows using the maven wrapper in projects that prohibit checking in binary data.
131 | if exist %WRAPPER_JAR% (
132 | if "%MVNW_VERBOSE%" == "true" (
133 | echo Found %WRAPPER_JAR%
134 | )
135 | ) else (
136 | if not "%MVNW_REPOURL%" == "" (
137 | SET DOWNLOAD_URL="%MVNW_REPOURL%/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar"
138 | )
139 | if "%MVNW_VERBOSE%" == "true" (
140 | echo Couldn't find %WRAPPER_JAR%, downloading it ...
141 | echo Downloading from: %DOWNLOAD_URL%
142 | )
143 |
144 | powershell -Command "&{"^
145 | "$webclient = new-object System.Net.WebClient;"^
146 | "if (-not ([string]::IsNullOrEmpty('%MVNW_USERNAME%') -and [string]::IsNullOrEmpty('%MVNW_PASSWORD%'))) {"^
147 | "$webclient.Credentials = new-object System.Net.NetworkCredential('%MVNW_USERNAME%', '%MVNW_PASSWORD%');"^
148 | "}"^
149 | "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; $webclient.DownloadFile('%DOWNLOAD_URL%', '%WRAPPER_JAR%')"^
150 | "}"
151 | if "%MVNW_VERBOSE%" == "true" (
152 | echo Finished downloading %WRAPPER_JAR%
153 | )
154 | )
155 | @REM End of extension
156 |
157 | @REM Provide a "standardized" way to retrieve the CLI args that will
158 | @REM work with both Windows and non-Windows executions.
159 | set MAVEN_CMD_LINE_ARGS=%*
160 |
161 | %MAVEN_JAVA_EXE% %JVM_CONFIG_MAVEN_PROPS% %MAVEN_OPTS% %MAVEN_DEBUG_OPTS% -classpath %WRAPPER_JAR% "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" %WRAPPER_LAUNCHER% %MAVEN_CONFIG% %*
162 | if ERRORLEVEL 1 goto error
163 | goto end
164 |
165 | :error
166 | set ERROR_CODE=1
167 |
168 | :end
169 | @endlocal & set ERROR_CODE=%ERROR_CODE%
170 |
171 | if not "%MAVEN_SKIP_RC%" == "" goto skipRcPost
172 | @REM check for post script, once with legacy .bat ending and once with .cmd ending
173 | if exist "%HOME%\mavenrc_post.bat" call "%HOME%\mavenrc_post.bat"
174 | if exist "%HOME%\mavenrc_post.cmd" call "%HOME%\mavenrc_post.cmd"
175 | :skipRcPost
176 |
177 | @REM pause the script if MAVEN_BATCH_PAUSE is set to 'on'
178 | if "%MAVEN_BATCH_PAUSE%" == "on" pause
179 |
180 | if "%MAVEN_TERMINATE_CMD%" == "on" exit %ERROR_CODE%
181 |
182 | exit /B %ERROR_CODE%
183 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | 4.0.0
5 | ch.x28.inscriptis
6 | inscriptis
7 | 1.1-SNAPSHOT
8 | jar
9 |
10 | inscriptis for Java
11 | A Java-based HTML to text conversion library with support for nested tables and a subset of CSS.
12 | https://github.com/x28/inscriptis-java
13 |
14 |
15 |
16 | Apache License, Version 2.0
17 | https://www.apache.org/licenses/LICENSE-2.0
18 | repo
19 |
20 |
21 |
22 |
23 | x28 AG
24 | https://www.x28.ch
25 |
26 |
27 |
28 |
29 | sw
30 | Sascha Wolski
31 | sascha.wolski at x28.ch
32 | x28 AG
33 | https://www.x28.ch
34 |
35 | Project lead
36 |
37 | +1
38 |
39 |
40 | mh
41 | Matthias Hewelt
42 | matthias.hewelt at x28.ch
43 | x28 AG
44 | https://www.x28.ch
45 |
46 | Project lead
47 |
48 | +1
49 |
50 |
51 |
52 |
53 | https://github.com/x28/inscriptis-java
54 | scm:git:git://github.com/x28/inscriptis-java.git
55 | scm:git:ssh://github.com/x28/inscriptis-java.git
56 |
57 |
58 |
59 | UTF-8
60 | 1.8
61 | 1.8
62 |
63 |
64 |
65 |
66 | org.junit.jupiter
67 | junit-jupiter-api
68 | 5.7.0
69 | test
70 |
71 |
72 |
73 | org.assertj
74 | assertj-core
75 | 3.18.1
76 | test
77 |
78 |
79 |
80 | org.jsoup
81 | jsoup
82 | 1.14.2
83 | test
84 |
85 |
86 |
87 |
88 |
89 | ossrh
90 | https://oss.sonatype.org/content/repositories/snapshots
91 |
92 |
93 |
94 |
95 |
96 |
97 | org.apache.maven.plugins
98 | maven-compiler-plugin
99 | 3.8.0
100 |
101 |
102 |
103 | org.apache.maven.plugins
104 | maven-source-plugin
105 | 3.2.1
106 |
107 |
108 | create-source-jar
109 |
110 | jar-no-fork
111 |
112 |
113 |
114 |
115 |
116 |
117 | org.apache.maven.plugins
118 | maven-javadoc-plugin
119 | 3.1.1
120 |
121 | UTF-8
122 | true
123 |
124 |
125 |
126 | create-javadoc-jar
127 |
128 | jar
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 | org.sonatype.plugins
137 | nexus-staging-maven-plugin
138 | 1.6.8
139 | true
140 |
141 | ossrh
142 | https://oss.sonatype.org/
143 | true
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 | release
152 |
153 |
154 |
155 |
156 |
157 | org.apache.maven.plugins
158 | maven-enforcer-plugin
159 | 1.4.1
160 |
161 |
162 | enforce-release-rules
163 |
164 | enforce
165 |
166 |
167 |
168 |
169 | [1.8,1.9)
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 | org.apache.maven.plugins
182 | maven-gpg-plugin
183 | 1.6
184 |
185 |
186 | sign-artifacts
187 | verify
188 |
189 | sign
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
--------------------------------------------------------------------------------
/src/main/java/ch/x28/inscriptis/CssParse.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 the original author or authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ch.x28.inscriptis;
17 |
18 | import java.util.Arrays;
19 | import java.util.List;
20 | import java.util.regex.Matcher;
21 | import java.util.regex.Pattern;
22 |
23 | import ch.x28.inscriptis.HtmlProperties.Display;
24 | import ch.x28.inscriptis.HtmlProperties.WhiteSpace;
25 |
26 | /**
27 | * Parses CSS specifications and translates them into the corresponding HtmlElements used by Inscriptis for rendering
28 | * HTML pages.
29 | *
30 | * @author Sascha Wolski
31 | * @author Matthias Hewelt
32 | */
33 | class CssParse {
34 |
35 |     /**
36 |      * Separates the numeric value (group 1) from the unit (group 2) of a CSS length such as {@code 2em}.
37 |      */
38 |     private static final Pattern RE_UNIT = Pattern.compile("([\\-0-9\\.]+)(\\w+)");
39 |
40 |     /**
41 |      * Units whose value is used as the em count directly; lengths in any other unit are divided by 8.
42 |      */
43 |     private static final List<String> CSS_RELATIVE_UNITS = Arrays.asList("em", "qem", "rem");
44 |
45 |     /**
46 |      * {@code white-space} values that are rendered with {@link WhiteSpace#NORMAL}.
47 |      */
48 |     private static final List<String> WHITE_SPACE_NORMAL = Arrays.asList("normal", "nowrap");
49 |
50 |     /**
51 |      * {@code white-space} values that are rendered with {@link WhiteSpace#PRE}.
52 |      */
53 |     private static final List<String> WHITE_SPACE_PRE = Arrays.asList("pre", "pre-line", "pre-wrap");
54 |
55 |     /**
56 |      * Applies the directives of an inline {@code style} attribute to a copy of the given element.
57 |      * <p>
58 |      * Only {@code display}, {@code margin-top}, {@code margin-bottom}, {@code padding-left} and
59 |      * {@code white-space} are interpreted (any {@code -webkit-} prefix is dropped first); all other
60 |      * directives, and directives without a {@code :}, are skipped.
61 |      *
62 |      * @param styleAttribute the attribute value of the given style sheet. Example: display: none
63 |      * @param htmlElement the HtmlElement to which the given style is applied; it is cloned, not modified.
64 |      *
65 |      * @return An HtmlElement that merges the given element with the style attributes specified.
66 |      */
67 |     public static HtmlElement getStyleAttribute(String styleAttribute, HtmlElement htmlElement) {
68 |
69 |         HtmlElement customHtmlElement = htmlElement.clone();
70 |
71 |         // no style to apply: the unmodified copy is the result
72 |         if (styleAttribute == null) {
73 |             return customHtmlElement;
74 |         }
75 |
76 |         // Locale.ROOT keeps the lower-casing independent of the JVM default locale (with a Turkish
77 |         // default locale the "I" of "DISPLAY" would become a dotless i and match none of the cases).
78 |         String normalizedStyle = styleAttribute.toLowerCase(java.util.Locale.ROOT);
79 |
80 |         for (String styleDirective : normalizedStyle.split(";")) {
81 |             if (!styleDirective.contains(":")) {
82 |                 continue;
83 |             }
84 |
85 |             String[] keyValuePair = StringUtils.split(styleDirective, ':', 1);
86 |             if (keyValuePair.length < 2) {
87 |                 continue;
88 |             }
89 |
90 |             String key = keyValuePair[0].trim();
91 |             String value = keyValuePair[1].trim();
92 |
93 |             String fieldName = key.replace("-webkit-", "");
94 |
95 |             switch (fieldName) {
96 |                 case "display":
97 |                     attributeDisplay(value, customHtmlElement);
98 |                     break;
99 |                 case "margin-top":
100 |                     attributeMarginTop(value, customHtmlElement);
101 |                     break;
102 |                 case "margin-bottom":
103 |                     attributeMarginBottom(value, customHtmlElement);
104 |                     break;
105 |                 case "padding-left":
106 |                     attributePaddingLeft(value, customHtmlElement);
107 |                     break;
108 |                 case "white-space":
109 |                     attributeWhiteSpace(value, customHtmlElement);
110 |                     break;
111 |                 default:
112 |                     break;
113 |             }
114 |         }
115 |
116 |         return customHtmlElement;
117 |     }
118 |
119 |     /**
120 |      * Set the display value. An element that is already hidden ({@link Display#NONE}) stays hidden;
121 |      * any value other than {@code block} or {@code none} is rendered inline.
122 |      */
123 |     private static void attributeDisplay(String value, HtmlElement htmlElement) {
124 |
125 |         if (htmlElement.getDisplay() == Display.NONE) {
126 |             return;
127 |         }
128 |
129 |         switch (value) {
130 |             case "block":
131 |                 htmlElement.setDisplay(Display.BLOCK);
132 |                 break;
133 |             case "none":
134 |                 htmlElement.setDisplay(Display.NONE);
135 |                 break;
136 |             default:
137 |                 htmlElement.setDisplay(Display.INLINE);
138 |         }
139 |     }
140 |
141 |     /**
142 |      * Sets the bottom margin for the given HTML element.
143 |      */
144 |     private static void attributeMarginBottom(String value, HtmlElement htmlElement) {
145 |         htmlElement.setMarginAfter(getEm(value));
146 |     }
147 |
148 |     /**
149 |      * Sets the top margin for the given HTML element.
150 |      */
151 |     private static void attributeMarginTop(String value, HtmlElement htmlElement) {
152 |         htmlElement.setMarginBefore(getEm(value));
153 |     }
154 |
155 |     /**
156 |      * Sets the left padding for the given HTML element.
157 |      */
158 |     private static void attributePaddingLeft(String value, HtmlElement htmlElement) {
159 |         htmlElement.setPadding(getEm(value));
160 |     }
161 |
162 |     /**
163 |      * Set the white-space value; values in neither list leave the element unchanged.
164 |      */
165 |     private static void attributeWhiteSpace(String value, HtmlElement htmlElement) {
166 |
167 |         if (WHITE_SPACE_NORMAL.contains(value)) {
168 |             htmlElement.setWhitespace(WhiteSpace.NORMAL);
169 |         } else if (WHITE_SPACE_PRE.contains(value)) {
170 |             htmlElement.setWhitespace(WhiteSpace.PRE);
171 |         }
172 |     }
173 |
174 |     /**
175 |      * Converts a CSS length into a whole number of em's.
176 |      *
177 |      * @param length the length (e.g. 2em, 2px, etc.) as specified in the CSS.
178 |      * @return the length in em's, or 0 if {@code length} contains no parsable "number + unit" pair.
179 |      */
180 |     private static int getEm(String length) {
181 |
182 |         Matcher matcher = RE_UNIT.matcher(length);
183 |         if (!matcher.find()) {
184 |             return 0;
185 |         }
186 |
187 |         float value;
188 |         try {
189 |             value = Float.parseFloat(matcher.group(1));
190 |         } catch (NumberFormatException ignored) {
191 |             // RE_UNIT also accepts sign/digit/dot runs that are not numbers (e.g. "-px" or "1.2.3em");
192 |             // such a length is handled like one that does not match at all.
193 |             return 0;
194 |         }
195 |
196 |         // em-based units are used as they are, everything else is scaled down by 8
197 |         if (CSS_RELATIVE_UNITS.contains(matcher.group(2))) {
198 |             return Math.round(value);
199 |         }
200 |
201 |         return Math.round(value / 8);
202 |     }
203 | }
169 |
--------------------------------------------------------------------------------
/src/main/java/ch/x28/inscriptis/CssProfile.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 the original author or authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ch.x28.inscriptis;
17 |
18 | import java.util.HashMap;
19 | import java.util.Map;
20 |
21 | import ch.x28.inscriptis.HtmlProperties.Display;
22 | import ch.x28.inscriptis.HtmlProperties.WhiteSpace;
23 |
24 | /**
25 | * Standard CSS profiles shipped with Inscriptis.
26 | *
27 | * @author Sascha Wolski
28 | * @author Matthias Hewelt
29 | */
30 | public class CssProfile {
31 |
32 |     /**
33 |      * This profile corresponds to the defaults used by Firefox
34 |      */
35 |     public static final CssProfile STRICT;
36 |     /**
37 |      * This profile is more suited for text analytics, since it ensures that whitespaces are inserted between
38 |      * {@code span} and {@code div} elements preventing cases where two words stick together.
39 |      */
40 |     public static final CssProfile RELAXED;
41 |
42 |     static {
43 |         Map<String, HtmlElement> strict = new HashMap<>();
44 |         strict.put("body", new HtmlElement("body", Display.INLINE, WhiteSpace.NORMAL));
45 |
46 |         // elements whose content is never rendered
47 |         for (String tag : new String[] { "head", "link", "meta", "script", "title", "style" }) {
48 |             strict.put(tag, new HtmlElement(tag, Display.NONE));
49 |         }
50 |
51 |         // block elements with marginBefore = marginAfter = 1
52 |         for (String tag : new String[] { "p", "figure", "h1", "h2", "h3", "h4", "h5", "h6" }) {
53 |             strict.put(tag, new HtmlElement(tag, Display.BLOCK, 1, 1));
54 |         }
55 |
56 |         // lists: no margins, the list containers get a padding of 4
57 |         strict.put("ul", new HtmlElement("ul", Display.BLOCK, 0, 0, 4));
58 |         strict.put("ol", new HtmlElement("ol", Display.BLOCK, 0, 0, 4));
59 |         strict.put("li", new HtmlElement("li", Display.BLOCK));
60 |
61 |         // block elements without margins or padding
62 |         for (String tag : new String[] { "address", "article", "aside", "div", "footer", "header", "hgroup",
63 |             "layer", "main", "nav", "figcaption", "blockquote" }) {
64 |             strict.put(tag, new HtmlElement(tag, Display.BLOCK));
65 |         }
66 |
67 |         strict.put("q", new HtmlElement("q", "\"", "\""));
68 |
69 |         // block elements whose whitespace is preserved
70 |         for (String tag : new String[] { "pre", "xmp", "listing", "plaintext" }) {
71 |             strict.put(tag, new HtmlElement(tag, Display.BLOCK, WhiteSpace.PRE));
72 |         }
73 |
74 |         // RELAXED starts from the STRICT entries, replaces "div" and adds "span"
75 |         Map<String, HtmlElement> relaxed = new HashMap<>(strict);
76 |         relaxed.put("div", new HtmlElement("div", Display.BLOCK, 2));
77 |         relaxed.put("span", new HtmlElement("span", Display.INLINE, " ", " ", true));
78 |
79 |         STRICT = new CssProfile(strict);
80 |         RELAXED = new CssProfile(relaxed);
81 |     }
82 |
83 |     private final Map<String, HtmlElement> settings;
84 |
85 |     private CssProfile(Map<String, HtmlElement> settings) {
86 |         this.settings = settings;
87 |     }
88 |
89 |     /**
90 |      * Looks up the element registered for a tag. Keys are lower-case tag names and are matched
91 |      * case-sensitively. The profile's own instance is returned, not a copy.
92 |      *
93 |      * @param tag the tag name to look up
94 |      * @return the element registered for {@code tag}, or {@code null} if the profile has no entry for it
95 |      */
96 |     public HtmlElement get(String tag) {
97 |         return settings.get(tag);
98 |     }
99 |
100 |     /**
101 |      * Looks up the element registered for a tag, falling back to the given default.
102 |      *
103 |      * @param tag the tag name to look up
104 |      * @param defaultElement the element to return if the profile has no entry for {@code tag}
105 |      * @return the element registered for {@code tag}, or {@code defaultElement} if there is none
106 |      */
107 |     public HtmlElement getOrDefault(String tag, HtmlElement defaultElement) {
108 |
109 |         HtmlElement htmlElement = settings.get(tag);
110 |         return htmlElement != null ? htmlElement : defaultElement;
111 |     }
112 | }
114 |
--------------------------------------------------------------------------------
/src/main/java/ch/x28/inscriptis/HtmlElement.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 the original author or authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ch.x28.inscriptis;
17 |
18 | import ch.x28.inscriptis.HtmlProperties.Display;
19 | import ch.x28.inscriptis.HtmlProperties.WhiteSpace;
20 |
21 | /**
22 | * The HtmlElement class stores the CSS properties.
23 | *
24 | * @author Sascha Wolski
25 | * @author Matthias Hewelt
26 | */
27 | class HtmlElement {
28 |
29 | /**
30 | * Name of the given HtmlElement
31 | */
32 | private String tag = "/";
33 | /**
34 | * Specifies a prefix that to insert before the tag's content.
35 | */
36 | private String prefix = "";
37 | /**
38 | * A suffix to append after the tag's content.
39 | */
40 | private String suffix = "";
41 | /**
42 | * {@link Display} strategy used for the content.
43 | */
44 | private Display display = null;
45 | /**
46 | * Vertical margin before the tag's content.
47 | */
48 | private int marginBefore = 0;
49 | /**
50 | * Vertical margin after the tag's content.
51 | */
52 | private int marginAfter = 0;
53 | /**
54 | * Horizontal padding before the tag's content.
55 | */
56 | private int padding = 0;
57 | /**
58 | * {@link WhiteSpace} handling strategy.
59 | */
60 | private WhiteSpace whitespace = null;
61 | /**
62 | * Limit printing of whitespace affixes to elements with `normal` whitepsace handling.
63 | */
64 | private boolean limitWhitespaceAffixes = false;
65 |
66 | public HtmlElement() {
67 | }
68 |
69 | public HtmlElement(String tag) {
70 | this.tag = tag;
71 | }
72 |
73 | public HtmlElement(String tag, Display display) {
74 | this.tag = tag;
75 | this.display = display;
76 | }
77 |
78 | public HtmlElement(String tag, Display display, int padding) {
79 |
80 | this.tag = tag;
81 | this.display = display;
82 | this.padding = padding;
83 | }
84 |
85 | public HtmlElement(String tag, Display display, int marginBefore, int marginAfter) {
86 | this.tag = tag;
87 | this.display = display;
88 | this.marginBefore = marginBefore;
89 | this.marginAfter = marginAfter;
90 | }
91 |
92 | public HtmlElement(String tag, Display display, int marginBefore, int marginAfter, int padding) {
93 | this.tag = tag;
94 | this.display = display;
95 | this.marginBefore = marginBefore;
96 | this.marginAfter = marginAfter;
97 | this.padding = padding;
98 | }
99 |
100 | public HtmlElement(String tag, Display display, String prefix, String suffix, boolean limitWhitespaceAffixes) {
101 |
102 | this.tag = tag;
103 | this.prefix = prefix;
104 | this.suffix = suffix;
105 | this.display = display;
106 | this.limitWhitespaceAffixes = limitWhitespaceAffixes;
107 | }
108 |
109 | public HtmlElement(String tag, Display display, WhiteSpace whitespace) {
110 | this.tag = tag;
111 | this.display = display;
112 | this.whitespace = whitespace;
113 | }
114 |
115 | public HtmlElement(
116 | String tag,
117 | Display display,
118 | WhiteSpace whitespace,
119 | String prefix,
120 | String suffix,
121 | int marginBefore,
122 | int marginAfter,
123 | int padding,
124 | boolean limitWhitespaceAffixes) {
125 |
126 | this.tag = tag;
127 | this.prefix = prefix;
128 | this.suffix = suffix;
129 | this.display = display;
130 | this.marginBefore = marginBefore;
131 | this.marginAfter = marginAfter;
132 | this.padding = padding;
133 | this.whitespace = whitespace;
134 | this.limitWhitespaceAffixes = limitWhitespaceAffixes;
135 | }
136 |
137 | public HtmlElement(String tag, String prefix, String suffix) {
138 |
139 | this.tag = tag;
140 | this.prefix = prefix;
141 | this.suffix = suffix;
142 | }
143 |
144 | /**
145 | * @return a clone of the current HtmlElement
146 | */
147 | @Override
148 | public HtmlElement clone() {
149 |
150 | return new HtmlElement(
151 | tag,
152 | display,
153 | whitespace,
154 | prefix,
155 | suffix,
156 | marginBefore,
157 | marginAfter,
158 | padding,
159 | limitWhitespaceAffixes);
160 | }
161 |
162 | public Display getDisplay() {
163 | return display;
164 | }
165 |
166 | public int getMarginAfter() {
167 | return marginAfter;
168 | }
169 |
170 | public int getMarginBefore() {
171 | return marginBefore;
172 | }
173 |
174 | public int getPadding() {
175 | return padding;
176 | }
177 |
178 | public String getPrefix() {
179 | return prefix;
180 | }
181 |
182 | /**
183 | * @param htmlElement the new HtmlElement to be applied to the current context.
184 | * @return the refined element with the context applied.
185 | */
186 | public HtmlElement getRefinedHtmlElement(HtmlElement htmlElement) {
187 |
188 | Display display = this.display == Display.NONE
189 | ? Display.NONE
190 | : htmlElement.getDisplay();
191 |
192 | WhiteSpace whiteSpace = null;
193 | if (htmlElement.getWhitespace() != null) {
194 | whiteSpace = htmlElement.getWhitespace();
195 | } else if (this.getWhitespace() != null) {
196 | whiteSpace = this.whitespace;
197 | }
198 |
199 | // do not display whitespace only affixes in Whitespace.pre areas
200 | // if `limit_whitespace_affixes` is set.
201 | String prefix = htmlElement.getPrefix();
202 | String suffix = htmlElement.getSuffix();
203 |
204 | if (htmlElement.isLimitWhitespaceAffixes() && whiteSpace == WhiteSpace.PRE) {
205 | if (StringUtils.isBlank(prefix)) {
206 | prefix = "";
207 | }
208 |
209 | if (StringUtils.isBlank(suffix)) {
210 | suffix = "";
211 | }
212 | }
213 |
214 | return new HtmlElement(
215 | htmlElement.getTag(),
216 | display,
217 | whiteSpace,
218 | prefix,
219 | suffix,
220 | htmlElement.getMarginBefore(),
221 | htmlElement.getMarginAfter(),
222 | htmlElement.getPadding(),
223 | false);
224 | }
225 |
226 | public String getSuffix() {
227 | return suffix;
228 | }
229 |
230 | public String getTag() {
231 | return tag;
232 | }
233 |
234 | public WhiteSpace getWhitespace() {
235 | return whitespace;
236 | }
237 |
238 | public boolean isLimitWhitespaceAffixes() {
239 | return limitWhitespaceAffixes;
240 | }
241 |
242 | public void setDisplay(Display display) {
243 | this.display = display;
244 | }
245 |
246 | public void setLimitWhitespaceAffixes(boolean limitWhitespaceAffixes) {
247 | this.limitWhitespaceAffixes = limitWhitespaceAffixes;
248 | }
249 |
250 | public void setMarginAfter(int marginAfter) {
251 | this.marginAfter = marginAfter;
252 | }
253 |
254 | public void setMarginBefore(int marginBefore) {
255 | this.marginBefore = marginBefore;
256 | }
257 |
258 | public void setPadding(int padding) {
259 | this.padding = padding;
260 | }
261 |
262 | public void setPrefix(String prefix) {
263 | this.prefix = prefix;
264 | }
265 |
266 | public void setSuffix(String suffix) {
267 | this.suffix = suffix;
268 | }
269 |
270 | public void setTag(String tag) {
271 | this.tag = tag;
272 | }
273 |
274 | public void setWhitespace(WhiteSpace whitespace) {
275 | this.whitespace = whitespace;
276 | }
277 |
278 | @Override
279 | public String toString() {
280 | return "HtmlElement [tag=" + tag +
281 | ", display=" + display +
282 | ", whitespace=" + whitespace +
283 | ", prefix=" + prefix +
284 | ", suffix=" + suffix +
285 | ", marginBefore=" + marginBefore +
286 | ", marginAfter=" + marginAfter +
287 | ", padding=" + padding +
288 | ", limitWhitespaceAffixes=" + limitWhitespaceAffixes + "]";
289 | }
290 |
291 | }
292 |
--------------------------------------------------------------------------------
/src/main/java/ch/x28/inscriptis/HtmlProperties.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 the original author or authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ch.x28.inscriptis;
17 |
18 | /**
19 | * @author Sascha Wolski
20 | * @author Matthias Hewelt
21 | */
22 | class HtmlProperties {
23 |
24 | /**
25 | * This enum specifies whether content will be rendered as inline, block or none (i.e. not rendered).
26 | */
27 | public enum Display {
28 | INLINE(1),
29 | BLOCK(2),
30 | NONE(3);
31 |
32 | private final int value;
33 |
34 | private Display(int value) {
35 | this.value = value;
36 | }
37 |
38 | public int getValue() {
39 | return value;
40 | }
41 | }
42 |
43 | /**
44 | * This enum specifies the horizontal alignment.
45 | */
46 | public enum HorizontalAlignment {
47 | LEFT('<'),
48 | RIGHT('>'),
49 | CENTER('^');
50 |
51 | private final char value;
52 |
53 | private HorizontalAlignment(char value) {
54 | this.value = value;
55 | }
56 |
57 | public char getValue() {
58 | return value;
59 | }
60 | }
61 |
62 | /**
63 | * This enum specifies the whitespace handling used for an HTML element as outlined in the Cascading Style Sheets
64 | * specification.
65 | *
66 | * {@code NORMAL}: sequences of whitespace are collapsed into a single one.
67 | * {@code PRE}: sequences of whitespace are preserved.
68 | */
69 | public enum WhiteSpace {
70 | NORMAL(1),
71 | PRE(3);
72 |
73 | private final int value;
74 |
75 | private WhiteSpace(int value) {
76 | this.value = value;
77 | }
78 |
79 | public int getValue() {
80 | return value;
81 | }
82 | }
83 |
84 | }
85 |
--------------------------------------------------------------------------------
/src/main/java/ch/x28/inscriptis/Inscriptis.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 the original author or authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ch.x28.inscriptis;
17 |
18 | import java.util.ArrayList;
19 | import java.util.List;
20 | import java.util.Stack;
21 | import java.util.stream.Collectors;
22 |
23 | import org.w3c.dom.Document;
24 | import org.w3c.dom.NamedNodeMap;
25 | import org.w3c.dom.Node;
26 | import org.w3c.dom.NodeList;
27 |
28 | import ch.x28.inscriptis.HtmlProperties.Display;
29 | import ch.x28.inscriptis.HtmlProperties.WhiteSpace;
30 |
31 | /**
32 | * The Inscriptis class translates a W3C document to its corresponding text representation.
33 | *
43 | *
44 | * @author Sascha Wolski
45 | * @author Matthias Hewelt
46 | */
47 | public class Inscriptis {
48 |
49 | private static final String[] UL_COUNTER = { "* ", "+ ", "o ", "- " };
50 | private static final HtmlElement DEFAULT_ELEMENT = new HtmlElement();
51 |
52 | private final ParserConfig config;
53 |
54 | private final Stack currentTag;
55 | private final Stack currentLine;
56 | private final Stack nextLine;
57 | /**
58 | * The canvases used for displaying text. cleanTextLines[0] refers to the root canvas; tables write into child
59 | * canvases that are created for every table line and merged with the root canvas at the end of a table.
60 | */
61 | private Stack> cleanTextLines;
62 |
63 | private Stack