├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── build.gradle.kts ├── gradle.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── settings.gradle.kts ├── source ├── main │ └── java │ │ └── dsk │ │ └── anotex │ │ ├── AnnotationExtractor.java │ │ ├── ConsoleRunner.java │ │ ├── Constants.java │ │ ├── core │ │ ├── AnnotatedDocument.java │ │ ├── Annotation.java │ │ ├── FileFormat.java │ │ └── package-info.java │ │ ├── exporter │ │ ├── AnnotationExporter.java │ │ ├── ExporterFactory.java │ │ ├── MarkdownExporter.java │ │ └── package-info.java │ │ ├── importer │ │ ├── AnnotationImporter.java │ │ ├── ImporterFactory.java │ │ ├── PdfAnnotationImporter.java │ │ ├── PdfTextExtractionStrategy.java │ │ └── package-info.java │ │ ├── package-info.java │ │ └── util │ │ ├── CommandLineParser.java │ │ └── package-info.java └── test │ └── java │ └── dsk │ └── anotex │ ├── AnnotationExtractorTest.java │ ├── ConsoleRunnerTest.java │ ├── TestBase.java │ ├── exporter │ └── MarkdownExporterTest.java │ └── importer │ └── PdfAnnotationImporterTest.java └── work ├── DyAnnotationExtractor ├── DyAnnotationExtractor.bat ├── documents ├── Highlight_Example_1.png └── Manual.md └── testing ├── Test_Pdf_1.pdf ├── Test_Pdf_2.pdf ├── Test_Pdf_3.pdf ├── Test_Pdf_4.pdf ├── Test_Pdf_5.pdf ├── Test_Pdf_6.pdf └── Test_Pdf_7.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | .gradle/ 2 | .idea/ 3 | build/ 4 | internal/ 5 | work/program/ 6 | work/temp/ 7 | work/testing/ 8 | work/tests/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - oraclejdk8 4 | 5 | script: 6 | - gradle dist 7 | deploy: 8 | provider: releases 9 | api_key: 10 | secure: 
"015pOfqAAtemG/IJ+A/xNZoKCe6+dbP/9ZED9YcTILmHALnwx7RMrIz+u07Q285/xLg4eX3jsFvNXa0LwLyHpI0mrnMX8sewheUeYITkwEkvnIIyBueUi80cTRFDZQsamesFtp2nILI 11 | UblpVnEKWZK50jSPOfb8L/V0Ocqd3/wYeDCrXsAhLQgfmauwxBtN7ZqmN9CSRyVllSdecNqfG8e10WM0u4z12Nj5dUCqISRgbTghtB7zIN1AJgvn6wiNSbjpnbcDAbn1ohYiOx/tVFJkv70XrebkRw 12 | R6UedQWShQ1PgeORO04+JAmuLFRGxJV5+OqSr07nGj849bGxJ489ZPdIEVBSxcDw/2gOMOWyYpRoRqUnMZdNQu8AmCc0YtT/S7bIFTqA/oWmtnxWWxqJSfJ8YF/Qv/8blqnuaYpcZwoX6sStNT21kj 13 | vSlQoz3TXdwZ2NkIntMVwnXtqRmGnmqLcvsHWyrwkfPwG46IeevMdkoeBN9STda3RQD11dh4/bDPmOFZk6ls1txMPynZsK5EGAEQjCpv9n1YFFEc8RY+mgDQICfSfF2kfH+lmCnKgJ/5AJTzVBHd+k 14 | ccTfcQiWcSRaDbQQO9VtvKVuFPH4i1c9ssqThmQakh5JFnvstC0qFwVU2z7ZeROgp6jgevTLjpLfXz4A1PEUdzHRpiWFHA=" 15 | file_glob: true 16 | file: "build/distribution/*" 17 | skip_cleanup: true 18 | on: 19 | tags 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DyAnnotationExtractor # 2 | 3 | DyAnnotationExtractor is software for extracting annotations (highlighted text and comments) from e-documents like PDF. The extracted parts can be used to build a summary/resume of the document. 4 | 5 | 6 | Note! The AI bot [ChatGPT](https://chatgpt.com/) is now able to extract highlighted text from a PDF file and export the summary into Markdown. There is no incentive to develop this project further. 7 | DyAnnotationExtractor remains usable in its current state - it can be applied in task automation tools which cannot call an AI service (because of sensitive documents or internet restrictions). 8 | 9 | 10 | ## Usage ## 11 | 12 | Imagine you have an ebook (PDF) which is 100 pages long. 
While reading the book, 13 | you **highlight** the important parts in your favorite reader: 14 | 15 | ![](work/documents/Highlight_Example_1.png) 16 | 17 | Then use the DyAnnotationExtractor tool to get just the highlighted parts. 18 | 19 | On the command line execute the following command. 20 | For Windows: 21 | ``` 22 | DyAnnotationExtractor -input "Getting Started with Ubuntu 16.04.pdf" 23 | ``` 24 | For Linux: 25 | ``` 26 | ./DyAnnotationExtractor -input "Getting Started with Ubuntu 16.04.pdf" 27 | ``` 28 | 29 | This will create a file with the same name in the same directory, with an added '.md' suffix. 30 | Note that the file name is enclosed in quotes - this is required when the file name contains spaces. 31 | 32 | Now you have an extract of the book which is not 100 but 5-6 pages. So, you can skim just the exported text instead of re-reading the entire book. 33 | 34 | ## Supported Input Formats ## 35 | 36 | - PDF (Portable Document Format) 37 | 38 | ## Supported Output Formats ## 39 | 40 | - MD (Markdown) 41 | 42 | ## Requirements ## 43 | 44 | - Java 21+. 45 | 46 | ## Download ## 47 | 48 | Get the [latest release](https://github.com/dimi2/DyAnnotationExtractor/releases/latest). 49 | 50 | End users need to download only the distribution jar. 51 | 52 | ## Installation ## 53 | 54 | Extract the downloaded archive in some local directory.
55 | Run the provided 'DyAnnotationExtractor' script to perform extraction. 56 | 57 | ## Build ## 58 | 59 | To build the project from sources, you will need [Gradle](https://gradle.org/) build tool. 60 | Go into the project home directory (PROJ_HOME) and execute command: 61 | ``` 62 | gradle 63 | ``` 64 | The result will appear in directory `PROJ_HOME/build/distribution`. This is portable distribution of the application. If you need just the library (without dependencies and start scripts), use the JAR file generated in `PROJ_HOME/build/libs` directory. 65 | 66 | -------------------------------------------------------------------------------- /build.gradle.kts: -------------------------------------------------------------------------------- 1 | /* 2 | * Project build instructions and dependencies (for Gradle). 3 | */ 4 | 5 | plugins { 6 | id("java") 7 | idea 8 | } 9 | 10 | // Project dependencies versions (in alphabetical order). 11 | val iTextPdfVersion = "9.1.0" 12 | val junitVersion = "5.12.2" 13 | val log4jVersion = "2.24.3" 14 | 15 | dependencies { 16 | implementation("com.itextpdf:kernel:$iTextPdfVersion") { 17 | exclude(group = "org.slf4j") 18 | } 19 | implementation("org.apache.logging.log4j:log4j-api:$log4jVersion") 20 | implementation("org.apache.logging.log4j:log4j-core:$log4jVersion") 21 | implementation("org.apache.logging.log4j:log4j-slf4j-impl:$log4jVersion") 22 | 23 | testImplementation("org.junit.jupiter:junit-jupiter:$junitVersion") 24 | testRuntimeOnly("org.junit.platform:junit-platform-launcher") 25 | } 26 | 27 | // Define project specific directories (intentionally use custom project structure). 
28 | layout.buildDirectory = file("build") 29 | val workDir = "$projectDir/work" 30 | val programDirName = "program" 31 | val programDir = "$workDir/$programDirName" 32 | val testsDirName = "tests" 33 | val libraryDirName = "library" 34 | val testsDir = "$workDir/$testsDirName" 35 | val distDir = "${layout.buildDirectory.get()}/distribution" 36 | val autoDocDir = "$distDir/autodoc" 37 | 38 | // Configure project source and compilation directories. 39 | sourceSets { 40 | main { 41 | java { 42 | java.setSrcDirs(listOf("source/main/java")) 43 | java.destinationDirectory.set(file(programDir)) 44 | } 45 | } 46 | test { 47 | java { 48 | java.setSrcDirs(listOf("source/test/java")) 49 | java.destinationDirectory.set(file(testsDir)) 50 | } 51 | } 52 | } 53 | 54 | // Define custom build tasks. 55 | tasks.register("checkEnv") { 56 | description = "Check the build pre-conditions" 57 | 58 | val javaVersion = JavaVersion.current() 59 | val minJavaVersion = project.java.targetCompatibility 60 | if (javaVersion < minJavaVersion) { 61 | throw GradleException("Inappropriate Java version ($javaVersion). " + 62 | "Needs ($minJavaVersion) or higher.") 63 | } 64 | 65 | val gradleVersion = GradleVersion.version(project.gradle.gradleVersion) 66 | val minGradleVersion = GradleVersion.version(project.extra["minGradleVersion"] as String) 67 | if (gradleVersion < minGradleVersion) { 68 | throw GradleException("Inappropriate Gradle version ($gradleVersion)." + 69 | " Needs ($minGradleVersion) or higher.") 70 | } 71 | } 72 | 73 | tasks.register("dist") { 74 | description = "Create project distribution" 75 | 76 | // Ensure ordered execution of dependent tasks (this is workaround for Gradle design weakness). 
77 | val tList = listOf("clean", "checkEnv", "build") 78 | .stream().map { t -> tasks[t] }.toList() 79 | for (i in 0 until tList.size - 1) { 80 | tList[i + 1].mustRunAfter(tList[i]) 81 | } 82 | dependsOn(tList) 83 | 84 | destinationDir = file(distDir) 85 | into(project.extra["APP_NAME"] as String) { 86 | duplicatesStrategy = DuplicatesStrategy.EXCLUDE 87 | from(workDir) 88 | exclude(testsDirName, "testing") 89 | into(libraryDirName) { 90 | from(project.configurations.runtimeClasspath) 91 | } 92 | includeEmptyDirs = true 93 | } 94 | } 95 | 96 | // Customize project build tasks. 97 | tasks { 98 | clean { 99 | // Clean the compilation target directories. 100 | delete(rootProject.layout.buildDirectory) 101 | delete(programDir) 102 | delete(testsDir) 103 | } 104 | 105 | build { 106 | // Check build tools versions first. 107 | dependsOn("checkEnv") 108 | } 109 | 110 | test { 111 | // Workaround for test executions from Intellij Idea IDE. 112 | useJUnitPlatform() 113 | } 114 | 115 | jar { 116 | archiveBaseName.set(project.extra["APP_NAME"] as String) 117 | manifest { 118 | attributes["Specification-Title"] = project.name 119 | attributes["Specification-Version"] = version 120 | attributes["Implementation-Vendor"] = "DSK" 121 | attributes["Main-Class"] = "dsk.anotex.ConsoleRunner" 122 | } 123 | } 124 | // Do not produce jar file for the project. 125 | //jar.get().enabled = false 126 | 127 | java { 128 | group = "dsk" 129 | version = project.extra["APP_VERSION"] as String 130 | } 131 | 132 | javadoc { 133 | setDestinationDir(file(autoDocDir)) 134 | options { 135 | this as StandardJavadocDocletOptions 136 | addStringOption("Xdoclint:none", "-quiet") 137 | } 138 | } 139 | 140 | // Force IntelliJ Idea IDE to use the same build directories as Gradle (avoid recompilation). 
141 | idea { 142 | module { 143 | outputDir = file(programDir) 144 | testOutputDir = file(testsDir) 145 | //downloadJavadoc = true 146 | //downloadSources = true 147 | } 148 | } 149 | } 150 | 151 | defaultTasks("dist") 152 | -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Project build configuration settings (for Gradle). 3 | # 4 | 5 | APP_NAME = DyAnnotationExtractor 6 | APP_VERSION = 1.5 7 | 8 | # Minimal build tools versions. 9 | minJavaVersion = 21 10 | minGradleVersion = 8.0 11 | 12 | # Additional build settings. 13 | org.gradle.warning.mode = all 14 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.13-bin.zip 4 | networkTimeout=10000 5 | validateDistributionUrl=true 6 | zipStoreBase=GRADLE_USER_HOME 7 | zipStorePath=wrapper/dists 8 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Copyright © 2015-2021 the original authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # SPDX-License-Identifier: Apache-2.0 19 | # 20 | 21 | ############################################################################## 22 | # 23 | # Gradle start up script for POSIX generated by Gradle. 24 | # 25 | # Important for running: 26 | # 27 | # (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is 28 | # noncompliant, but you have some other compliant shell such as ksh or 29 | # bash, then to run this script, type that shell name before the whole 30 | # command line, like: 31 | # 32 | # ksh Gradle 33 | # 34 | # Busybox and similar reduced shells will NOT work, because this script 35 | # requires all of these POSIX shell features: 36 | # * functions; 37 | # * expansions «$var», «${var}», «${var:-default}», «${var+SET}», 38 | # «${var#prefix}», «${var%suffix}», and «$( cmd )»; 39 | # * compound commands having a testable exit status, especially «case»; 40 | # * various built-in commands including «command», «set», and «ulimit». 41 | # 42 | # Important for patching: 43 | # 44 | # (2) This script targets any POSIX shell, so it avoids extensions provided 45 | # by Bash, Ksh, etc; in particular arrays are avoided. 46 | # 47 | # The "traditional" practice of packing multiple parameters into a 48 | # space-separated string is a well documented source of bugs and security 49 | # problems, so this is (mostly) avoided, by progressively accumulating 50 | # options in "$@", and eventually passing that to Java. 
51 | # 52 | # Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, 53 | # and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; 54 | # see the in-line comments for details. 55 | # 56 | # There are tweaks for specific operating systems such as AIX, CygWin, 57 | # Darwin, MinGW, and NonStop. 58 | # 59 | # (3) This script is generated from the Groovy template 60 | # https://github.com/gradle/gradle/blob/HEAD/platforms/jvm/plugins-application/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt 61 | # within the Gradle project. 62 | # 63 | # You can find Gradle at https://github.com/gradle/gradle/. 64 | # 65 | ############################################################################## 66 | 67 | # Attempt to set APP_HOME 68 | 69 | # Resolve links: $0 may be a link 70 | app_path=$0 71 | 72 | # Need this for daisy-chained symlinks. 73 | while 74 | APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path 75 | [ -h "$app_path" ] 76 | do 77 | ls=$( ls -ld "$app_path" ) 78 | link=${ls#*' -> '} 79 | case $link in #( 80 | /*) app_path=$link ;; #( 81 | *) app_path=$APP_HOME$link ;; 82 | esac 83 | done 84 | 85 | # This is normally unused 86 | # shellcheck disable=SC2034 87 | APP_BASE_NAME=${0##*/} 88 | # Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) 89 | APP_HOME=$( cd -P "${APP_HOME:-./}" > /dev/null && printf '%s\n' "$PWD" ) || exit 90 | 91 | # Use the maximum available, or set MAX_FD != -1 to use that value. 92 | MAX_FD=maximum 93 | 94 | warn () { 95 | echo "$*" 96 | } >&2 97 | 98 | die () { 99 | echo 100 | echo "$*" 101 | echo 102 | exit 1 103 | } >&2 104 | 105 | # OS specific support (must be 'true' or 'false'). 
106 | cygwin=false 107 | msys=false 108 | darwin=false 109 | nonstop=false 110 | case "$( uname )" in #( 111 | CYGWIN* ) cygwin=true ;; #( 112 | Darwin* ) darwin=true ;; #( 113 | MSYS* | MINGW* ) msys=true ;; #( 114 | NONSTOP* ) nonstop=true ;; 115 | esac 116 | 117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 118 | 119 | 120 | # Determine the Java command to use to start the JVM. 121 | if [ -n "$JAVA_HOME" ] ; then 122 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 123 | # IBM's JDK on AIX uses strange locations for the executables 124 | JAVACMD=$JAVA_HOME/jre/sh/java 125 | else 126 | JAVACMD=$JAVA_HOME/bin/java 127 | fi 128 | if [ ! -x "$JAVACMD" ] ; then 129 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 130 | 131 | Please set the JAVA_HOME variable in your environment to match the 132 | location of your Java installation." 133 | fi 134 | else 135 | JAVACMD=java 136 | if ! command -v java >/dev/null 2>&1 137 | then 138 | die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 139 | 140 | Please set the JAVA_HOME variable in your environment to match the 141 | location of your Java installation." 142 | fi 143 | fi 144 | 145 | # Increase the maximum file descriptors if we can. 146 | if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then 147 | case $MAX_FD in #( 148 | max*) 149 | # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. 150 | # shellcheck disable=SC2039,SC3045 151 | MAX_FD=$( ulimit -H -n ) || 152 | warn "Could not query maximum file descriptor limit" 153 | esac 154 | case $MAX_FD in #( 155 | '' | soft) :;; #( 156 | *) 157 | # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. 
158 | # shellcheck disable=SC2039,SC3045 159 | ulimit -n "$MAX_FD" || 160 | warn "Could not set maximum file descriptor limit to $MAX_FD" 161 | esac 162 | fi 163 | 164 | # Collect all arguments for the java command, stacking in reverse order: 165 | # * args from the command line 166 | # * the main class name 167 | # * -classpath 168 | # * -D...appname settings 169 | # * --module-path (only if needed) 170 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 171 | 172 | # For Cygwin or MSYS, switch paths to Windows format before running java 173 | if "$cygwin" || "$msys" ; then 174 | APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) 175 | CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) 176 | 177 | JAVACMD=$( cygpath --unix "$JAVACMD" ) 178 | 179 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 180 | for arg do 181 | if 182 | case $arg in #( 183 | -*) false ;; # don't mess with options #( 184 | /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath 185 | [ -e "$t" ] ;; #( 186 | *) false ;; 187 | esac 188 | then 189 | arg=$( cygpath --path --ignore --mixed "$arg" ) 190 | fi 191 | # Roll the args list around exactly as many times as the number of 192 | # args, so each arg winds up back in the position where it started, but 193 | # possibly modified. 194 | # 195 | # NB: a `for` loop captures its iteration list before it begins, so 196 | # changing the positional parameters here affects neither the number of 197 | # iterations, nor the values presented in `arg`. 198 | shift # remove old arg 199 | set -- "$@" "$arg" # push replacement arg 200 | done 201 | fi 202 | 203 | 204 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
205 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 206 | 207 | # Collect all arguments for the java command: 208 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments, 209 | # and any embedded shellness will be escaped. 210 | # * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be 211 | # treated as '${Hostname}' itself on the command line. 212 | 213 | set -- \ 214 | "-Dorg.gradle.appname=$APP_BASE_NAME" \ 215 | -classpath "$CLASSPATH" \ 216 | org.gradle.wrapper.GradleWrapperMain \ 217 | "$@" 218 | 219 | # Stop when "xargs" is not available. 220 | if ! command -v xargs >/dev/null 2>&1 221 | then 222 | die "xargs is not available" 223 | fi 224 | 225 | # Use "xargs" to parse quoted args. 226 | # 227 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed. 228 | # 229 | # In Bash we could simply go: 230 | # 231 | # readarray ARGS < <( xargs -n1 <<<"$var" ) && 232 | # set -- "${ARGS[@]}" "$@" 233 | # 234 | # but POSIX shell has neither arrays nor command substitution, so instead we 235 | # post-process each arg (as a line of input to sed) to backslash-escape any 236 | # character that might be a shell metacharacter, then use eval to reverse 237 | # that process (while maintaining the separation between arguments), and wrap 238 | # the whole thing up as a single "set" statement. 239 | # 240 | # This will of course break if any of these variables contains a newline or 241 | # an unmatched quote. 
242 | # 243 | 244 | eval "set -- $( 245 | printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | 246 | xargs -n1 | 247 | sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | 248 | tr '\n' ' ' 249 | )" '"$@"' 250 | 251 | exec "$JAVACMD" "$@" 252 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | @rem SPDX-License-Identifier: Apache-2.0 17 | @rem 18 | 19 | @if "%DEBUG%"=="" @echo off 20 | @rem ########################################################################## 21 | @rem 22 | @rem Gradle startup script for Windows 23 | @rem 24 | @rem ########################################################################## 25 | 26 | @rem Set local scope for the variables with windows NT shell 27 | if "%OS%"=="Windows_NT" setlocal 28 | 29 | set DIRNAME=%~dp0 30 | if "%DIRNAME%"=="" set DIRNAME=. 31 | @rem This is normally unused 32 | set APP_BASE_NAME=%~n0 33 | set APP_HOME=%DIRNAME% 34 | 35 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 36 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 37 | 38 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
39 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 40 | 41 | @rem Find java.exe 42 | if defined JAVA_HOME goto findJavaFromJavaHome 43 | 44 | set JAVA_EXE=java.exe 45 | %JAVA_EXE% -version >NUL 2>&1 46 | if %ERRORLEVEL% equ 0 goto execute 47 | 48 | echo. 1>&2 49 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2 50 | echo. 1>&2 51 | echo Please set the JAVA_HOME variable in your environment to match the 1>&2 52 | echo location of your Java installation. 1>&2 53 | 54 | goto fail 55 | 56 | :findJavaFromJavaHome 57 | set JAVA_HOME=%JAVA_HOME:"=% 58 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 59 | 60 | if exist "%JAVA_EXE%" goto execute 61 | 62 | echo. 1>&2 63 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2 64 | echo. 1>&2 65 | echo Please set the JAVA_HOME variable in your environment to match the 1>&2 66 | echo location of your Java installation. 1>&2 67 | 68 | goto fail 69 | 70 | :execute 71 | @rem Setup the command line 72 | 73 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 74 | 75 | 76 | @rem Execute Gradle 77 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 78 | 79 | :end 80 | @rem End local scope for the variables with windows NT shell 81 | if %ERRORLEVEL% equ 0 goto mainEnd 82 | 83 | :fail 84 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 85 | rem the _cmd.exe /c_ return code! 
86 | set EXIT_CODE=%ERRORLEVEL% 87 | if %EXIT_CODE% equ 0 set EXIT_CODE=1 88 | if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% 89 | exit /b %EXIT_CODE% 90 | 91 | :mainEnd 92 | if "%OS%"=="Windows_NT" endlocal 93 | 94 | :omega 95 | -------------------------------------------------------------------------------- /settings.gradle.kts: -------------------------------------------------------------------------------- 1 | /* 2 | * Main setup for the project build system - Gradle. 3 | */ 4 | rootProject.name = extra["APP_NAME"] as String 5 | 6 | dependencyResolutionManagement { 7 | repositories { 8 | // Use Maven Central for resolving dependencies. 9 | mavenCentral() 10 | } 11 | } 12 | 13 | 14 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/AnnotationExtractor.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex; 2 | 3 | import dsk.anotex.core.AnnotatedDocument; 4 | import dsk.anotex.core.FileFormat; 5 | import dsk.anotex.exporter.AnnotationExporter; 6 | import dsk.anotex.exporter.ExporterFactory; 7 | import dsk.anotex.importer.AnnotationImporter; 8 | import dsk.anotex.importer.ImporterFactory; 9 | 10 | import java.io.BufferedWriter; 11 | import java.io.File; 12 | import java.io.IOException; 13 | import java.io.OutputStreamWriter; 14 | import java.io.Writer; 15 | import java.nio.charset.StandardCharsets; 16 | import java.nio.file.Files; 17 | import java.util.Map; 18 | import java.util.TreeMap; 19 | 20 | /** 21 | * Document annotation extractor. 22 | */ 23 | public class AnnotationExtractor { 24 | protected Map formats; 25 | 26 | public AnnotationExtractor() { 27 | super(); 28 | formats = getKnownFileFormats(); 29 | } 30 | 31 | /** 32 | * Execute annotation extraction from file. 33 | * @param inputFile Input file name. 34 | * @param settings Additional export settings. 35 | * @param outputFile Output file name. If null - default will be used. 
If the output file already 36 | * exists, it will be overwritten. 37 | * @return The name of the created output file. 38 | */ 39 | public String extractAnnotations(String inputFile, Map settings, String outputFile) { 40 | // Extract the annotations. 41 | AnnotatedDocument document = readAnnotations(inputFile); 42 | 43 | // Get appropriate exporter. 44 | String sFormat = (String) settings.get(Constants.EXPORT_FORMAT); 45 | FileFormat exportFormat = FileFormat.getByName(sFormat); 46 | if (sFormat == null) { 47 | // Use the default export format. 48 | exportFormat = getDefaultExportFormat(); 49 | } 50 | AnnotationExporter exporter = ExporterFactory.createExporter(exportFormat); 51 | 52 | // Write the output. 53 | if (outputFile == null) { 54 | // Use default output file. 55 | outputFile = inputFile + exportFormat.getExtension(); 56 | } 57 | try (Writer output = getOutputWriter(outputFile)) { 58 | exporter.export(document, settings, output); 59 | } 60 | catch (IOException e) { 61 | throw new RuntimeException("Extraction error", e); 62 | } 63 | return outputFile; 64 | } 65 | 66 | /** 67 | * Read annotations from given document file. 68 | * @param fileName Document file name. 69 | * @return Document annotations. 70 | */ 71 | public AnnotatedDocument readAnnotations(String fileName) { 72 | FileFormat format = detectFileFormat(fileName); 73 | AnnotationImporter importer = ImporterFactory.createImporter(format); 74 | AnnotatedDocument document = importer.readAnnotations(fileName); 75 | postProcess(document); 76 | return document; 77 | } 78 | 79 | /** 80 | * Get the default export format. 81 | * @return Export format. 82 | */ 83 | protected FileFormat getDefaultExportFormat() { 84 | return FileFormat.MARKDOWN; 85 | } 86 | 87 | /** 88 | * Get output writer for specified input file. 89 | * @param outputFile Output file name. 90 | * @return Output writer. 
91 | */ 92 | protected Writer getOutputWriter(String outputFile) { 93 | File outFile = new File(outputFile); 94 | 95 | // Create necessary directories for the output path. 96 | File outFileDir = outFile.getParentFile(); 97 | if (outFileDir != null) { 98 | outFileDir.mkdirs(); 99 | } 100 | 101 | // Create buffered file writer (UTF-8 encoded). 102 | Writer writer; 103 | try { 104 | writer = new BufferedWriter(new OutputStreamWriter(Files.newOutputStream(outFile.toPath()), 105 | StandardCharsets.UTF_8)); 106 | } 107 | catch (IOException e) { 108 | throw new RuntimeException(e); 109 | } 110 | return writer; 111 | } 112 | 113 | /** 114 | * Detect the file format. 115 | * @param fileName Document file name. 116 | * @return Detected format, null for unknown formats. 117 | */ 118 | protected FileFormat detectFileFormat(String fileName) { 119 | // We use the file name extension to determine the format. 120 | // Reading the first few bytes (signature) from the file would provide more reliable detection. 121 | // But this one is good enough, since the associated importer will parse the file anyway and will 122 | // detect if the file format is wrong (for example, PNG file, renamed with PDF extension). 123 | String extension = getFileExtension(fileName); 124 | return formats.get(extension); 125 | } 126 | 127 | /** 128 | * Get file name extension of specified file. Example: for 'file1.ext' it will return '.ext' 129 | * @param fileName The file name. 130 | * @return File extension (in lowercase) or empty string if there is no extension. 131 | */ 132 | protected String getFileExtension(String fileName) { 133 | String ret = ""; 134 | if (fileName != null) { 135 | int idx = fileName.lastIndexOf('.'); 136 | if (idx > 0 && (idx < fileName.length() - 1)) { 137 | ret = fileName.substring(idx).toLowerCase(); 138 | } 139 | } 140 | return ret; 141 | } 142 | 143 | /** 144 | * Create mapping between file extensions and the known file formats. 145 | * @return The mapping.
146 | */ 147 | protected Map getKnownFileFormats() { 148 | TreeMap mapping = new TreeMap<>(); 149 | mapping.put(FileFormat.PDF.getExtension(), FileFormat.PDF); 150 | mapping.put(FileFormat.MARKDOWN.getExtension(), FileFormat.MARKDOWN); 151 | return mapping; 152 | } 153 | 154 | /** 155 | * Post-process annotated document. This is extension point. 156 | * @param document The annotated document. 157 | */ 158 | @SuppressWarnings("unused") 159 | protected void postProcess(AnnotatedDocument document) { 160 | } 161 | 162 | } 163 | 164 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/ConsoleRunner.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex; 2 | 3 | import dsk.anotex.util.CommandLineParser; 4 | 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | 8 | /** 9 | * Console runner for the application. 10 | */ 11 | public class ConsoleRunner { 12 | // Recognized command line arguments. 13 | public static final String ARG_INPUT = "input"; 14 | public static final String ARG_OUTPUT = "output"; 15 | public static final String ARG_HELP = "help"; 16 | 17 | /** 18 | * Execute annotation extraction from file. 19 | * @param inputFile Input file name. 20 | * @param settings Additional export settings. 21 | * @param outputFile Output file name. 22 | */ 23 | public void doExtract(String inputFile, Map settings, String outputFile) { 24 | printMessage(String.format("Reading input document: '%s'", inputFile)); 25 | String outFile = new AnnotationExtractor().extractAnnotations(inputFile, settings, outputFile); 26 | printMessage(String.format("Annotations extracted to: '%s'", outFile)); 27 | } 28 | 29 | /** 30 | * Print message to the console. 31 | * @param message The message. 32 | */ 33 | protected void printMessage(String message) { 34 | System.out.println(message); 35 | } 36 | 37 | /** 38 | * Print error message to the console. 
39 | * @param error The error message. 40 | */ 41 | protected void printError(String error) { 42 | System.err.println(error); 43 | System.err.flush(); 44 | } 45 | 46 | /** 47 | * Get the application start message. 48 | * @return Start message. 49 | */ 50 | protected String getStartMessage() { 51 | return String.format("%s (document annotation extractor)", Constants.APP_NAME); 52 | } 53 | 54 | /** 55 | * Get information about the supported command line arguments. 56 | * Override to add your application specific parameters. 57 | * @return Command line arguments description. 58 | */ 59 | protected String getHelpMessage() { 60 | return "Usage:\n" 61 | + String.format("DyAnnotationExtractor -%s -%s \n", 62 | ARG_INPUT, ARG_OUTPUT) 63 | + "where:\n" 64 | + " = input file name.\n" 65 | + " = output file name (optional).\n" 66 | + "additional arguments:\n" 67 | + String.format("-%s : Prints the supported command line arguments.\n", ARG_HELP); 68 | } 69 | 70 | /** 71 | * Execution entry point. 72 | * @param args Command line arguments. 73 | */ 74 | public static void main(String[] args) { 75 | // Start the application. 76 | ConsoleRunner runner = new ConsoleRunner(); 77 | runner.printMessage(runner.getStartMessage()); 78 | 79 | // Parse the command line. 80 | CommandLineParser parser = new CommandLineParser(args); 81 | parser.parseArguments(args); 82 | 83 | String inputFile = parser.getArgumentValue(ARG_INPUT); 84 | if ((inputFile != null)) { 85 | // Holder for additional execution settings. 86 | HashMap settings = new HashMap<>(); 87 | // Retrieve the output file name. 88 | String outputFile = parser.getArgumentValue(ARG_OUTPUT); 89 | // Execute the annotation extraction. 90 | runner.doExtract(inputFile, settings, outputFile); 91 | } 92 | else { 93 | // Print additional information. 
94 | if (parser.hasArgument(ARG_HELP)) { 95 | runner.printMessage(runner.getHelpMessage()); 96 | } 97 | else { 98 | runner.printError("Error: Invalid command line argument(s)"); 99 | runner.printMessage(runner.getHelpMessage()); 100 | } 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/Constants.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex; 2 | 3 | /** 4 | * Global application constants. 5 | */ 6 | public class Constants { 7 | public static final String APP_NAME = "DyAnnotationExtractor"; 8 | 9 | public static final String EXPORT_FORMAT = "exportFormat"; 10 | 11 | // Prevent instance creation. 12 | private Constants() { 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/core/AnnotatedDocument.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex.core; 2 | 3 | import java.io.Serializable; 4 | import java.util.LinkedList; 5 | import java.util.List; 6 | 7 | /** 8 | * Represents annotated document. It is independent of the original document format. 
9 | */ 10 | public class AnnotatedDocument implements Serializable { 11 | protected String title; 12 | protected String subject; 13 | protected String author; 14 | protected List<String> keywords; 15 | protected List<Annotation> annotations; 16 | 17 | public AnnotatedDocument() { 18 | super(); 19 | } 20 | 21 | public String getTitle() { 22 | return title; 23 | } 24 | 25 | public void setTitle(String title) { 26 | this.title = title; 27 | } 28 | 29 | public String getSubject() { 30 | return subject; 31 | } 32 | 33 | public void setSubject(String subject) { 34 | this.subject = subject; 35 | } 36 | 37 | public String getAuthor() { 38 | return author; 39 | } 40 | 41 | public void setAuthor(String author) { 42 | this.author = author; 43 | } 44 | // Never returns null: an empty list is created on first access. 45 | public List<Annotation> getAnnotations() { 46 | if (annotations == null) { 47 | annotations = new LinkedList<>(); 48 | } 49 | return annotations; 50 | } 51 | 52 | public void setAnnotations(List<Annotation> annotations) { 53 | this.annotations = annotations; 54 | } 55 | // Never returns null: an empty list is created on first access. 56 | public List<String> getKeywords() { 57 | if (keywords == null) { 58 | keywords = new LinkedList<>(); 59 | } 60 | return keywords; 61 | } 62 | 63 | public void setKeywords(List<String> keywords) { 64 | this.keywords = keywords; 65 | } 66 | 67 | @Override 68 | public String toString() { 69 | return "{" + title + '}'; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/core/Annotation.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex.core; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * Represents document annotation (highlight/comment). It is independent of the document format.
7 | */ 8 | public class Annotation implements Serializable { 9 | protected String text; 10 | 11 | public Annotation() { 12 | } 13 | 14 | public Annotation(String text) { 15 | this(); 16 | setText(text); 17 | } 18 | 19 | public String getText() { 20 | return text; 21 | } 22 | 23 | public void setText(String text) { 24 | this.text = text; 25 | } 26 | 27 | @Override 28 | public String toString() { 29 | return "{" + text + '}'; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/core/FileFormat.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex.core; 2 | 3 | /** 4 | * File format enumeration. Each format has a display name and a file name extension (with the leading dot). 5 | */ 6 | public enum FileFormat { 7 | PDF("Pdf", ".pdf"), 8 | MARKDOWN("Markdown", ".md"); 9 | // Enum constants are shared, so their state is kept immutable. 10 | private final String name; 11 | private final String extension; 12 | 13 | public String getName() { 14 | return name; 15 | } 16 | 17 | public String getExtension() { 18 | return extension; 19 | } 20 | 21 | FileFormat(String name, String fileExtension) { 22 | this.name = name; 23 | this.extension = fileExtension; 24 | } 25 | // Find a format by its display name (exact match); returns null when nothing matches. 26 | public static FileFormat getByName(String name) { 27 | FileFormat match = null; 28 | for (FileFormat v : values()) { 29 | if (v.getName().equals(name)) { 30 | match = v; 31 | break; 32 | } 33 | } // 34 | return match; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/core/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Core part of the application.
3 | */ 4 | package dsk.anotex.core; -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/exporter/AnnotationExporter.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex.exporter; 2 | 3 | import dsk.anotex.core.AnnotatedDocument; 4 | 5 | import java.io.Writer; 6 | import java.util.Map; 7 | 8 | /** 9 | * Interface for exporting annotated document to some standard format. 10 | */ 11 | public interface AnnotationExporter { 12 | 13 | /** 14 | * Export annotated document. 15 | * @param document The document to be converted. 16 | * @param context Conversion context (additional export settings). 17 | * @param output Stream where to write the output. 18 | */ 19 | public void export(AnnotatedDocument document, Map<String, Object> context, Writer output); 20 | } 21 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/exporter/ExporterFactory.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex.exporter; 2 | 3 | import dsk.anotex.core.FileFormat; 4 | 5 | /** 6 | * Annotation exporter factory. 7 | */ 8 | public class ExporterFactory { 9 | 10 | /* 11 | * Prevent instance creation. 12 | */ 13 | private ExporterFactory() { 14 | } 15 | 16 | /** 17 | * Create annotation exporter for specified file format. 18 | * @param format Desired file format. 19 | * @return Exporter instance for this format (IllegalArgumentException is thrown for unsupported formats).
20 | */ 21 | public static AnnotationExporter createExporter(FileFormat format) { 22 | AnnotationExporter exporter; 23 | if (FileFormat.MARKDOWN == format) { 24 | exporter = new MarkdownExporter(); 25 | } 26 | else { 27 | String message = String.format("Unsupported export format '%s'", format); 28 | throw new IllegalArgumentException(message); 29 | } 30 | return exporter; 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/exporter/MarkdownExporter.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex.exporter; 2 | 3 | import dsk.anotex.core.AnnotatedDocument; 4 | import dsk.anotex.core.Annotation; 5 | 6 | import java.io.IOException; 7 | import java.io.Writer; 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | /** 12 | * Export annotated document to Markdown format. 13 | */ 14 | public class MarkdownExporter implements AnnotationExporter { 15 | 16 | @Override 17 | public void export(AnnotatedDocument document, Map<String, Object> context, Writer output) { 18 | String mdDocument = convert(document); 19 | try { 20 | output.write(mdDocument); 21 | } 22 | catch (IOException e) { 23 | throw new RuntimeException(e); 24 | } 25 | } 26 | 27 | /** 28 | * Convert annotated document to string in Markdown format: title heading, quoted subject, 29 | * keywords, then one annotation text per line. 29 | * @param document Document to convert. 30 | * @return The document as string. 31 | */ 32 | protected String convert(AnnotatedDocument document) { 33 | final String BR = System.lineSeparator(); 34 | // TODO: Use specialized Markdown library if the requirements evolve 35 | // (currently this would be overkill).
36 | StringBuilder buf = new StringBuilder(1024); 37 | if (document.getTitle() != null) { 38 | buf.append("# ").append(document.getTitle()).append(" #"); 39 | buf.append(BR); 40 | buf.append(BR); 41 | } 42 | String subject = document.getSubject(); 43 | if (subject != null) { 44 | buf.append("\"").append(subject).append("\""); 45 | buf.append(BR); 46 | } 47 | List<String> keywords = document.getKeywords(); 48 | if (!keywords.isEmpty()) { 49 | buf.append(keywords); 50 | buf.append(BR); 51 | } 52 | buf.append(BR); 53 | for (Annotation annotation : document.getAnnotations()) { 54 | buf.append(annotation.getText()); 55 | buf.append(BR); 56 | } // 57 | return buf.toString(); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/exporter/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Document annotation exporting. 3 | * Use the {@link dsk.anotex.exporter.ExporterFactory} to get appropriate exporter for given file format. 4 | */ 5 | package dsk.anotex.exporter; -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/importer/AnnotationImporter.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex.importer; 2 | 3 | import dsk.anotex.core.AnnotatedDocument; 4 | 5 | /** 6 | * Interface for importing annotations for different documents. 7 | */ 8 | public interface AnnotationImporter { 9 | 10 | /** 11 | * Read annotations from given document file. 12 | * @param fileName Document file name. 13 | * @return Document annotations.
14 | */ 15 | public AnnotatedDocument readAnnotations(String fileName); 16 | 17 | } 18 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/importer/ImporterFactory.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex.importer; 2 | 3 | import dsk.anotex.core.FileFormat; 4 | 5 | /** 6 | * Annotation importer factory. 7 | */ 8 | public class ImporterFactory { 9 | 10 | /* 11 | * Prevent instance creation. 12 | */ 13 | private ImporterFactory() { 14 | } 15 | 16 | /** 17 | * Create annotation importer for specified file format. 18 | * @param format Desired file format (only PDF is supported; any other value, including null, raises IllegalArgumentException). 19 | * @return Importer instance for this format. 20 | */ 21 | public static AnnotationImporter createImporter(FileFormat format) { 22 | AnnotationImporter importer; 23 | if (FileFormat.PDF == format) { 24 | importer = new PdfAnnotationImporter(); 25 | } 26 | else { 27 | String message = String.format("Unsupported import format '%s'", format); 28 | throw new IllegalArgumentException(message); 29 | } 30 | return importer; 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/importer/PdfAnnotationImporter.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex.importer; 2 | 3 | import com.itextpdf.kernel.geom.Rectangle; 4 | import com.itextpdf.kernel.pdf.PdfArray; 5 | import com.itextpdf.kernel.pdf.PdfDocument; 6 | import com.itextpdf.kernel.pdf.PdfDocumentInfo; 7 | import com.itextpdf.kernel.pdf.PdfName; 8 | import com.itextpdf.kernel.pdf.PdfPage; 9 | import com.itextpdf.kernel.pdf.PdfReader; 10 | import com.itextpdf.kernel.pdf.PdfString; 11 | import com.itextpdf.kernel.pdf.annot.PdfAnnotation; 12 | import com.itextpdf.kernel.pdf.annot.PdfTextMarkupAnnotation; 13 | import com.itextpdf.kernel.pdf.canvas.parser.PdfTextExtractor; 14 | import
com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter; 15 | import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredTextEventListener; 16 | import dsk.anotex.core.AnnotatedDocument; 17 | import dsk.anotex.core.Annotation; 18 | import org.apache.logging.log4j.LogManager; 19 | import org.apache.logging.log4j.Logger; 20 | 21 | import java.io.File; 22 | import java.util.Arrays; 23 | import java.util.LinkedList; 24 | import java.util.List; 25 | 26 | /** 27 | * Import annotations from PDF files. 28 | */ 29 | public class PdfAnnotationImporter implements AnnotationImporter { 30 | protected Logger log = LogManager.getLogger(this.getClass()); 31 | @Override 32 | public AnnotatedDocument readAnnotations(String fileName) { 33 | // Check the file existence. 34 | File file = new File(fileName).getAbsoluteFile(); 35 | if (!file.isFile()) { 36 | String message = String.format("File '%s' does not exist", file.getName()); 37 | throw new IllegalArgumentException(message); 38 | } 39 | 40 | // Extract the annotations, then close the document so the underlying file is released. 41 | try (PdfDocument pdfDocument = readDocument(file)) { 42 | return extractAnnotations(pdfDocument); } 43 | } 44 | 45 | /** 46 | * Read PDF document from file. The caller is responsible for closing the returned document. 47 | * @param file File name. 48 | * @return PDF document. 49 | */ 50 | protected PdfDocument readDocument(File file) { 51 | PdfDocument document; 52 | try { 53 | document = new PdfDocument(new PdfReader(file.getAbsolutePath())); 54 | } 55 | catch (Exception e) { 56 | throw new IllegalArgumentException(e); 57 | } 58 | return document; 59 | } 60 | 61 | /** 62 | * Extract annotations from given PDF document. 63 | * @param pdfDocument PDF document. 64 | * @return Extracted annotations.
65 | */ 66 | protected AnnotatedDocument extractAnnotations(PdfDocument pdfDocument) { 67 | AnnotatedDocument document = new AnnotatedDocument(); 68 | PdfDocumentInfo pdfInfo = pdfDocument.getDocumentInfo(); 69 | document.setTitle(pdfInfo.getTitle()); 70 | document.setSubject(pdfInfo.getSubject()); 71 | document.setAuthor(pdfInfo.getAuthor()); 72 | List<String> keywords = convertToKeywords(pdfInfo.getKeywords()); 73 | document.setKeywords(keywords); 74 | // Collect the convertible annotations of every page (page numbers start at 1). 75 | List<Annotation> annotations = new LinkedList<>(); 76 | for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++) { 77 | PdfPage page = pdfDocument.getPage(i); 78 | for (PdfAnnotation pdfAnnotation : page.getAnnotations()) { 79 | Annotation annotation = convertAnnotation(pdfAnnotation); 80 | if (annotation != null) { 81 | annotations.add(annotation); 82 | } 83 | } // 84 | } // 85 | document.setAnnotations(annotations); 86 | 87 | return document; 88 | } 89 | 90 | /** 91 | * Convert document annotation to independent format. 92 | * @param pdfAnnotation Annotation to be converted. 93 | * @return Converted annotation. 94 | */ 95 | protected Annotation convertAnnotation(PdfAnnotation pdfAnnotation) { 96 | String text = null; 97 | PdfString pdfText = pdfAnnotation.getContents(); 98 | if (pdfText != null) { 99 | // The text is included in the annotation content (this is configurable feature of some PDF 100 | // readers). Use that text directly. 101 | if (pdfText.getEncoding() == null) { 102 | text = pdfText.toUnicodeString(); 103 | } 104 | else { 105 | text = pdfText.getValue(); 106 | } 107 | } 108 | if ((text == null) || (text.isEmpty())) { 109 | // The text is not included in the annotation content - extract from highlighted text.
110 | if (PdfName.Highlight.equals(pdfAnnotation.getSubtype())) { 111 | PdfTextMarkupAnnotation annotation = (PdfTextMarkupAnnotation) pdfAnnotation; 112 | PdfArray textCoordinates = annotation.getRectangle(); 113 | Rectangle highlightedArea = textCoordinates.toRectangle(); 114 | log.debug("Rectangle coordinates: {}", annotation.getRectangle()); 115 | PdfTextExtractionStrategy strategy = new PdfTextExtractionStrategy(highlightedArea); 116 | FilteredTextEventListener textFilter = new FilteredTextEventListener( 117 | strategy, new TextRegionEventFilter(highlightedArea)); 118 | String highlightedText = PdfTextExtractor.getTextFromPage(annotation.getPage(), 119 | textFilter); 120 | log.debug("Highlighted text: {}", highlightedText); 121 | // TODO: This could be part of the extraction strategy. 122 | text = normalizeHighlightedText(highlightedText); 123 | } 124 | } 125 | 126 | Annotation annotation = null; 127 | if (text != null) { 128 | annotation = new Annotation(); 129 | text = stripUnwantedChunks(text); 130 | text = removePollutionChars(text); 131 | annotation.setText(text); 132 | } 133 | return annotation; 134 | } 135 | 136 | /** 137 | * Convert comma separated string to list of keywords. 138 | * @param sKeywords String to be converted (null or empty gives an empty list). 139 | * @return List of keywords (always a modifiable list). 140 | */ 141 | protected List<String> convertToKeywords(String sKeywords) { 142 | List<String> keywords; 143 | if ((sKeywords != null) && !sKeywords.isEmpty()) { 144 | // The string can be surrounded with double quotes. 145 | sKeywords = stripDoubleQuotes(sKeywords); 146 | // Split on comma (and trim around it). 147 | String[] words = sKeywords.split(" ?, ?"); 148 | keywords = new LinkedList<>(Arrays.asList(words)); 149 | } 150 | else { 151 | keywords = new LinkedList<>(); 152 | } 153 | return keywords; 154 | } 155 | 156 | /** 157 | * Strip double quotes enclosing string. For example: 158 | * <pre>
159 |      *     "Flower" becomes Flower
160 |      * </pre>
161 | * @param text Text to be stripped. 162 | * @return Stripped text (unchanged when it is not enclosed in a pair of double quotes). 163 | */ 164 | protected String stripDoubleQuotes(String text) { 165 | String st = text; 166 | if (text.length() >= 2) { 167 | char dQuota = '"'; 168 | int endPos = text.length() - 1; 169 | if ((text.charAt(0) == dQuota) && (text.charAt(endPos) == dQuota)) { 170 | st = text.substring(1, endPos).trim(); 171 | } 172 | } 173 | return st; 174 | } 175 | 176 | /** 177 | * Normalize highlighted text - when retrieved from PDF renderer, it contains defects (like 178 | * additional spaces, inappropriate characters). 179 | * @param highlightedText Highlighted text. 180 | * @return Normalized text. 181 | */ 182 | protected String normalizeHighlightedText(String highlightedText) { 183 | return highlightedText.replaceAll("\\s+", " ").replaceAll("[“”]", "\""); 184 | } 185 | 186 | /** 187 | * Strip unwanted characters before or after the annotation (these chunks are a PDF library issue). 188 | * @param text The text to strip. 189 | * @return Stripped text. 190 | */ 191 | protected String stripUnwantedChunks(String text) { 192 | text = text.replaceFirst("^\\p{javaLowerCase}?[.?!]? ", "") 193 | .replaceFirst(" \\p{IsAlphabetic}?$", ""); 194 | text = stripDoubleQuotes(text); 195 | return text; 196 | } 197 | 198 | /** 199 | * Remove the pollution characters from the annotation text. These characters appear, without being 200 | * part of the original text: 201 | * <ul> 202 | * <li>Tab chars appear between words if the original text is aligned on both sides.</li> 203 | * </ul> 204 | * @param text The text to clean. 205 | * @return Cleaned text. 206 | */ 207 | protected String removePollutionChars(String text) { 208 | text = text.replaceAll("\t", " "); 209 | text = stripDoubleQuotes(text); 210 | return text; 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/importer/PdfTextExtractionStrategy.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex.importer; 2 | 3 | import com.itextpdf.kernel.geom.Rectangle; 4 | import com.itextpdf.kernel.pdf.canvas.parser.EventType; 5 | import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData; 6 | import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo; 7 | import com.itextpdf.kernel.pdf.canvas.parser.listener.CharacterRenderInfo; 8 | import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy; 9 | 10 | /** 11 | * Pdf text extraction strategy, which cuts the text chunks crossing the extraction area. 12 | * By default, IText library does not cut such text snippets, so we do it here. 13 | */ 14 | public class PdfTextExtractionStrategy extends LocationTextExtractionStrategy { 15 | protected Rectangle extractionArea; 16 | 17 | public PdfTextExtractionStrategy(Rectangle extractionArea) { 18 | super(); 19 | this.extractionArea = extractionArea; 20 | } 21 | 22 | @Override 23 | public void eventOccurred(IEventData eventData, EventType eventType) { 24 | if (EventType.RENDER_TEXT == eventType) { 25 | TextRenderInfo data = (TextRenderInfo) eventData; 26 | // Split the text snippet to chars. 27 | for (TextRenderInfo renderInfo : data.getCharacterRenderInfos()) { 28 | // Get the char rendering boundaries. 29 | Rectangle charArea = new CharacterRenderInfo(renderInfo).getBoundingBox(); 30 | if (isInsideExtractionArea(charArea)) { 31 | // Extract this char.
32 | super.eventOccurred(renderInfo, eventType); 33 | } 34 | } // 35 | } 36 | } 37 | 38 | /** 39 | * Check if the rendered text intersects the extraction area. 40 | * @param textArea Text rendering area. 41 | * @return True if the text is inside. 42 | */ 43 | protected boolean isInsideExtractionArea(Rectangle textArea) { 44 | return extractionArea.contains(textArea); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/importer/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Document annotation importing. 3 | * Use the {@link dsk.anotex.importer.ImporterFactory} to get appropriate importer for given file format. 4 | */ 5 | package dsk.anotex.importer; -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Annotation extractor application. 3 | * To start it on the command line, use the {@link dsk.anotex.ConsoleRunner}. 4 | */ 5 | package dsk.anotex; -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/util/CommandLineParser.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex.util; 2 | 3 | import java.util.LinkedHashMap; 4 | import java.util.LinkedList; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | /** 9 | * Parser for command line arguments. Expected format: 10 | *
-arg1 argValue1 -arg2 value3
11 | *

12 | * This implementation fits in a single class! There are enough Java libraries for parsing the command line. 13 | * Most of them are examples of over-engineering (but it is fun to see how much code was written just 14 | * to parse an array of strings). 15 | *

16 | */ 17 | public class CommandLineParser { 18 | protected Map args; 19 | protected List values; 20 | protected String argPrefix; 21 | 22 | /** 23 | * Create empty instance. 24 | */ 25 | public CommandLineParser() { 26 | super(); 27 | values = new LinkedList<>(); 28 | argPrefix = "-"; 29 | } 30 | 31 | /** 32 | * Constructor with specified parameters. 33 | * @param args Command line arguments. 34 | */ 35 | public CommandLineParser(String[] args) { 36 | this(); 37 | parse(args); 38 | } 39 | 40 | /** 41 | * Parse specified command line arguments. 42 | * @param arguments Command line arguments. 43 | */ 44 | public void parse(String[] arguments) { 45 | args = parseArguments(arguments); 46 | } 47 | 48 | /** 49 | * Check if specified argument was passed. 50 | * @param name Argument name. 51 | * @return True if this argument exists. 52 | */ 53 | public boolean hasArgument(String name) { 54 | return args.containsKey(name); 55 | } 56 | 57 | /** 58 | * Get argument with specified name. 59 | * @param name Argument name. 60 | * @return Argument value or null. 61 | * 62 | * @see #parse(String[]) 63 | */ 64 | public String getArgumentValue(String name) { 65 | return getArgumentValue(name, null); 66 | } 67 | 68 | /** 69 | * Get argument with specified name. 70 | * @param name Argument name. 71 | * @param defaultValue Default value. 72 | * @return Argument value or the default value. 73 | * 74 | * @see #parse(String[]) 75 | */ 76 | public String getArgumentValue(String name, String defaultValue) { 77 | String ret = args.get(name); 78 | if (ret == null) { 79 | ret = defaultValue; 80 | } 81 | return ret; 82 | } 83 | 84 | /** 85 | * Get parsed command line arguments. 86 | * @return Parsed arguments. 87 | * 88 | * @see #parse(String[]) 89 | */ 90 | @SuppressWarnings("unused") 91 | public Map getArguments() { 92 | return args; 93 | } 94 | 95 | /** 96 | * Get the command line values without the arguments. Example: 97 | *
 'command -arg1 v1 arg2' 
98 | * The result will be [v1, arg2]. 99 | * @return The value without argument name. 100 | */ 101 | @SuppressWarnings("unused") 102 | public List getValues() { 103 | return values; 104 | } 105 | 106 | /** 107 | * Parse command line arguments (parameters). Parameters which contains spaces should be 108 | * enclosed with double quotas. Double quote sign itself (if present in command value) should 109 | * be escaped with \. 110 | * @param args Command line arguments, passed to the application. 111 | * 112 | * @return Map with parsed keys and values. Or empty map (if no command line options passed). 113 | */ 114 | public Map parseArguments(String[] args) { 115 | LinkedHashMap arguments = new LinkedHashMap<>(); 116 | if ((args == null) || (args.length == 0)) { 117 | // No parameters passed. 118 | return arguments; 119 | } 120 | 121 | for (int i = 0; i < args.length; i++) { 122 | String s = args[i]; 123 | if ((s == null) || (s.isEmpty())) { 124 | // Invalid argument. Skip it. 125 | continue; 126 | } 127 | 128 | String sOption = null; 129 | String sValue = null; 130 | if (s.startsWith(argPrefix)) { 131 | // It is argument. 132 | sOption = s.substring(argPrefix.length()); 133 | if (args.length - i > 1) { 134 | // Argument value. 135 | String ss = args[i + 1]; 136 | if (!ss.startsWith(argPrefix)) { 137 | sValue = ss; 138 | i++; 139 | } 140 | } 141 | } 142 | else { 143 | // It is value. 144 | sValue = s; 145 | } 146 | 147 | // Remove value enclosing quotas (if any). 148 | final String dQuota = "\""; 149 | final String sQuota = "'"; 150 | if ((sValue != null) && (sValue.startsWith(sQuota))) { 151 | if (((sValue.startsWith(dQuota) && sValue.endsWith(dQuota))) || 152 | ((sValue.startsWith(sQuota) && sValue.endsWith(sQuota)))) { 153 | sValue = sValue.substring(1, sValue.length() - 1); 154 | } 155 | } 156 | 157 | // Add to parameter map. 
158 | if (sOption != null) { 159 | arguments.put(sOption, sValue); 160 | } 161 | if (sValue != null) { 162 | values.add(sValue); 163 | } 164 | } // 165 | 166 | return arguments; 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /source/main/java/dsk/anotex/util/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Various utilities. 3 | */ 4 | package dsk.anotex.util; -------------------------------------------------------------------------------- /source/test/java/dsk/anotex/AnnotationExtractorTest.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex; 2 | 3 | import dsk.anotex.core.AnnotatedDocument; 4 | import dsk.anotex.core.Annotation; 5 | import org.junit.jupiter.api.Assertions; 6 | import org.junit.jupiter.api.Test; 7 | 8 | import java.util.List; 9 | 10 | import static org.junit.jupiter.api.Assertions.assertEquals; 11 | 12 | public class AnnotationExtractorTest extends TestBase { 13 | 14 | @Test 15 | public void testMissingFile() { 16 | AnnotationExtractor extractor = new AnnotationExtractor(); 17 | Assertions.assertThrows(IllegalArgumentException.class, () -> { 18 | extractor.readAnnotations(resDir + "/Missing.pdf"); 19 | }); 20 | } 21 | 22 | @Test 23 | public void testUnsupportedFile() { 24 | AnnotationExtractor extractor = new AnnotationExtractor(); 25 | Assertions.assertThrows(IllegalArgumentException.class, () -> { 26 | extractor.readAnnotations(resDir + "/Test_Pdf_4.pdf"); 27 | }); 28 | } 29 | 30 | @Test 31 | public void testHighlightingOnly() { 32 | AnnotationExtractor extractor = new AnnotationExtractor(); 33 | AnnotatedDocument document = extractor.readAnnotations(resDir + "/Test_Pdf_5.pdf"); 34 | List annotations = document.getAnnotations(); 35 | Annotation annot = annotations.getFirst(); 36 | assertEquals("One Two", annot.getText()); 37 | assertEquals(1, annotations.size()); 38 | } 
39 | 40 | } 41 | -------------------------------------------------------------------------------- /source/test/java/dsk/anotex/ConsoleRunnerTest.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex; 2 | 3 | import org.junit.jupiter.api.BeforeEach; 4 | import org.junit.jupiter.api.Test; 5 | 6 | import static org.junit.jupiter.api.Assertions.assertEquals; 7 | 8 | public class ConsoleRunnerTest extends TestBase { 9 | 10 | @BeforeEach 11 | public void beforeEach() { 12 | cleanTempDirectory(); 13 | } 14 | 15 | @Test 16 | public void testExtraction1() { 17 | String inputFile = resDir + "/Test_Pdf_2.pdf"; 18 | String outputFile = tempDir + "/Test_Pdf_2.pdf.md"; 19 | ConsoleRunner.main(new String[]{"-input", inputFile, "-output", outputFile}); 20 | String outputContent = readFile(outputFile); 21 | assertEquals("94d6378bf0eacfef6ec05e6b187673ac88f2d6ba4556acba584bb031f79f4ffa", 22 | calcChecksum(outputContent)); 23 | } 24 | } -------------------------------------------------------------------------------- /source/test/java/dsk/anotex/TestBase.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex; 2 | 3 | import java.io.File; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.io.InputStreamReader; 7 | import java.io.Reader; 8 | import java.math.BigInteger; 9 | import java.nio.charset.StandardCharsets; 10 | import java.nio.file.Files; 11 | import java.security.MessageDigest; 12 | import java.security.NoSuchAlgorithmException; 13 | 14 | /** 15 | * Base functionality for unit tests. 
16 | */ 17 | public abstract class TestBase { 18 | public static final String WORK_DIR = "work"; 19 | public static final String TEMP_DIR = "temp"; 20 | protected static File workDir; 21 | protected static File tempDir; 22 | protected static String resDir; 23 | protected static MessageDigest digester; 24 | 25 | public TestBase() { 26 | super(); 27 | if (workDir == null) { 28 | workDir = setupWorkDirectory(WORK_DIR); 29 | tempDir = setupTempDirectory(TEMP_DIR); 30 | resDir = workDir + "/testing"; 31 | } 32 | } 33 | 34 | /** 35 | * Setup working directory to run the tests from. 36 | * @param dir Work directory name (relative to project root directory). Pass null to use default. 37 | * @return The work directory. 38 | */ 39 | protected File setupWorkDirectory(String dir) { 40 | File workDir; 41 | if (dir == null) { 42 | // Detect the application home directory. 43 | workDir = new File(getClass().getClassLoader().getResource(".").getFile()).getParentFile(); 44 | } 45 | else { 46 | // Try the requested directory. 47 | workDir = new File(dir).getAbsoluteFile(); 48 | if (!workDir.isDirectory()) { 49 | // Then use the current directory. 50 | workDir = new File("").getAbsoluteFile(); 51 | } 52 | } 53 | 54 | // Change the work directory (note - this will not affect ). 55 | System.setProperty("user.dir", workDir.getAbsolutePath()); 56 | return workDir; 57 | } 58 | 59 | /** 60 | * Setup temporary storage directory for tests. 61 | * @param dir Temporary directory name (relative to project root directory). Null = to use default. 62 | * @return The temp directory. 63 | */ 64 | protected File setupTempDirectory(String dir) { 65 | File tempDir; 66 | if (dir == null) { 67 | tempDir = new File(workDir, TEMP_DIR).getAbsoluteFile(); 68 | } 69 | else { 70 | tempDir = new File(dir).getAbsoluteFile(); 71 | } 72 | if (!tempDir.canWrite()) { 73 | // Not writable directory. Try the JVM temp directory. 
74 | String systemTempDir = System.getProperty("java.io.tmpdir"); 75 | tempDir = new File(systemTempDir).getAbsoluteFile(); 76 | } 77 | tempDir.mkdirs(); 78 | return tempDir; 79 | } 80 | 81 | /** 82 | * Clean the temporary directory. 83 | */ 84 | protected void cleanTempDirectory() { 85 | if (tempDir != null) { 86 | if (tempDir.isDirectory()) { 87 | File[] files = tempDir.listFiles(); 88 | if (files != null) { 89 | for (File file : files) { 90 | removeDirectory(file); 91 | } // 92 | } 93 | } 94 | } 95 | } 96 | 97 | /** 98 | * Remove specified directory with its subdirectories. 99 | * @param dir Directory name. 100 | */ 101 | protected void removeDirectory(File dir) { 102 | if (dir.isDirectory()) { 103 | File[] files = dir.listFiles(); 104 | if (files != null) { 105 | for (File file : files) { 106 | removeDirectory(file); 107 | } // 108 | } 109 | dir.delete(); 110 | } else { 111 | dir.delete(); 112 | } 113 | } 114 | 115 | /** 116 | * Read complete file into string. Works with UTF-8 encoding. 117 | * @param fileName Name of the file to read. 118 | * @return File content. 119 | */ 120 | protected static String readFile(String fileName) { 121 | // The variant with 'Paths' is not used intentionally (it ignores work directory change). 
122 | String ret = null; 123 | if (fileName != null) { 124 | StringBuilder content = new StringBuilder(); 125 | File file = new File(fileName).getAbsoluteFile(); 126 | int bufSize = (int) file.length(); 127 | if (bufSize > 0) { 128 | try { 129 | char[] buf = new char[bufSize]; 130 | Reader f = new InputStreamReader(Files.newInputStream(file.toPath()), 131 | StandardCharsets.UTF_8); 132 | int read; 133 | while ((read = f.read(buf, 0, bufSize)) != -1) { 134 | content.append(buf, 0, read); 135 | if (read < bufSize) { 136 | break; 137 | } 138 | } // 139 | f.close(); 140 | } 141 | catch (IOException e) { 142 | String message = String.format("Cannot read file '%s'", file); 143 | throw new IllegalArgumentException(message, e); 144 | } 145 | } 146 | ret = content.toString(); 147 | } 148 | return ret; 149 | } 150 | 151 | /** 152 | * Write specified string into file. Works with UTF-8 encoding. 153 | * @param fileName Desired file name. 154 | * @param fileContent File content. 155 | */ 156 | protected static void writeFile(String fileName, String fileContent) { 157 | // The variant with 'Paths' is not used intentionally (it ignores work directory change). 158 | if ((fileName != null) && (fileContent != null)) { 159 | File file = new File(fileName).getAbsoluteFile(); 160 | try { 161 | file.getParentFile().mkdirs(); 162 | FileWriter f = new FileWriter(file); 163 | f.write(new String(fileContent.getBytes(StandardCharsets.UTF_8))); 164 | f.close(); 165 | } 166 | catch (IOException e) { 167 | String message = String.format("Cannot write file '%s'", file); 168 | throw new IllegalArgumentException(message, e); 169 | } 170 | } 171 | } 172 | 173 | /** 174 | * Calculate SHA-256 checksum on given text (considering new line separator differences 175 | * between different OS). 176 | * @param text Input text. 177 | * @return Calculated checksum. 
178 | */ 179 | protected String calcChecksum(String text) { 180 | String checksum = null; 181 | if ((text != null) && (!text.isEmpty())) { 182 | if (digester == null) { 183 | try { 184 | digester = MessageDigest.getInstance("SHA-256"); 185 | } 186 | catch (NoSuchAlgorithmException e) { 187 | throw new RuntimeException(e); 188 | } 189 | } 190 | text = text.replace("\r", ""); // Unify the new line characters. 191 | digester.update(text.getBytes(StandardCharsets.UTF_8)); 192 | checksum = String.format("%x", new BigInteger(1, digester.digest())); 193 | } 194 | return checksum; 195 | } 196 | 197 | } 198 | -------------------------------------------------------------------------------- /source/test/java/dsk/anotex/exporter/MarkdownExporterTest.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex.exporter; 2 | 3 | import dsk.anotex.TestBase; 4 | import dsk.anotex.core.AnnotatedDocument; 5 | import dsk.anotex.core.Annotation; 6 | import org.junit.jupiter.api.Test; 7 | 8 | import java.io.StringWriter; 9 | import java.util.Arrays; 10 | import java.util.HashMap; 11 | 12 | import static org.junit.jupiter.api.Assertions.assertEquals; 13 | 14 | public class MarkdownExporterTest extends TestBase { 15 | 16 | @Test 17 | public void testExport1() { 18 | MarkdownExporter exporter = new MarkdownExporter(); 19 | AnnotatedDocument document = createDocument(); 20 | StringWriter output = new StringWriter(256); 21 | exporter.export(document, new HashMap<>(), output); 22 | String sResult = "# Title1 #\n" 23 | + "\n" 24 | + "\n" 25 | + "Text1\n" 26 | + "Text2\n"; 27 | String s = output.toString().replace("\r\n", "\n"); 28 | assertEquals(sResult, s); 29 | } 30 | 31 | protected AnnotatedDocument createDocument() { 32 | AnnotatedDocument document = new AnnotatedDocument(); 33 | document.setTitle("Title1"); 34 | Annotation annot1 = new Annotation("Text1"); 35 | Annotation annot2 = new Annotation("Text2"); 36 | 
document.setAnnotations(Arrays.asList(annot1, annot2)); 37 | return document; 38 | } 39 | } -------------------------------------------------------------------------------- /source/test/java/dsk/anotex/importer/PdfAnnotationImporterTest.java: -------------------------------------------------------------------------------- 1 | package dsk.anotex.importer; 2 | 3 | import dsk.anotex.TestBase; 4 | import dsk.anotex.core.AnnotatedDocument; 5 | import dsk.anotex.core.Annotation; 6 | import org.junit.jupiter.api.Test; 7 | 8 | import java.util.List; 9 | 10 | import static org.junit.jupiter.api.Assertions.assertEquals; 11 | 12 | public class PdfAnnotationImporterTest extends TestBase { 13 | 14 | @Test 15 | public void testCyrillicAnnotation() { 16 | PdfAnnotationImporter importer = new PdfAnnotationImporter(); 17 | AnnotatedDocument document = importer.readAnnotations(resDir + "/Test_Pdf_1.pdf"); 18 | List annotations = document.getAnnotations(); 19 | Annotation annot = annotations.getFirst(); 20 | assertEquals("\u041f\u0435\u0442", annot.getText()); // Пет ("five" in Cyrillic). 
21 | assertEquals(1, annotations.size()); 22 | } 23 | 24 | @Test 25 | public void testOneAnnotation() { 26 | PdfAnnotationImporter importer = new PdfAnnotationImporter(); 27 | AnnotatedDocument document = importer.readAnnotations(resDir + "/Test_Pdf_2.pdf"); 28 | assertEquals("Title2", document.getTitle()); 29 | assertEquals("Subject2", document.getSubject()); 30 | assertEquals("Author2", document.getAuthor()); 31 | List annotations = document.getAnnotations(); 32 | Annotation annot = annotations.getFirst(); 33 | assertEquals("Two", annot.getText()); 34 | assertEquals(1, annotations.size()); 35 | } 36 | 37 | @Test 38 | public void testComments() { 39 | PdfAnnotationImporter importer = new PdfAnnotationImporter(); 40 | AnnotatedDocument document = importer.readAnnotations(resDir + "/Test_Pdf_3.pdf"); 41 | List annotations = document.getAnnotations(); 42 | Annotation annot1 = annotations.get(0); 43 | assertEquals("Four", annot1.getText()); 44 | Annotation annot2 = annotations.get(1); 45 | assertEquals("Five", annot2.getText()); 46 | Annotation annot3 = annotations.get(2); 47 | assertEquals("Six", annot3.getText()); 48 | assertEquals(3, annotations.size()); 49 | } 50 | 51 | @Test 52 | public void testStripUnwantedChunks() { 53 | PdfAnnotationImporter importer = new PdfAnnotationImporter(); 54 | 55 | String res1 = importer.stripUnwantedChunks("Be them. W"); 56 | assertEquals("Be them.", res1); 57 | 58 | String res2 = importer.stripUnwantedChunks("o? 
When"); 59 | assertEquals("When", res2); 60 | 61 | String res3 = importer.stripUnwantedChunks("I can be"); 62 | assertEquals("I can be", res3); 63 | 64 | String res4 = importer.stripUnwantedChunks("\"Awesome!\""); 65 | assertEquals("Awesome!", res4); 66 | } 67 | 68 | @Test 69 | public void testHighlightingBoundaries() { 70 | PdfAnnotationImporter importer = new PdfAnnotationImporter(); 71 | AnnotatedDocument document = importer.readAnnotations(resDir + "/Test_Pdf_6.pdf"); 72 | List annotations = document.getAnnotations(); 73 | Annotation annot1 = annotations.getFirst(); 74 | assertEquals("seven eight nine ten eleven twelve thirteen fourteen fifteen", annot1.getText()); 75 | } 76 | 77 | @Test 78 | public void testHighlightingWithContent() { 79 | PdfAnnotationImporter importer = new PdfAnnotationImporter(); 80 | AnnotatedDocument document = importer.readAnnotations(resDir + "/Test_Pdf_7.pdf"); 81 | List annotations = document.getAnnotations(); 82 | Annotation annot1 = annotations.getFirst(); 83 | assertEquals("The programs that a home user needs are email, web browser, pdf file viewer, " + 84 | "video an music playback software as well as, office program including spreadsheet, " + 85 | "word processing and presentation graphics. 
Today, cloud services, " + 86 | "web calls and other social", annot1.getText()); 87 | } 88 | } -------------------------------------------------------------------------------- /work/DyAnnotationExtractor: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | java -cp "program:library/*" dsk.anotex.ConsoleRunner $1 "$2" -------------------------------------------------------------------------------- /work/DyAnnotationExtractor.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | java -cp program;library/* dsk.anotex.ConsoleRunner %1 %2 %3 %4 -------------------------------------------------------------------------------- /work/documents/Highlight_Example_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/work/documents/Highlight_Example_1.png -------------------------------------------------------------------------------- /work/documents/Manual.md: -------------------------------------------------------------------------------- 1 | # DyAnnotationExtractor # 2 | 3 | DyAnnotationExtractor is software for extracting annotations (highlighted text and comments) from e-documents like PDF. The extracted parts can be used to build a summary/resume of the document. 4 | 5 | 6 | Note! The AI bot [ChatGPT](https://chatgpt.com/) is now capable of extracting highlighted text from a PDF file and exporting the summary into Markdown. There is no incentive to develop this project further. 7 | DyAnnotationExtractor remains usable in its current state - it can be applied in task automation tools which cannot call an AI service (because of sensitive documents or internet restrictions). 8 | 9 | 10 | ## Usage ## 11 | 12 | Imagine you have an ebook (PDF) which is 100 pages long.
While reading the book, 13 | you **highlight** the important parts in your favorite reader: 14 | 15 | ![](Highlight_Example_1.png) 16 | 17 | Then use the DyAnnotationExtractor tool to get just the highlighted parts. 18 | 19 | Via the command line: 20 | ```console 21 | DyAnnotationExtractor -input "Getting Started with Ubuntu 16.04.pdf" 22 | ``` 23 | 24 | This will create a file with the same name in the same directory, with an added '.md' suffix. 25 | Note that the file name is enclosed in quotes - this is required when the file name contains spaces. 26 | 27 | Now you have an extract of the book which is not 100 but 5-6 pages. So, you can skim just the exported text instead of re-reading the entire book. 28 | 29 | ## Supported Input Formats ## 30 | 31 | - PDF (Portable Document Format) 32 | 33 | ## Supported Output Formats ## 34 | 35 | - MD (Markdown) 36 | 37 | ## Requirements ## 38 | 39 | - Java 21+. 40 | 41 | ## Download ## 42 | 43 | Get the [latest release](https://github.com/dimi2/DyAnnotationExtractor/releases/latest). 44 | 45 | 46 | End users need to download only the distribution jar. 47 | 48 | ## Installation ## 49 | 50 | Extract the downloaded archive in some local directory.
51 | Run the provided 'DyAnnotationExtractor' script to perform extraction. 52 | 53 | ## Build ## 54 | 55 | To build the project from sources, you will need the [Gradle](https://gradle.org/) build tool. 56 | Go into the project home directory (PROJ_HOME) and execute the command: 57 | 58 | ``` 59 | gradle 60 | ``` 61 | The result will appear in the directory `PROJ_HOME/build/distribution`. This is a portable distribution of the application. If you need just the library (without dependencies and start scripts), use the JAR file generated in the `PROJ_HOME/build/libs` directory. 62 | 63 | -------------------------------------------------------------------------------- /work/testing/Test_Pdf_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/work/testing/Test_Pdf_1.pdf -------------------------------------------------------------------------------- /work/testing/Test_Pdf_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/work/testing/Test_Pdf_2.pdf -------------------------------------------------------------------------------- /work/testing/Test_Pdf_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/work/testing/Test_Pdf_3.pdf -------------------------------------------------------------------------------- /work/testing/Test_Pdf_4.pdf: -------------------------------------------------------------------------------- 1 | Invalid PDF file.
-------------------------------------------------------------------------------- /work/testing/Test_Pdf_5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/work/testing/Test_Pdf_5.pdf -------------------------------------------------------------------------------- /work/testing/Test_Pdf_6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/work/testing/Test_Pdf_6.pdf -------------------------------------------------------------------------------- /work/testing/Test_Pdf_7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/work/testing/Test_Pdf_7.pdf --------------------------------------------------------------------------------