keywords;
143 | if ((sKeywords != null) && !sKeywords.isEmpty()) {
144 | // The string can be surrounded with double quotes.
145 | sKeywords = stripDoubleQuotes(sKeywords);
146 | // Split on comma (and trim around it).
147 | String[] words = sKeywords.split(" ?, ?");
148 | keywords = Arrays.asList(words);
149 | }
150 | else {
151 | keywords = new LinkedList<>();
152 | }
153 | return keywords;
154 | }
155 |
156 | /**
157 | * Strip double quotes enclosing string. For example:
158 | *
159 | * "Flower" becomes Flower
160 | *
161 | * @param text Text to be stripped.
162 | * @return Stripped text.
163 | */
164 | protected String stripDoubleQuotes(String text) {
165 | String st = text;
166 | if (!text.isEmpty()) {
167 | char dQuota = '"';
168 | int endPos = text.length() - 1;
169 | if ((text.charAt(0) == dQuota) && (text.charAt(endPos) == dQuota)) {
170 | st = text.substring(1, endPos).trim();
171 | }
172 | }
173 | return st;
174 | }
175 |
176 | /**
177 | * Normalize highlighted text - when retrieved from PDF renderer, it contains defects (like
178 | * additional spaces, inappropriate characters).
179 | * @param highlightedText Highlighted text.
180 | * @return Normalized text.
181 | */
182 | protected String normalizeHighlightedText(String highlightedText) {
183 | return highlightedText.replaceAll("\\s+", " ").replaceAll("[“”]", "\"");
184 | }
185 |
186 | /**
187 | * Strip unwanted character before or after the annotation (these chunks are PDF library issue).
188 | * @param text The text to strip.
189 | * @return Stripped text.
190 | */
191 | protected String stripUnwantedChunks(String text) {
192 | text = text.replaceFirst("^\\p{javaLowerCase}?[.?!]? ", "")
193 | .replaceFirst(" \\p{IsAlphabetic}?$", "");
194 | text = stripDoubleQuotes(text);
195 | return text;
196 | }
197 |
198 | /**
199 | * Remove the pollution characters from the annotation text. These characters appear, without being
200 | * part of the original text:
201 | *
202 | * - Tab chars appear between words if the original text it aligned on both sides.
203 | *
204 | * @param text The text to clean.
205 | * @return Cleaned text.
206 | */
207 | protected String removePollutionChars(String text) {
208 | text = text.replaceAll("\t", " ");
209 | text = stripDoubleQuotes(text);
210 | return text;
211 | }
212 | }
213 |
--------------------------------------------------------------------------------
/source/main/java/dsk/anotex/importer/PdfTextExtractionStrategy.java:
--------------------------------------------------------------------------------
1 | package dsk.anotex.importer;
2 |
3 | import com.itextpdf.kernel.geom.Rectangle;
4 | import com.itextpdf.kernel.pdf.canvas.parser.EventType;
5 | import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
6 | import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
7 | import com.itextpdf.kernel.pdf.canvas.parser.listener.CharacterRenderInfo;
8 | import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
9 |
10 | /**
11 | * Pdf text extraction strategy, which cuts the text chunks crossing the extraction area.
12 | * By default, IText library does not cut such text snippets, so we do it here.
13 | */
14 | public class PdfTextExtractionStrategy extends LocationTextExtractionStrategy {
15 | protected Rectangle extractionArea;
16 |
17 | public PdfTextExtractionStrategy(Rectangle extractionArea) {
18 | super();
19 | this.extractionArea = extractionArea;
20 | }
21 |
22 | @Override
23 | public void eventOccurred(IEventData eventData, EventType eventType) {
24 | if (EventType.RENDER_TEXT == eventType) {
25 | TextRenderInfo data = (TextRenderInfo) eventData;
26 | // Split the text snippet to chars.
27 | for (TextRenderInfo renderInfo : data.getCharacterRenderInfos()) {
28 | // Get the char rendering boundaries.
29 | Rectangle charArea = new CharacterRenderInfo(renderInfo).getBoundingBox();
30 | if (isInsideExtractionArea(charArea)) {
31 | // Extract this char.
32 | super.eventOccurred(renderInfo, eventType);
33 | }
34 | } //
35 | }
36 | }
37 |
38 | /**
39 | * Check if the rendered text intersects the extraction area.
40 | * @param textArea Text rendering area.
41 | * @return True if the text is inside.
42 | */
43 | protected boolean isInsideExtractionArea(Rectangle textArea) {
44 | return extractionArea.contains(textArea);
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/source/main/java/dsk/anotex/importer/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Document annotation importing.
3 | * Use the {@link dsk.anotex.importer.ImporterFactory} to get the appropriate importer for a given file format.
4 | */
5 | package dsk.anotex.importer;
--------------------------------------------------------------------------------
/source/main/java/dsk/anotex/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Annotation extractor application.
3 | * To start it on the command line, use the {@link dsk.anotex.ConsoleRunner}.
4 | */
5 | package dsk.anotex;
--------------------------------------------------------------------------------
/source/main/java/dsk/anotex/util/CommandLineParser.java:
--------------------------------------------------------------------------------
1 | package dsk.anotex.util;
2 |
3 | import java.util.LinkedHashMap;
4 | import java.util.LinkedList;
5 | import java.util.List;
6 | import java.util.Map;
7 |
/**
 * Parser for command line arguments. Expected format:
 * -arg1 argValue1 -arg2 value3
 *
 * This implementation fits in single class! There are enough java libraries for parsing command line.
 * Most of them are example for over-engineering (but it is fun to see how much code was written just
 * to parse array of strings).
 *
 */
public class CommandLineParser {
    /** Parsed arguments: name (without the prefix) to value. The value is null for a bare flag. */
    protected Map<String, String> args;
    /** All values found on the command line, in order of appearance. */
    protected List<String> values;
    /** Prefix which marks a command line token as an argument name. */
    protected String argPrefix;

    /**
     * Create empty instance.
     */
    public CommandLineParser() {
        super();
        // Start with an empty map, so the query methods are usable before parse() is called.
        args = new LinkedHashMap<>();
        values = new LinkedList<>();
        argPrefix = "-";
    }

    /**
     * Constructor with specified parameters.
     * @param args Command line arguments.
     */
    public CommandLineParser(String[] args) {
        this();
        parse(args);
    }

    /**
     * Parse specified command line arguments.
     * @param arguments Command line arguments.
     */
    public void parse(String[] arguments) {
        args = parseArguments(arguments);
    }

    /**
     * Check if specified argument was passed.
     * @param name Argument name.
     * @return True if this argument exists.
     */
    public boolean hasArgument(String name) {
        return args.containsKey(name);
    }

    /**
     * Get argument with specified name.
     * @param name Argument name.
     * @return Argument value or null.
     *
     * @see #parse(String[])
     */
    public String getArgumentValue(String name) {
        return getArgumentValue(name, null);
    }

    /**
     * Get argument with specified name.
     * @param name Argument name.
     * @param defaultValue Default value.
     * @return Argument value or the default value (also when the argument was passed without value).
     *
     * @see #parse(String[])
     */
    public String getArgumentValue(String name, String defaultValue) {
        String ret = args.get(name);
        if (ret == null) {
            ret = defaultValue;
        }
        return ret;
    }

    /**
     * Get parsed command line arguments.
     * @return Parsed arguments.
     *
     * @see #parse(String[])
     */
    @SuppressWarnings("unused")
    public Map<String, String> getArguments() {
        return args;
    }

    /**
     * Get the command line values without the arguments. Example:
     * 'command -arg1 v1 arg2'
     * The result will be [v1, arg2].
     * @return The value without argument name.
     */
    @SuppressWarnings("unused")
    public List<String> getValues() {
        return values;
    }

    /**
     * Parse command line arguments (parameters). Parameters which contains spaces should be
     * enclosed with double quotas. Double quote sign itself (if present in command value) should
     * be escaped with \.
     * Every value found is also appended to the list returned by {@link #getValues()}.
     * @param args Command line arguments, passed to the application.
     *
     * @return Map with parsed keys and values. Or empty map (if no command line options passed).
     */
    public Map<String, String> parseArguments(String[] args) {
        Map<String, String> arguments = new LinkedHashMap<>();
        if ((args == null) || (args.length == 0)) {
            // No parameters passed.
            return arguments;
        }

        for (int i = 0; i < args.length; i++) {
            String s = args[i];
            if ((s == null) || (s.isEmpty())) {
                // Invalid argument. Skip it.
                continue;
            }

            String sOption = null;
            String sValue = null;
            if (s.startsWith(argPrefix)) {
                // It is argument.
                sOption = s.substring(argPrefix.length());
                if (i + 1 < args.length) {
                    // The next token is the argument value, unless it is another argument.
                    String next = args[i + 1];
                    if ((next != null) && !next.startsWith(argPrefix)) {
                        sValue = next;
                        i++;
                    }
                }
            }
            else {
                // It is value.
                sValue = s;
            }

            sValue = stripEnclosingQuotes(sValue);

            // Add to parameter map.
            if (sOption != null) {
                arguments.put(sOption, sValue);
            }
            if (sValue != null) {
                values.add(sValue);
            }
        } //

        return arguments;
    }

    /**
     * Remove the pair of double or single quotes enclosing the value (if any).
     * @param value The value. Can be null.
     * @return The value without enclosing quotes, or the passed value if it is not quoted.
     */
    private static String stripEnclosingQuotes(String value) {
        final String dQuota = "\"";
        final String sQuota = "'";
        // At least two characters are needed to form an enclosing pair.
        if ((value == null) || (value.length() < 2)) {
            return value;
        }
        boolean doubleQuoted = value.startsWith(dQuota) && value.endsWith(dQuota);
        boolean singleQuoted = value.startsWith(sQuota) && value.endsWith(sQuota);
        if (doubleQuoted || singleQuoted) {
            return value.substring(1, value.length() - 1);
        }
        return value;
    }
}
169 |
--------------------------------------------------------------------------------
/source/main/java/dsk/anotex/util/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Various utilities.
3 | */
4 | package dsk.anotex.util;
--------------------------------------------------------------------------------
/source/test/java/dsk/anotex/AnnotationExtractorTest.java:
--------------------------------------------------------------------------------
1 | package dsk.anotex;
2 |
3 | import dsk.anotex.core.AnnotatedDocument;
4 | import dsk.anotex.core.Annotation;
5 | import org.junit.jupiter.api.Assertions;
6 | import org.junit.jupiter.api.Test;
7 |
8 | import java.util.List;
9 |
10 | import static org.junit.jupiter.api.Assertions.assertEquals;
11 |
12 | public class AnnotationExtractorTest extends TestBase {
13 |
14 | @Test
15 | public void testMissingFile() {
16 | AnnotationExtractor extractor = new AnnotationExtractor();
17 | Assertions.assertThrows(IllegalArgumentException.class, () -> {
18 | extractor.readAnnotations(resDir + "/Missing.pdf");
19 | });
20 | }
21 |
22 | @Test
23 | public void testUnsupportedFile() {
24 | AnnotationExtractor extractor = new AnnotationExtractor();
25 | Assertions.assertThrows(IllegalArgumentException.class, () -> {
26 | extractor.readAnnotations(resDir + "/Test_Pdf_4.pdf");
27 | });
28 | }
29 |
30 | @Test
31 | public void testHighlightingOnly() {
32 | AnnotationExtractor extractor = new AnnotationExtractor();
33 | AnnotatedDocument document = extractor.readAnnotations(resDir + "/Test_Pdf_5.pdf");
34 | List annotations = document.getAnnotations();
35 | Annotation annot = annotations.getFirst();
36 | assertEquals("One Two", annot.getText());
37 | assertEquals(1, annotations.size());
38 | }
39 |
40 | }
41 |
--------------------------------------------------------------------------------
/source/test/java/dsk/anotex/ConsoleRunnerTest.java:
--------------------------------------------------------------------------------
1 | package dsk.anotex;
2 |
3 | import org.junit.jupiter.api.BeforeEach;
4 | import org.junit.jupiter.api.Test;
5 |
6 | import static org.junit.jupiter.api.Assertions.assertEquals;
7 |
8 | public class ConsoleRunnerTest extends TestBase {
9 |
10 | @BeforeEach
11 | public void beforeEach() {
12 | cleanTempDirectory();
13 | }
14 |
15 | @Test
16 | public void testExtraction1() {
17 | String inputFile = resDir + "/Test_Pdf_2.pdf";
18 | String outputFile = tempDir + "/Test_Pdf_2.pdf.md";
19 | ConsoleRunner.main(new String[]{"-input", inputFile, "-output", outputFile});
20 | String outputContent = readFile(outputFile);
21 | assertEquals("94d6378bf0eacfef6ec05e6b187673ac88f2d6ba4556acba584bb031f79f4ffa",
22 | calcChecksum(outputContent));
23 | }
24 | }
--------------------------------------------------------------------------------
/source/test/java/dsk/anotex/TestBase.java:
--------------------------------------------------------------------------------
1 | package dsk.anotex;
2 |
3 | import java.io.File;
4 | import java.io.FileWriter;
5 | import java.io.IOException;
6 | import java.io.InputStreamReader;
7 | import java.io.Reader;
8 | import java.math.BigInteger;
9 | import java.nio.charset.StandardCharsets;
10 | import java.nio.file.Files;
11 | import java.security.MessageDigest;
12 | import java.security.NoSuchAlgorithmException;
13 |
/**
 * Base functionality for unit tests.
 */
public abstract class TestBase {
    public static final String WORK_DIR = "work";
    public static final String TEMP_DIR = "temp";
    /** Directory the tests are run from. Initialized by the first created test instance. */
    protected static File workDir;
    /** Directory for the files produced by the tests. */
    protected static File tempDir;
    /** Directory with the test resources (the 'testing' subdirectory of the work directory). */
    protected static String resDir;
    /** Lazily created SHA-256 digester, see {@link #calcChecksum(String)}. */
    protected static MessageDigest digester;

    public TestBase() {
        super();
        if (workDir == null) {
            workDir = setupWorkDirectory(WORK_DIR);
            tempDir = setupTempDirectory(TEMP_DIR);
            resDir = workDir + "/testing";
        }
    }

    /**
     * Setup working directory to run the tests from.
     * @param dir Work directory name (relative to project root directory). Pass null to use default.
     * @return The work directory.
     */
    protected File setupWorkDirectory(String dir) {
        File workDir;
        if (dir == null) {
            // Detect the application home directory.
            var classesRoot = getClass().getClassLoader().getResource(".");
            if (classesRoot != null) {
                workDir = new File(classesRoot.getFile()).getParentFile();
            }
            else {
                // The class loader has no root resource. Use the current directory.
                workDir = new File("").getAbsoluteFile();
            }
        }
        else {
            // Try the requested directory.
            workDir = new File(dir).getAbsoluteFile();
            if (!workDir.isDirectory()) {
                // Then use the current directory.
                workDir = new File("").getAbsoluteFile();
            }
        }

        // Change the work directory. Note - this only updates the 'user.dir' system property,
        // code which does not consult this property is not affected.
        System.setProperty("user.dir", workDir.getAbsolutePath());
        return workDir;
    }

    /**
     * Setup temporary storage directory for tests. The directory is created if it does not exist.
     * @param dir Temporary directory name (relative to project root directory). Null = to use default.
     * @return The temp directory.
     */
    protected File setupTempDirectory(String dir) {
        File tempDir;
        if (dir == null) {
            tempDir = new File(workDir, TEMP_DIR).getAbsoluteFile();
        }
        else {
            tempDir = new File(dir).getAbsoluteFile();
        }
        // Create the directory before probing it - a missing directory is never reported as
        // writable, which would send every first run to the fallback below.
        tempDir.mkdirs();
        if (!tempDir.isDirectory() || !tempDir.canWrite()) {
            // Not writable directory. Try the JVM temp directory.
            // NOTE(review): cleanTempDirectory() removes everything inside the temp directory,
            // so with this fallback it operates on the shared system temp directory.
            String systemTempDir = System.getProperty("java.io.tmpdir");
            tempDir = new File(systemTempDir).getAbsoluteFile();
            tempDir.mkdirs();
        }
        return tempDir;
    }

    /**
     * Clean the temporary directory (remove its content, keep the directory itself).
     */
    protected void cleanTempDirectory() {
        if ((tempDir == null) || !tempDir.isDirectory()) {
            return;
        }
        File[] files = tempDir.listFiles();
        if (files != null) {
            for (File file : files) {
                removeDirectory(file);
            } //
        }
    }

    /**
     * Remove specified directory with its subdirectories. Works for plain files, too.
     * Symbolic links are removed without descending into their targets.
     * @param dir Directory name.
     */
    protected void removeDirectory(File dir) {
        if (dir.isDirectory() && !Files.isSymbolicLink(dir.toPath())) {
            File[] files = dir.listFiles();
            if (files != null) {
                for (File file : files) {
                    removeDirectory(file);
                } //
            }
        }
        dir.delete();
    }

    /**
     * Read complete file into string. Works with UTF-8 encoding.
     * @param fileName Name of the file to read.
     * @return File content. Empty string for empty or missing file. Null if the name is null.
     * @throws IllegalArgumentException If the file cannot be read.
     */
    protected static String readFile(String fileName) {
        // The variant with 'Paths' is not used intentionally (it ignores work directory change).
        if (fileName == null) {
            return null;
        }
        File file = new File(fileName).getAbsoluteFile();
        if (file.length() == 0) {
            // Empty file (the length is reported as 0 for a missing file, too).
            return "";
        }
        try {
            // Read all bytes at once - no reader to leak and no partial reads to handle.
            return new String(Files.readAllBytes(file.toPath()), StandardCharsets.UTF_8);
        }
        catch (IOException e) {
            String message = String.format("Cannot read file '%s'", file);
            throw new IllegalArgumentException(message, e);
        }
    }

    /**
     * Write specified string into file. Works with UTF-8 encoding. Existing file is overwritten.
     * Nothing is written if the file name or the content is null.
     * @param fileName Desired file name.
     * @param fileContent File content.
     * @throws IllegalArgumentException If the file cannot be written.
     */
    protected static void writeFile(String fileName, String fileContent) {
        // The variant with 'Paths' is not used intentionally (it ignores work directory change).
        if ((fileName != null) && (fileContent != null)) {
            File file = new File(fileName).getAbsoluteFile();
            try {
                File parent = file.getParentFile();
                if (parent != null) {
                    parent.mkdirs();
                }
                // Encode explicitly - the result must not depend on the platform default charset.
                Files.write(file.toPath(), fileContent.getBytes(StandardCharsets.UTF_8));
            }
            catch (IOException e) {
                String message = String.format("Cannot write file '%s'", file);
                throw new IllegalArgumentException(message, e);
            }
        }
    }

    /**
     * Calculate SHA-256 checksum on given text (considering new line separator differences
     * between different OS).
     * @param text Input text.
     * @return Calculated checksum (64 lower case hex digits). Null for null or empty text.
     */
    protected String calcChecksum(String text) {
        String checksum = null;
        if ((text != null) && (!text.isEmpty())) {
            if (digester == null) {
                try {
                    digester = MessageDigest.getInstance("SHA-256");
                }
                catch (NoSuchAlgorithmException e) {
                    throw new RuntimeException(e);
                }
            }
            text = text.replace("\r", ""); // Unify the new line characters.
            byte[] digest = digester.digest(text.getBytes(StandardCharsets.UTF_8));
            // Pad to the full digest width - BigInteger drops the leading zeros.
            checksum = String.format("%064x", new BigInteger(1, digest));
        }
        return checksum;
    }

}
198 |
--------------------------------------------------------------------------------
/source/test/java/dsk/anotex/exporter/MarkdownExporterTest.java:
--------------------------------------------------------------------------------
1 | package dsk.anotex.exporter;
2 |
3 | import dsk.anotex.TestBase;
4 | import dsk.anotex.core.AnnotatedDocument;
5 | import dsk.anotex.core.Annotation;
6 | import org.junit.jupiter.api.Test;
7 |
8 | import java.io.StringWriter;
9 | import java.util.Arrays;
10 | import java.util.HashMap;
11 |
12 | import static org.junit.jupiter.api.Assertions.assertEquals;
13 |
14 | public class MarkdownExporterTest extends TestBase {
15 |
16 | @Test
17 | public void testExport1() {
18 | MarkdownExporter exporter = new MarkdownExporter();
19 | AnnotatedDocument document = createDocument();
20 | StringWriter output = new StringWriter(256);
21 | exporter.export(document, new HashMap<>(), output);
22 | String sResult = "# Title1 #\n"
23 | + "\n"
24 | + "\n"
25 | + "Text1\n"
26 | + "Text2\n";
27 | String s = output.toString().replace("\r\n", "\n");
28 | assertEquals(sResult, s);
29 | }
30 |
31 | protected AnnotatedDocument createDocument() {
32 | AnnotatedDocument document = new AnnotatedDocument();
33 | document.setTitle("Title1");
34 | Annotation annot1 = new Annotation("Text1");
35 | Annotation annot2 = new Annotation("Text2");
36 | document.setAnnotations(Arrays.asList(annot1, annot2));
37 | return document;
38 | }
39 | }
--------------------------------------------------------------------------------
/source/test/java/dsk/anotex/importer/PdfAnnotationImporterTest.java:
--------------------------------------------------------------------------------
1 | package dsk.anotex.importer;
2 |
3 | import dsk.anotex.TestBase;
4 | import dsk.anotex.core.AnnotatedDocument;
5 | import dsk.anotex.core.Annotation;
6 | import org.junit.jupiter.api.Test;
7 |
8 | import java.util.List;
9 |
10 | import static org.junit.jupiter.api.Assertions.assertEquals;
11 |
12 | public class PdfAnnotationImporterTest extends TestBase {
13 |
14 | @Test
15 | public void testCyrillicAnnotation() {
16 | PdfAnnotationImporter importer = new PdfAnnotationImporter();
17 | AnnotatedDocument document = importer.readAnnotations(resDir + "/Test_Pdf_1.pdf");
18 | List annotations = document.getAnnotations();
19 | Annotation annot = annotations.getFirst();
20 | assertEquals("\u041f\u0435\u0442", annot.getText()); // Пет ("five" in Cyrillic).
21 | assertEquals(1, annotations.size());
22 | }
23 |
24 | @Test
25 | public void testOneAnnotation() {
26 | PdfAnnotationImporter importer = new PdfAnnotationImporter();
27 | AnnotatedDocument document = importer.readAnnotations(resDir + "/Test_Pdf_2.pdf");
28 | assertEquals("Title2", document.getTitle());
29 | assertEquals("Subject2", document.getSubject());
30 | assertEquals("Author2", document.getAuthor());
31 | List annotations = document.getAnnotations();
32 | Annotation annot = annotations.getFirst();
33 | assertEquals("Two", annot.getText());
34 | assertEquals(1, annotations.size());
35 | }
36 |
37 | @Test
38 | public void testComments() {
39 | PdfAnnotationImporter importer = new PdfAnnotationImporter();
40 | AnnotatedDocument document = importer.readAnnotations(resDir + "/Test_Pdf_3.pdf");
41 | List annotations = document.getAnnotations();
42 | Annotation annot1 = annotations.get(0);
43 | assertEquals("Four", annot1.getText());
44 | Annotation annot2 = annotations.get(1);
45 | assertEquals("Five", annot2.getText());
46 | Annotation annot3 = annotations.get(2);
47 | assertEquals("Six", annot3.getText());
48 | assertEquals(3, annotations.size());
49 | }
50 |
51 | @Test
52 | public void testStripUnwantedChunks() {
53 | PdfAnnotationImporter importer = new PdfAnnotationImporter();
54 |
55 | String res1 = importer.stripUnwantedChunks("Be them. W");
56 | assertEquals("Be them.", res1);
57 |
58 | String res2 = importer.stripUnwantedChunks("o? When");
59 | assertEquals("When", res2);
60 |
61 | String res3 = importer.stripUnwantedChunks("I can be");
62 | assertEquals("I can be", res3);
63 |
64 | String res4 = importer.stripUnwantedChunks("\"Awesome!\"");
65 | assertEquals("Awesome!", res4);
66 | }
67 |
68 | @Test
69 | public void testHighlightingBoundaries() {
70 | PdfAnnotationImporter importer = new PdfAnnotationImporter();
71 | AnnotatedDocument document = importer.readAnnotations(resDir + "/Test_Pdf_6.pdf");
72 | List annotations = document.getAnnotations();
73 | Annotation annot1 = annotations.getFirst();
74 | assertEquals("seven eight nine ten eleven twelve thirteen fourteen fifteen", annot1.getText());
75 | }
76 |
77 | @Test
78 | public void testHighlightingWithContent() {
79 | PdfAnnotationImporter importer = new PdfAnnotationImporter();
80 | AnnotatedDocument document = importer.readAnnotations(resDir + "/Test_Pdf_7.pdf");
81 | List annotations = document.getAnnotations();
82 | Annotation annot1 = annotations.getFirst();
83 | assertEquals("The programs that a home user needs are email, web browser, pdf file viewer, " +
84 | "video an music playback software as well as, office program including spreadsheet, " +
85 | "word processing and presentation graphics. Today, cloud services, " +
86 | "web calls and other social", annot1.getText());
87 | }
88 | }
--------------------------------------------------------------------------------
/work/DyAnnotationExtractor:
--------------------------------------------------------------------------------
#!/bin/bash
# Start the annotation extractor. All command line arguments are forwarded unchanged
# ("$@" keeps every argument intact, including file names which contain spaces).
java -cp "program:library/*" dsk.anotex.ConsoleRunner "$@"
--------------------------------------------------------------------------------
/work/DyAnnotationExtractor.bat:
--------------------------------------------------------------------------------
@echo off
rem Start the annotation extractor. All command line arguments are forwarded unchanged.
java -cp program;library/* dsk.anotex.ConsoleRunner %*
--------------------------------------------------------------------------------
/work/documents/Highlight_Example_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/work/documents/Highlight_Example_1.png
--------------------------------------------------------------------------------
/work/documents/Manual.md:
--------------------------------------------------------------------------------
1 | # DyAnnotationExtractor #
2 |
3 | DyAnnotationExtractor is software for extracting annotations (highlighted text and comments) from e-documents such as PDF files. The extracted parts can be used to build a summary/résumé of the document.
4 |
5 |
6 | Note! The AI bot [ChatGPT](https://chatgpt.com/) is now capable of extracting highlighted text from a PDF file and exporting the summary to Markdown. There is no incentive to develop this project further.
7 | DyAnnotationExtractor remains usable in its current state - it can be applied in task automation tools which cannot call AI service (because of sensitive documents or internet restrictions).
8 |
9 |
10 | ## Usage ##
11 |
12 | Imagine you have an ebook (PDF) which is 100 pages long. While reading the book,
13 | you **highlight** the important parts in your favorite reader:
14 |
15 | 
16 |
17 | Then use the DyAnnotationExtractor tool to get just the highlighted parts.
18 |
19 | Via the command line:
20 | ```console
21 | DyAnnotationExtractor -input "Getting Started with Ubuntu 16.04.pdf"
22 | ```
23 |
24 | This will create a file with the same name in the same directory, with an added '.md' suffix.
25 | Note that the file name is enclosed in quotes - this is required when the file name contains spaces.
26 |
27 | Now you have an extract of the book which is not 100 but 5-6 pages long. So, you can skim just the exported text instead of re-reading the entire book.
28 |
29 | ## Supported Input Formats ##
30 |
31 | - PDF (Portable Document Format)
32 |
33 | ## Supported Output Formats ##
34 |
35 | - MD (Markdown)
36 |
37 | ## Requirements ##
38 |
39 | - Java 21+.
40 |
41 | ## Download ##
42 |
43 | Get the [latest release](https://github.com/dimi2/DyAnnotationExtractor/releases/latest).
44 |
45 |
46 | End users need to download only the distribution jar.
47 |
48 | ## Installation ##
49 |
50 | Extract the downloaded archive into a local directory.
51 | Run the provided 'DyAnnotationExtractor' script to perform extraction.
52 |
53 | ## Build ##
54 |
55 | To build the project from sources, you will need the [Gradle](https://gradle.org/) build tool.
56 | Go into the project home directory (PROJ_HOME) and execute the command:
57 |
58 | ```
59 | gradle
60 | ```
61 | The result will appear in the directory `PROJ_HOME/build/distribution`. This is a portable distribution of the application. If you need just the library (without dependencies and start scripts), use the JAR file generated in the `PROJ_HOME/build/libs` directory.
62 |
63 |
--------------------------------------------------------------------------------
/work/testing/Test_Pdf_1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/work/testing/Test_Pdf_1.pdf
--------------------------------------------------------------------------------
/work/testing/Test_Pdf_2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/work/testing/Test_Pdf_2.pdf
--------------------------------------------------------------------------------
/work/testing/Test_Pdf_3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/work/testing/Test_Pdf_3.pdf
--------------------------------------------------------------------------------
/work/testing/Test_Pdf_4.pdf:
--------------------------------------------------------------------------------
1 | Invalid PDF file.
--------------------------------------------------------------------------------
/work/testing/Test_Pdf_5.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/work/testing/Test_Pdf_5.pdf
--------------------------------------------------------------------------------
/work/testing/Test_Pdf_6.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/work/testing/Test_Pdf_6.pdf
--------------------------------------------------------------------------------
/work/testing/Test_Pdf_7.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dimi2/DyAnnotationExtractor/3e99f750a0b811e55cc182371ed5b3f348a40982/work/testing/Test_Pdf_7.pdf
--------------------------------------------------------------------------------