├── .gitignore
├── CODEOWNERS
├── COPYING
├── NOTICE
├── README.md
├── pom.xml
├── src
├── assembly
│ ├── tgz.xml
│ └── zip.xml
├── main
│ ├── java
│ │ └── io
│ │ │ └── mfj
│ │ │ └── textricator
│ │ │ ├── Textricator.kt
│ │ │ ├── Version.kt
│ │ │ ├── cli
│ │ │ └── TextricatorCli.kt
│ │ │ ├── extractor
│ │ │ ├── TextExtractor.kt
│ │ │ ├── TextExtractorFactory.kt
│ │ │ ├── TextExtractorOptions.kt
│ │ │ ├── csv
│ │ │ │ ├── CsvTextExtractor.kt
│ │ │ │ └── CvsTextExtractorFactory.kt
│ │ │ ├── itext5
│ │ │ │ ├── Box.kt
│ │ │ │ ├── Boxtricator.kt
│ │ │ │ ├── Buffer.kt
│ │ │ │ ├── Itext5TextExtractor.kt
│ │ │ │ ├── Itext5TextExtractorFactory.kt
│ │ │ │ ├── Shenanigans.kt
│ │ │ │ └── Size.kt
│ │ │ ├── itext7
│ │ │ │ ├── Itext7TextExtractor.kt
│ │ │ │ └── Itext7TextExtractorFactory.kt
│ │ │ ├── json
│ │ │ │ ├── JsonTextExtractor.kt
│ │ │ │ └── JsonTextExtractorFactory.kt
│ │ │ └── pdfbox
│ │ │ │ ├── PdfboxTextExtractor.kt
│ │ │ │ ├── PdfboxTextExtractorFactory.kt
│ │ │ │ └── TextBoxPdfTextStripper.kt
│ │ │ ├── form
│ │ │ ├── FormParseEventListener.kt
│ │ │ ├── FsmEventListener.kt
│ │ │ ├── FsmParser.kt
│ │ │ ├── LoggingEventListener.kt
│ │ │ ├── RecordParser.kt
│ │ │ ├── RecordParserEventListener.kt
│ │ │ ├── StateValue.kt
│ │ │ ├── WriterEventListener.kt
│ │ │ └── config
│ │ │ │ ├── DefaultAndPages.kt
│ │ │ │ ├── FormParseConfig.kt
│ │ │ │ ├── FormParseConfigUtil.kt
│ │ │ │ ├── State.kt
│ │ │ │ ├── Transition.kt
│ │ │ │ └── VariableSet.kt
│ │ │ ├── record
│ │ │ ├── Record.kt
│ │ │ ├── RecordFilter.kt
│ │ │ ├── RecordModel.kt
│ │ │ ├── Value.kt
│ │ │ └── output
│ │ │ │ ├── CsvRecordOutput.kt
│ │ │ │ ├── JsonFlatRecordOutput.kt
│ │ │ │ ├── JsonRecordOutput.kt
│ │ │ │ ├── NullOutput.kt
│ │ │ │ ├── RecordOutput.kt
│ │ │ │ └── XmlRecordOutput.kt
│ │ │ ├── table
│ │ │ ├── Table.kt
│ │ │ ├── TableParser.kt
│ │ │ └── config
│ │ │ │ ├── TableParseConfig.kt
│ │ │ │ └── TableParseConfigUtil.kt
│ │ │ └── text
│ │ │ ├── Page.kt
│ │ │ ├── PageFilter.kt
│ │ │ ├── RowGrouper.kt
│ │ │ ├── Text.kt
│ │ │ └── output
│ │ │ ├── CsvTextOutput.kt
│ │ │ ├── JsonTextOutput.kt
│ │ │ └── TextOutput.kt
│ └── resources
│ │ └── io
│ │ └── mfj
│ │ └── textricator
│ │ ├── extractor
│ │ └── textExtractor.properties
│ │ ├── logback.xml
│ │ └── version.properties
├── scripts
│ ├── textricator
│ └── textricator.bat
└── test
│ ├── java
│ └── io
│ │ └── mfj
│ │ └── textricator
│ │ ├── examples
│ │ └── ExamplesTest.kt
│ │ ├── form
│ │ ├── NodeMembersTest.kt
│ │ ├── PatternReplacementTest.kt
│ │ └── RecordParserTest.kt
│ │ └── record
│ │ ├── RecordFilterTest.kt
│ │ └── output
│ │ ├── CsvOutputTest.kt
│ │ └── XmlOutputTest.kt
│ └── resources
│ └── io
│ └── mfj
│ └── textricator
│ └── examples
│ ├── probes.pdf
│ ├── probes.yml
│ ├── rap-sheet.csv
│ ├── rap-sheet.pdf
│ ├── rap-sheet.yml
│ ├── school-employee-list.csv
│ ├── school-employee-list.pdf
│ └── school-employee-list.yml
├── textricator-logo-text-paths.png
└── textricator-mascot.png
/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | .idea/
3 | *.iml
4 |
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # This is the CODEOWNERS file for the textricator repo.
2 |
3 | # These owners will be the default owners for everything in the repo, unless a
4 | # later match takes precedence.
5 | * @lschumann-mfj @wstumbo-mfj
6 |
7 | # Make sure that DevOps is aware of anything GitHub related
8 | /.github/ @lschumann-mfj @wstumbo-mfj @SB-MFJ @meghanbissonnette-mfj
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | Textricator
2 | Copyright 2018 Measures for Justice Institute.
3 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | io.mfj
8 | textricator
9 | 10.2-SNAPSHOT
10 |
11 | ${project.groupId}:${project.artifactId}
12 | A tool to extract text from documents and generate structured data
13 |
14 | https://textricator.mfj.io/
15 |
16 |
17 |
18 | Measures for Justice
19 | opensource@measuresforjustice.org
20 | Measures for Justice
21 | https://measuresforjustice.org/
22 |
23 |
24 |
25 |
26 |
27 | GNU Affero General Public License, Version 3
28 | https://www.gnu.org/licenses/agpl-3.0.txt
29 |
30 |
31 |
32 |
33 | 11
34 | 11
35 | 11
36 | 1.9.25
37 | UTF-8
38 | 2025
39 | https://github.com/measuresforjustice/textricator
40 |
41 |
42 |
43 |
44 | org.jetbrains.kotlinx
45 | kotlinx-coroutines-core
46 | 1.7.3
47 |
48 |
49 |
50 | org.jetbrains.kotlin
51 | kotlin-stdlib-jdk8
52 | ${kotlin.version}
53 |
54 |
55 |
56 | org.jetbrains.kotlin
57 | kotlin-test-junit
58 | ${kotlin.version}
59 |
60 | test
61 |
62 |
64 |
65 | org.jetbrains.kotlin
66 | kotlin-reflect
67 | ${kotlin.version}
68 |
69 |
70 |
71 |
72 | io.mfj
73 | expr
74 | 6.2.39
75 |
76 |
77 |
78 |
79 |
80 | org.apache.pdfbox
81 | pdfbox
82 | 2.0.30
83 |
84 |
85 |
86 | org.apache.pdfbox
87 | pdfbox-tools
88 | 2.0.30
89 |
90 |
91 |
92 |
93 |
94 | com.itextpdf
95 | itextpdf
96 | 5.5.13.3
97 |
98 |
99 |
100 | org.bouncycastle
101 | bcprov-jdk18on
102 | 1.78
103 |
104 |
105 |
106 |
107 |
108 | com.itextpdf
109 | kernel
110 | 8.0.2
111 |
112 |
113 |
114 | com.itextpdf
115 | layout
116 | 8.0.2
117 |
118 |
119 |
120 |
121 | com.fasterxml.jackson.core
122 | jackson-databind
123 | 2.15.3
124 |
125 |
126 |
127 | com.fasterxml.jackson.dataformat
128 | jackson-dataformat-xml
129 | 2.15.3
130 |
131 |
132 |
133 | com.fasterxml.jackson.dataformat
134 | jackson-dataformat-yaml
135 | 2.15.3
136 |
137 |
138 |
139 | com.fasterxml.jackson.module
140 | jackson-module-kotlin
141 | 2.15.3
142 |
143 |
144 |
145 | org.jetbrains.kotlin
146 | kotlin-stdlib
147 |
148 |
149 | org.jetbrains.kotlin
150 | kotlin-reflect
151 |
152 |
153 |
154 |
155 |
156 | org.slf4j
157 | slf4j-api
158 | 2.0.9
159 |
160 |
161 |
162 | ch.qos.logback
163 | logback-classic
164 | 1.4.12
165 |
166 |
167 |
168 | org.apache.commons
169 | commons-csv
170 | 1.10.0
171 |
172 |
173 |
174 |
175 | com.offbytwo
176 | docopt
177 | 0.6.0.20150202
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 | src/main/resources
190 | true
191 |
192 | **/version.properties
193 |
194 |
195 |
196 | src/main/resources
197 | false
198 |
199 | **/version.properties
200 |
201 |
202 |
203 |
204 |
205 |
206 | org.jetbrains.kotlin
207 | kotlin-maven-plugin
208 | ${kotlin.version}
209 |
210 |
211 | compile
212 | compile
213 |
214 | compile
215 |
216 |
217 |
218 | ${project.basedir}/src/main/java
219 |
220 |
221 |
222 |
223 | test-compile
224 | test-compile
225 |
226 | test-compile
227 |
228 |
229 |
230 | ${project.basedir}/src/test/java
231 |
232 |
233 |
234 |
235 |
236 |
237 | org.jetbrains.dokka
238 | dokka-maven-plugin
239 | 2.0.0
240 |
241 |
242 | prepare-package
243 |
244 | dokka
245 | javadoc
246 | javadocJar
247 |
248 |
249 |
250 |
251 |
252 | org.apache.maven.plugins
253 | maven-source-plugin
254 | 3.2.1
255 |
256 |
257 | attach-sources
258 |
259 | jar
260 |
261 |
262 |
263 |
264 |
265 | org.apache.maven.plugins
266 | maven-surefire-plugin
267 | 2.22.1
268 |
269 | false
270 |
271 |
272 |
273 | org.apache.maven.plugins
274 | maven-assembly-plugin
275 | 3.3.0
276 |
277 |
278 | package
279 |
280 | single
281 |
282 |
283 |
284 |
285 |
286 | src/assembly/tgz.xml
287 | src/assembly/zip.xml
288 |
289 |
290 |
291 |
292 |
293 | org.apache.maven.plugins
294 | maven-enforcer-plugin
295 | 1.4.1
296 |
297 |
298 | enforce-no-snapshots
299 |
300 | enforce
301 |
302 |
303 |
304 |
305 | Cannot have snapshot dependencies of a release!
306 | true
307 | true
308 | true
309 |
310 |
311 | true
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
--------------------------------------------------------------------------------
/src/assembly/tgz.xml:
--------------------------------------------------------------------------------
1 |
3 | bin
4 |
5 | tgz
6 |
7 |
8 |
9 | /
10 | ${project.basedir}/src/scripts
11 |
12 | textricator
13 |
14 | 0755
15 |
16 |
17 | ${project.basedir}
18 |
19 | README.*
20 | NOTICE
21 | COPYING
22 | examples/
23 |
24 |
25 |
26 |
27 |
28 | lib
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/src/assembly/zip.xml:
--------------------------------------------------------------------------------
1 |
3 | bin
4 |
5 | zip
6 |
7 |
8 |
9 | /
10 | ${project.basedir}/src/scripts
11 |
12 | textricator.bat
13 |
14 |
15 |
16 | ${project.basedir}
17 |
18 | README.*
19 | NOTICE
20 | COPYING
21 | examples/
22 |
23 |
24 |
25 |
26 |
27 | lib
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/Version.kt:
--------------------------------------------------------------------------------
1 | /*
2 | This file is part of Textricator.
3 | Copyright 2018 Measures for Justice Institute.
4 |
5 | This program is free software: you can redistribute it and/or modify it under
6 | the terms of the GNU Affero General Public License version 3 as published by the
7 | Free Software Foundation.
8 |
9 | This program is distributed in the hope that it will be useful, but WITHOUT ANY
10 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
11 | PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
12 |
13 | You should have received a copy of the GNU Affero General Public License along
14 | with this program. If not, see <https://www.gnu.org/licenses/>.
15 | */
16 |
17 | package io.mfj.textricator
18 |
19 | import java.util.*
20 |
21 | /**
22 |  * Build metadata for Textricator, loaded from the `version.properties`
23 |  * classpath resource that sits in the same package as this class.
24 |  *
25 |  * If the resource or one of its keys is missing, initialization fails with an
26 |  * [IllegalStateException] that names what is missing, rather than an
27 |  * unexplained [NullPointerException].
28 |  */
29 | object Version {
30 |
31 |   /** Value of the `version` key. */
32 |   val version:String
33 |   /** Value of the `copyright.year` key. */
34 |   val copyrightYear:String
35 |   /** Value of the `source.location` key. */
36 |   val sourceLocation:String
37 |
38 |   init {
39 |     val resourceName = "version.properties"
40 |
41 |     // getResourceAsStream returns null when the resource is not on the classpath.
42 |     val stream = checkNotNull( Version::class.java.getResourceAsStream( resourceName ) ) {
43 |       "Resource '${resourceName}' not found on the classpath"
44 |     }
45 |
46 |     val props = Properties().apply {
47 |       stream.use { input ->
48 |         load( input )
49 |       }
50 |     }
51 |
52 |     // getProperty returns null for an absent key; report which key is missing.
53 |     val required = { key:String ->
54 |       checkNotNull( props.getProperty( key ) ) {
55 |         "Missing property '${key}' in '${resourceName}'"
56 |       }
57 |     }
58 |
59 |     version = required( "version" )
60 |     copyrightYear = required( "copyright.year" )
61 |     sourceLocation = required( "source.location" )
62 |   }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/cli/TextricatorCli.kt:
--------------------------------------------------------------------------------
1 | /*
2 | This file is part of Textricator.
3 | Copyright 2018 Measures for Justice Institute.
4 |
5 | This program is free software: you can redistribute it and/or modify it under
6 | the terms of the GNU Affero General Public License version 3 as published by the
7 | Free Software Foundation.
8 |
9 | This program is distributed in the hope that it will be useful, but WITHOUT ANY
10 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
11 | PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
12 |
13 | You should have received a copy of the GNU Affero General Public License along
14 | with this program. If not, see <https://www.gnu.org/licenses/>.
15 | */
16 |
17 | package io.mfj.textricator.cli
18 |
19 | import io.mfj.textricator.*
20 | import io.mfj.textricator.extractor.TextExtractorFactory
21 | import io.mfj.textricator.extractor.TextExtractorOptions
22 | import io.mfj.textricator.form.config.FormParseConfigUtil
23 | import io.mfj.textricator.table.config.TableParseConfigUtil
24 | import io.mfj.textricator.text.toPageFilter
25 |
26 | import java.io.File
27 | import java.io.InputStream
28 |
29 | import ch.qos.logback.classic.Logger
30 | import ch.qos.logback.classic.Level
31 |
32 | import org.docopt.Docopt
33 |
34 | import org.slf4j.LoggerFactory
35 | import kotlin.system.exitProcess
36 |
37 | /**
38 | * Command-line interface to [Textricator].
39 | *
40 | * Can just extract text or also run the form or table parser.
41 | */
42 | object TextricatorCli {
43 |
44 | private val help = """
45 | Textricator
46 |
47 | Textricator extracts content from PDFs.
48 |
49 | "text" extracts the text from a PDF and outputs to CSV or JSON.
50 | "form" parse a form (using a finite state machine) and generate records.
51 | "table" parses a table and generates records.
52 |
53 | Output is to standard out if not specified.
54 |
55 | Usage:
56 | textricator text [--debug] [--pages=] [--max-row-distance=] [--box-precision=] [--box-ignore-colors=] [--input-format=] [--output-format=] [