├── .gitignore ├── CODEOWNERS ├── COPYING ├── NOTICE ├── README.md ├── pom.xml ├── src ├── assembly │ ├── tgz.xml │ └── zip.xml ├── main │ ├── java │ │ └── io │ │ │ └── mfj │ │ │ └── textricator │ │ │ ├── Textricator.kt │ │ │ ├── Version.kt │ │ │ ├── cli │ │ │ └── TextricatorCli.kt │ │ │ ├── extractor │ │ │ ├── TextExtractor.kt │ │ │ ├── TextExtractorFactory.kt │ │ │ ├── TextExtractorOptions.kt │ │ │ ├── csv │ │ │ │ ├── CsvTextExtractor.kt │ │ │ │ └── CvsTextExtractorFactory.kt │ │ │ ├── itext5 │ │ │ │ ├── Box.kt │ │ │ │ ├── Boxtricator.kt │ │ │ │ ├── Buffer.kt │ │ │ │ ├── Itext5TextExtractor.kt │ │ │ │ ├── Itext5TextExtractorFactory.kt │ │ │ │ ├── Shenanigans.kt │ │ │ │ └── Size.kt │ │ │ ├── itext7 │ │ │ │ ├── Itext7TextExtractor.kt │ │ │ │ └── Itext7TextExtractorFactory.kt │ │ │ ├── json │ │ │ │ ├── JsonTextExtractor.kt │ │ │ │ └── JsonTextExtractorFactory.kt │ │ │ └── pdfbox │ │ │ │ ├── PdfboxTextExtractor.kt │ │ │ │ ├── PdfboxTextExtractorFactory.kt │ │ │ │ └── TextBoxPdfTextStripper.kt │ │ │ ├── form │ │ │ ├── FormParseEventListener.kt │ │ │ ├── FsmEventListener.kt │ │ │ ├── FsmParser.kt │ │ │ ├── LoggingEventListener.kt │ │ │ ├── RecordParser.kt │ │ │ ├── RecordParserEventListener.kt │ │ │ ├── StateValue.kt │ │ │ ├── WriterEventListener.kt │ │ │ └── config │ │ │ │ ├── DefaultAndPages.kt │ │ │ │ ├── FormParseConfig.kt │ │ │ │ ├── FormParseConfigUtil.kt │ │ │ │ ├── State.kt │ │ │ │ ├── Transition.kt │ │ │ │ └── VariableSet.kt │ │ │ ├── record │ │ │ ├── Record.kt │ │ │ ├── RecordFilter.kt │ │ │ ├── RecordModel.kt │ │ │ ├── Value.kt │ │ │ └── output │ │ │ │ ├── CsvRecordOutput.kt │ │ │ │ ├── JsonFlatRecordOutput.kt │ │ │ │ ├── JsonRecordOutput.kt │ │ │ │ ├── NullOutput.kt │ │ │ │ ├── RecordOutput.kt │ │ │ │ └── XmlRecordOutput.kt │ │ │ ├── table │ │ │ ├── Table.kt │ │ │ ├── TableParser.kt │ │ │ └── config │ │ │ │ ├── TableParseConfig.kt │ │ │ │ └── TableParseConfigUtil.kt │ │ │ └── text │ │ │ ├── Page.kt │ │ │ ├── PageFilter.kt │ │ │ ├── RowGrouper.kt │ │ │ ├── 
Text.kt │ │ │ └── output │ │ │ ├── CsvTextOutput.kt │ │ │ ├── JsonTextOutput.kt │ │ │ └── TextOutput.kt │ └── resources │ │ └── io │ │ └── mfj │ │ └── textricator │ │ ├── extractor │ │ └── textExtractor.properties │ │ ├── logback.xml │ │ └── version.properties ├── scripts │ ├── textricator │ └── textricator.bat └── test │ ├── java │ └── io │ │ └── mfj │ │ └── textricator │ │ ├── examples │ │ └── ExamplesTest.kt │ │ ├── form │ │ ├── NodeMembersTest.kt │ │ ├── PatternReplacementTest.kt │ │ └── RecordParserTest.kt │ │ └── record │ │ ├── RecordFilterTest.kt │ │ └── output │ │ ├── CsvOutputTest.kt │ │ └── XmlOutputTest.kt │ └── resources │ └── io │ └── mfj │ └── textricator │ └── examples │ ├── probes.pdf │ ├── probes.yml │ ├── rap-sheet.csv │ ├── rap-sheet.pdf │ ├── rap-sheet.yml │ ├── school-employee-list.csv │ ├── school-employee-list.pdf │ └── school-employee-list.yml ├── textricator-logo-text-paths.png └── textricator-mascot.png /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | .idea/ 3 | *.iml 4 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This is the CODEOWNERS file for the textricator repo. 2 | 3 | # These owners will be the default owners for everything in the repo, unless a 4 | # later match takes precedence. 5 | * @lschumann-mfj @wstumbo-mfj 6 | 7 | # Make sure that DevOps is aware of anything GitHub related 8 | /.github/ @lschumann-mfj @wstumbo-mfj @SB-MFJ @meghanbissonnette-mfj -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Textricator 2 | Copyright 2018 Measures for Justice Institute. 
3 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | io.mfj 8 | textricator 9 | 10.2-SNAPSHOT 10 | 11 | ${project.groupId}:${project.artifactId} 12 | A tool to extract text from documents and generate structured data 13 | 14 | https://textricator.mfj.io/ 15 | 16 | 17 | 18 | Measures for Justice 19 | opensource@measuresforjustice.org 20 | Measures for Justice 21 | https://measuresforjustice.org/ 22 | 23 | 24 | 25 | 26 | 27 | GNU Affero General Public License, Version 3 28 | https://www.gnu.org/licenses/agpl-3.0.txt 29 | 30 | 31 | 32 | 33 | 11 34 | 11 35 | 11 36 | 1.9.25 37 | UTF-8 38 | 2025 39 | https://github.com/measuresforjustice/textricator 40 | 41 | 42 | 43 | 44 | org.jetbrains.kotlinx 45 | kotlinx-coroutines-core 46 | 1.7.3 47 | 48 | 49 | 50 | org.jetbrains.kotlin 51 | kotlin-stdlib-jdk8 52 | ${kotlin.version} 53 | 54 | 55 | 56 | org.jetbrains.kotlin 57 | kotlin-test-junit 58 | ${kotlin.version} 59 | 60 | test 61 | 62 | 64 | 65 | org.jetbrains.kotlin 66 | kotlin-reflect 67 | ${kotlin.version} 68 | 69 | 70 | 71 | 72 | io.mfj 73 | expr 74 | 6.2.39 75 | 76 | 77 | 78 | 79 | 80 | org.apache.pdfbox 81 | pdfbox 82 | 2.0.30 83 | 84 | 85 | 86 | org.apache.pdfbox 87 | pdfbox-tools 88 | 2.0.30 89 | 90 | 91 | 92 | 93 | 94 | com.itextpdf 95 | itextpdf 96 | 5.5.13.3 97 | 98 | 99 | 100 | org.bouncycastle 101 | bcprov-jdk18on 102 | 1.78 103 | 104 | 105 | 106 | 107 | 108 | com.itextpdf 109 | kernel 110 | 8.0.2 111 | 112 | 113 | 114 | com.itextpdf 115 | layout 116 | 8.0.2 117 | 118 | 119 | 120 | 121 | com.fasterxml.jackson.core 122 | jackson-databind 123 | 2.15.3 124 | 125 | 126 | 127 | com.fasterxml.jackson.dataformat 128 | jackson-dataformat-xml 129 | 2.15.3 130 | 131 | 132 | 133 | com.fasterxml.jackson.dataformat 134 | jackson-dataformat-yaml 135 | 2.15.3 136 | 137 | 138 | 139 | com.fasterxml.jackson.module 140 | 
jackson-module-kotlin 141 | 2.15.3 142 | 143 | 144 | 145 | org.jetbrains.kotlin 146 | kotlin-stdlib 147 | 148 | 149 | org.jetbrains.kotlin 150 | kotlin-reflect 151 | 152 | 153 | 154 | 155 | 156 | org.slf4j 157 | slf4j-api 158 | 2.0.9 159 | 160 | 161 | 162 | ch.qos.logback 163 | logback-classic 164 | 1.4.12 165 | 166 | 167 | 168 | org.apache.commons 169 | commons-csv 170 | 1.10.0 171 | 172 | 173 | 174 | 175 | com.offbytwo 176 | docopt 177 | 0.6.0.20150202 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | src/main/resources 190 | true 191 | 192 | **/version.properties 193 | 194 | 195 | 196 | src/main/resources 197 | false 198 | 199 | **/version.properties 200 | 201 | 202 | 203 | 204 | 205 | 206 | org.jetbrains.kotlin 207 | kotlin-maven-plugin 208 | ${kotlin.version} 209 | 210 | 211 | compile 212 | compile 213 | 214 | compile 215 | 216 | 217 | 218 | ${project.basedir}/src/main/java 219 | 220 | 221 | 222 | 223 | test-compile 224 | test-compile 225 | 226 | test-compile 227 | 228 | 229 | 230 | ${project.basedir}/src/test/java 231 | 232 | 233 | 234 | 235 | 236 | 237 | org.jetbrains.dokka 238 | dokka-maven-plugin 239 | 2.0.0 240 | 241 | 242 | prepare-package 243 | 244 | dokka 245 | javadoc 246 | javadocJar 247 | 248 | 249 | 250 | 251 | 252 | org.apache.maven.plugins 253 | maven-source-plugin 254 | 3.2.1 255 | 256 | 257 | attach-sources 258 | 259 | jar 260 | 261 | 262 | 263 | 264 | 265 | org.apache.maven.plugins 266 | maven-surefire-plugin 267 | 2.22.1 268 | 269 | false 270 | 271 | 272 | 273 | org.apache.maven.plugins 274 | maven-assembly-plugin 275 | 3.3.0 276 | 277 | 278 | package 279 | 280 | single 281 | 282 | 283 | 284 | 285 | 286 | src/assembly/tgz.xml 287 | src/assembly/zip.xml 288 | 289 | 290 | 291 | 292 | 293 | org.apache.maven.plugins 294 | maven-enforcer-plugin 295 | 1.4.1 296 | 297 | 298 | enforce-no-snapshots 299 | 300 | enforce 301 | 302 | 303 | 304 | 305 | Cannot have snapshot dependencies of a release! 
306 | true 307 | true 308 | true 309 | 310 | 311 | true 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | -------------------------------------------------------------------------------- /src/assembly/tgz.xml: -------------------------------------------------------------------------------- 1 | 3 | bin 4 | 5 | tgz 6 | 7 | 8 | 9 | / 10 | ${project.basedir}/src/scripts 11 | 12 | textricator 13 | 14 | 0755 15 | 16 | 17 | ${project.basedir} 18 | 19 | README.* 20 | NOTICE 21 | COPYING 22 | examples/ 23 | 24 | 25 | 26 | 27 | 28 | lib 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/assembly/zip.xml: -------------------------------------------------------------------------------- 1 | 3 | bin 4 | 5 | zip 6 | 7 | 8 | 9 | / 10 | ${project.basedir}/src/scripts 11 | 12 | textricator.bat 13 | 14 | 15 | 16 | ${project.basedir} 17 | 18 | README.* 19 | NOTICE 20 | COPYING 21 | examples/ 22 | 23 | 24 | 25 | 26 | 27 | lib 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/main/java/io/mfj/textricator/Version.kt: -------------------------------------------------------------------------------- 1 | /* 2 | This file is part of Textricator. 3 | Copyright 2018 Measures for Justice Institute. 4 | 5 | This program is free software: you can redistribute it and/or modify it under 6 | the terms of the GNU Affero General Public License version 3 as published by the 7 | Free Software Foundation. 8 | 9 | This program is distributed in the hope that it will be useful, but WITHOUT ANY 10 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 11 | PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. 12 | 13 | You should have received a copy of the GNU Affero General Public License along 14 | with this program. If not, see . 
*/

package io.mfj.textricator

import java.util.*

/**
 * Build metadata for Textricator, read once from the `version.properties`
 * classpath resource located in this class's package.
 *
 * Initialization fails fast with a descriptive message if the resource or one
 * of the expected keys is missing, instead of failing with an anonymous
 * null-pointer error.
 */
object Version {

    /** Resource name, resolved relative to this class's package. */
    private const val RESOURCE = "version.properties"

    /** Value of the `version` property. */
    val version:String

    /** Value of the `copyright.year` property. */
    val copyrightYear:String

    /** Value of the `source.location` property. */
    val sourceLocation:String

    init {
        // getResourceAsStream returns null when the resource is absent; report that explicitly.
        val stream = checkNotNull( Version::class.java.getResourceAsStream( RESOURCE ) ) {
            "Classpath resource \"${RESOURCE}\" not found for ${Version::class.java.name}"
        }
        val props = Properties()
        stream.use { input -> props.load( input ) }

        version = props.required( "version" )
        copyrightYear = props.required( "copyright.year" )
        sourceLocation = props.required( "source.location" )
    }

    /** Look up [key], failing with a descriptive message if it is absent. */
    private fun Properties.required( key:String ):String =
        checkNotNull( getProperty( key ) ) { "Missing property \"${key}\" in ${RESOURCE}" }

}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/cli/TextricatorCli.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
15 | */ 16 | 17 | package io.mfj.textricator.cli 18 | 19 | import io.mfj.textricator.* 20 | import io.mfj.textricator.extractor.TextExtractorFactory 21 | import io.mfj.textricator.extractor.TextExtractorOptions 22 | import io.mfj.textricator.form.config.FormParseConfigUtil 23 | import io.mfj.textricator.table.config.TableParseConfigUtil 24 | import io.mfj.textricator.text.toPageFilter 25 | 26 | import java.io.File 27 | import java.io.InputStream 28 | 29 | import ch.qos.logback.classic.Logger 30 | import ch.qos.logback.classic.Level 31 | 32 | import org.docopt.Docopt 33 | 34 | import org.slf4j.LoggerFactory 35 | import kotlin.system.exitProcess 36 | 37 | /** 38 | * Command-line interface to [Textricator]. 39 | * 40 | * Can just extract text or also run the form or table parser. 41 | */ 42 | object TextricatorCli { 43 | 44 | private val help = """ 45 | Textricator 46 | 47 | Textricator extracts content from PDFs. 48 | 49 | "text" extracts the text from a PDF and outputs to CSV or JSON. 50 | "form" parse a form (using a finite state machine) and generate records. 51 | "table" parses a table and generates records. 52 | 53 | Output is to standard out if not specified. 54 | 55 | Usage: 56 | textricator text [--debug] [--pages=] [--max-row-distance=] [--box-precision=] [--box-ignore-colors=] [--input-format=] [--output-format=] [] 57 | textricator form [--debug] --config= [--pages=] [--input-format=] [--output-format=] [] 58 | textricator forms [--debug] --config= --input-format= [--output-format=] [] 59 | textricator table [--debug] --config= [--pages=] [--input-format=] [--output-format=] [] 60 | textricator -h | --help 61 | textricator --version 62 | 63 | Options: 64 | --config PATH Path to config file. 65 | --pages PAGES Pages to include. E.g.: 1-4,5,9. Default: all pages. 66 | --max-row-distance POINTS Order text boxes within this distance (points) by x-position. E.g.: 0.5. Default: no ordering. 
67 | --box-precision POINTS Consider text inside a box if it overflows by less than this many points (float). 68 | --box-ignore-colors COLORS Ignore boxes of these colors (comma-separated). 69 | --input-format FORMAT Input format. If not set, determine from file extension. 70 | Valid values: ${TextExtractorFactory.extractorNames.sorted().joinToString(", ")} 71 | ("pdf" is an alias for ${Textricator.DEFAULT_PDF_PARSER}) 72 | --output-format FORMAT Output format. If not set, determine from file extension. 73 | Valid values: 74 | ${Textricator.RECORD_OUTPUT_FORMAT_CSV} (default if output to standard out) 75 | ${Textricator.RECORD_OUTPUT_FORMAT_JSON} 76 | ${Textricator.RECORD_OUTPUT_FORMAT_JSON_FLAT} 77 | ${Textricator.RECORD_OUTPUT_FORMAT_XML} 78 | ${Textricator.RECORD_OUTPUT_FORMAT_NULL} (no output) 79 | --debug Enable debug logging 80 | --version Show version, copyright, and license information. 81 | """.trimIndent().trim() 82 | 83 | private fun Map.boolean( key:String ):Boolean = containsKey(key) && get(key) as Boolean 84 | private fun Map.file( key:String ):File? = get(key)?.toString()?.let { File(it) } 85 | private fun Map.string( key:String ):String? = get(key)?.toString() 86 | private fun Map.float( key:String ):Float? = get(key)?.toString()?.toFloat() 87 | 88 | @JvmStatic 89 | fun main(args: Array) { 90 | 91 | // setup logging. This is done in main() (as opposed to with a logback.xml) so applications using Textricator 92 | // as a library are not affected by our logging config. 
93 | System.setProperty("logback.configurationFile", "io/mfj/textricator/logback.xml") 94 | 95 | val opts = Docopt(help) 96 | .withHelp(true) 97 | .withExit(true) 98 | .parse(args.toList()) 99 | 100 | if (opts.boolean("--debug")) { 101 | ( LoggerFactory.getLogger(Logger.ROOT_LOGGER_NAME) as Logger ).level = Level.DEBUG 102 | } 103 | 104 | try { 105 | when { 106 | opts.boolean("--version") -> version() 107 | opts.boolean("text") -> text(opts) 108 | opts.boolean("table") -> table(opts) 109 | opts.boolean("form") -> form(opts) 110 | opts.boolean("forms") -> forms(opts) 111 | } 112 | } catch ( e:SystemExitException) { 113 | System.err.println(e.message) 114 | exitProcess(e.exitCode) 115 | } 116 | } 117 | 118 | private fun version() { 119 | println( 120 | """ 121 | Textricator ${Version.version} 122 | Copyright ${Version.copyrightYear} Measures for Justice Institute. 123 | 124 | Licensed under the GNU Affero General Public License, Version 3. 125 | (Loaded modules may be licensed differently.) 126 | 127 | Source code is available at ${Version.sourceLocation} 128 | """.trimIndent() 129 | ) 130 | } 131 | 132 | private fun text( opts:Map ) { 133 | 134 | val inputFile = opts.file("")!! 
135 | val inputFormat = opts.string("--input-format") ?: inputFile.extension.lowercase() 136 | 137 | val outputFile = opts.file("") 138 | outputFile?.absoluteFile?.parentFile?.mkdirs() 139 | val outputFormat = opts.string("--output-format") ?: 140 | if ( outputFile == null ) { 141 | Textricator.TEXT_OUTPUT_FORMAT_CSV 142 | } else { 143 | outputFile.extension.lowercase() 144 | } 145 | 146 | val pages = opts.string("--pages").toPageFilter() 147 | val maxRowDistance = opts.float("--max-row-distance") ?: 0f 148 | val boxPrecision:Float = opts.float("--box-precision") ?: 0f 149 | val boxIgnoreColors:Set = opts.string("--box-ignore-colors")?.split(",")?.toSet() ?: emptySet() 150 | 151 | val options = TextExtractorOptions( 152 | boxPrecision = boxPrecision, 153 | boxIgnoreColors = boxIgnoreColors ) 154 | 155 | inputFile.inputStream().use { input -> 156 | 157 | ( if ( outputFile != null ) outputFile.outputStream() else System.out ).use { output -> 158 | 159 | Textricator.extractText( 160 | input = input, 161 | inputFormat = inputFormat, 162 | output = output, 163 | outputFormat = outputFormat, 164 | pageFilter = pages, 165 | textExtractorOptions = options, 166 | maxRowDistance = maxRowDistance ) 167 | 168 | } 169 | 170 | } 171 | } 172 | 173 | private fun form( opts:Map ) { 174 | 175 | val inputFile = opts.file("")!! 176 | val inputFormat = opts.string("--input-format") ?: inputFile.extension.lowercase() 177 | 178 | val outputFile = opts.file("") 179 | outputFile?.absoluteFile?.parentFile?.mkdirs() 180 | val outputFormat = opts.string("--output-format") ?: 181 | if ( outputFile == null ) { 182 | throw SystemExitException( "--output-format is required if is omitted.", 1 ) 183 | } else { 184 | outputFile.extension.lowercase() 185 | } 186 | 187 | val configFile = opts.file("--config")!! 
188 | 189 | val config = FormParseConfigUtil.parseYaml(configFile) 190 | 191 | opts.string("--pages")?.apply { config.pages = this } 192 | 193 | inputFile.inputStream().use { input -> 194 | 195 | ( if ( outputFile != null ) outputFile.outputStream() else System.out ).use { output -> 196 | 197 | Textricator.parseForm( 198 | input = input, 199 | inputFormat = inputFormat, 200 | output = output, 201 | outputFormat = outputFormat, 202 | config = config ) 203 | 204 | } 205 | 206 | } 207 | 208 | 209 | } 210 | 211 | private fun forms( opts:Map ) { 212 | 213 | val inputDir = opts.file("")!! 214 | 215 | val inputFormat = opts.string("--input-format")!! 216 | val inputFormatUpper = inputFormat.uppercase() 217 | 218 | val outputFile = opts.file("") 219 | outputFile?.absoluteFile?.parentFile?.mkdirs() 220 | val outputFormat = opts.string("--output-format") ?: 221 | if ( outputFile == null ) { 222 | throw SystemExitException( "--output-format is required if is omitted.", 1 ) 223 | } else { 224 | outputFile.extension.lowercase() 225 | } 226 | 227 | val configFile = opts.file("--config")!! 228 | 229 | val config = FormParseConfigUtil.parseYaml(configFile) 230 | 231 | val inputs:SequenceInputStream,String>> = inputDir.walk().asSequence() 232 | .filter { file -> file.isFile } 233 | .filter { file -> file.extension.uppercase() == inputFormatUpper } 234 | .sorted() 235 | .map { inputFile -> 236 | Triple( inputFile.relativeTo(inputDir).path, { inputFile.inputStream() }, inputFormat ) 237 | } 238 | 239 | ( outputFile?.outputStream() ?: System.out ).use { output -> 240 | Textricator.parseForms( 241 | inputs = inputs, 242 | output = output, outputFormat = outputFormat, 243 | config = config 244 | ) 245 | } 246 | } 247 | 248 | private fun table( opts:Map ) { 249 | 250 | val inputFile = opts.file("")!! 
251 | val inputFormat = opts.string("--input-format") ?: inputFile.extension.lowercase() 252 | 253 | val outputFile = opts.file("") 254 | outputFile?.absoluteFile?.parentFile?.mkdirs() 255 | val outputFormat = opts.string("--output-format") ?: 256 | if ( outputFile == null ) { 257 | throw SystemExitException( "--output-format is required if is omitted.", 1 ) 258 | } else { 259 | outputFile.extension.lowercase() 260 | } 261 | 262 | val configFile = opts.file("--config")!! 263 | 264 | val config = TableParseConfigUtil.parseYaml(configFile) 265 | 266 | opts.string("--pages")?.apply { config.pages = this } 267 | 268 | inputFile.inputStream().use { input -> 269 | 270 | ( if ( outputFile != null ) outputFile.outputStream() else System.out ).use { output -> 271 | 272 | Textricator.parseTable( 273 | input = input, 274 | inputFormat = inputFormat, 275 | output = output, 276 | outputFormat = outputFormat, 277 | config = config ) 278 | 279 | } 280 | 281 | } 282 | 283 | } 284 | 285 | private class SystemExitException(message:String,val exitCode:Int): Exception(message) 286 | 287 | } 288 | -------------------------------------------------------------------------------- /src/main/java/io/mfj/textricator/extractor/TextExtractor.kt: -------------------------------------------------------------------------------- 1 | /* 2 | This file is part of Textricator. 3 | Copyright 2018 Measures for Justice Institute. 4 | 5 | This program is free software: you can redistribute it and/or modify it under 6 | the terms of the GNU Affero General Public License version 3 as published by the 7 | Free Software Foundation. 8 | 9 | This program is distributed in the hope that it will be useful, but WITHOUT ANY 10 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 11 | PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. 12 | 13 | You should have received a copy of the GNU Affero General Public License along 14 | with this program. If not, see . 
*/

package io.mfj.textricator.extractor

import io.mfj.textricator.text.Text

/**
 * Page-oriented source of [Text] blocks.
 *
 * Obtain an instance (see [TextExtractorFactory]), call [extract] for each
 * page, and [close] the extractor when done.
 */
interface TextExtractor:AutoCloseable {

    /**
     * Get the number of pages.
     */
    fun getPageCount():Int

    /**
     * Extract the text of a single page.
     *
     * @param pageNumber Page to extract text from
     * @return the text blocks extracted from that page
     */
    fun extract(pageNumber:Int):List<Text>

}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/extractor/TextExtractorFactory.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.extractor

import java.io.InputStream
import java.util.*

/**
 * Creates [TextExtractor]s for one input format.
 *
 * The companion object is a registry that maps format names to factory
 * classes; see [classMap] and [getFactory].
 */
interface TextExtractorFactory {

    /**
     * Create an extractor reading from [input].
     *
     * @param input stream to extract text from
     * @param options extractor options
     */
    fun create( input:InputStream, options:TextExtractorOptions):TextExtractor

    companion object {

        /** Classpath location of the properties files whose entries are name=fqcn. */
        private const val FACTORY_PROPERTIES = "io/mfj/textricator/extractor/textExtractor.properties"

        /**
         * Map of name->factory class, merged from every [FACTORY_PROPERTIES]
         * resource returned by this interface's class loader.
         *
         * Built on first access. If the same name appears more than once, the
         * entry read last wins (plain [toMap] semantics).
         */
        val classMap:Map<String,Class<out TextExtractorFactory>> by lazy {
            TextExtractorFactory::class.java.classLoader
                .getResources( FACTORY_PROPERTIES )
                .asSequence()
                .flatMap { url ->
                    val props = Properties()
                    url.openStream().use { stream -> props.load( stream ) }
                    props.entries.map { (key,value) ->
                        Pair( key as String, getFactoryClassFromFqcn( value as String ) )
                    }
                }
                .toMap()
        }

        /** Names of all registered extractors (the keys of [classMap]). */
        val extractorNames:Collection<String> by lazy { classMap.keys }

        /**
         * Get the factory class for the specified FQCN.
         *
         * @throws Exception if the class does not implement [TextExtractorFactory]
         */
        private fun getFactoryClassFromFqcn( fqcn:String ): Class<out TextExtractorFactory> {
            val loaded = Class.forName( fqcn )
            if ( !TextExtractorFactory::class.java.isAssignableFrom( loaded ) ) {
                throw Exception( "\"${fqcn}\" does not implement \"${TextExtractorFactory::class.java.name}\"" )
            }
            // Safe: assignability was verified just above.
            @Suppress("UNCHECKED_CAST")
            return loaded as Class<out TextExtractorFactory>
        }

        /**
         * Instantiate the factory registered under [name] via its no-argument constructor.
         *
         * @throws Exception if no factory is registered under [name]
         */
        fun getFactory( name:String ):TextExtractorFactory {
            val registered = classMap[name] ?: throw Exception( "No factory \"${name}\"" )
            return registered.getDeclaredConstructor().newInstance()
        }

    }

}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/extractor/TextExtractorOptions.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.extractor

/**
 * Mutable bag of settings handed to text extractors.
 *
 * NOTE(review): the meanings below are taken from the CLI option names that
 * feed these fields; confirm against the extractors that consume them.
 *
 * @property boxPrecision presumably the overflow tolerance (points) for treating text as inside a box
 * @property boxIgnoreColors presumably the box colors to ignore
 * @property maxRowDistance presumably the distance (points) within which text is ordered by x-position
 * @property extractor presumably the extractor name to use, or null - TODO confirm
 * @property pages presumably a page selection such as "1-4,5,9", or null - TODO confirm
 */
open class TextExtractorOptions(
    var boxPrecision:Float = 0f,
    var boxIgnoreColors:Set<String> = emptySet(),
    var maxRowDistance:Float = 0f,
    var extractor:String? = null,
    var pages:String? = null
)

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/extractor/csv/CsvTextExtractor.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.extractor.csv

import io.mfj.textricator.extractor.TextExtractor
import io.mfj.textricator.text.Text
import io.mfj.textricator.text.output.CsvTextOutput

import java.io.InputStream

import org.apache.commons.csv.CSVParser
import org.apache.commons.csv.CSVRecord

/**
 * Reads back the CSV produced by [CsvTextOutput].
 *
 * The entire input is parsed in the constructor; the parser (and with it the
 * reader wrapping [input]) is closed before construction completes.
 */
class CsvTextExtractor(input:InputStream):TextExtractor {

    // Everything is loaded up front and bucketed by page number, so rows do
    // not have to be in page order in the CSV.
    private val pages:MutableMap<Int,MutableList<Text>> =
        CSVParser( input.bufferedReader(), CsvTextOutput.CSV_FORMAT ).use { parser ->
            parser.asSequence()
                .drop( 1 ) // header
                .map { record -> parseRec( record ) }
                .groupByTo( mutableMapOf<Int,MutableList<Text>>() ) { text -> text.pageNumber }
        }

    // Highest page number seen, or 0 when the CSV had no data rows.
    private val pageCount = pages.keys.maxOrNull() ?: 0

    /**
     * Build a [Text] from one CSV row.
     *
     * Column order matches CsvTextOutput.write(Text). Columns 5 (width) and
     * 6 (height) are skipped because they are derived from the corner
     * coordinates.
     */
    private fun parseRec(rec:CSVRecord): Text {
        val page = rec[0].toInt()
        val upperLeftX = rec[1].toFloat()
        val upperLeftY = rec[2].toFloat()
        val lowerRightX = rec[3].toFloat()
        val lowerRightY = rec[4].toFloat()
        return Text(
            pageNumber = page,
            ulx = upperLeftX,
            uly = upperLeftY,
            lrx = lowerRightX,
            lry = lowerRightY,
            content = rec[7],
            font = rec[8],
            fontSize = rec[9].toFloat(),
            color = rec[10],
            backgroundColor = rec[11],
            link = rec[12]
        )
    }

    /** Returns the highest page number present in the CSV (0 if there were no rows). */
    override fun getPageCount():Int {
        return pageCount
    }

    /** Returns the text loaded for [pageNumber], or an empty list if that page had no rows. */
    override fun extract(pageNumber:Int):List<Text> {
        val onPage = pages[pageNumber]
        return if ( onPage != null ) onPage else emptyList()
    }

    /** Nothing to release: the input was fully consumed and closed during construction. */
    override fun close() {}

}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/extractor/csv/CvsTextExtractorFactory.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.extractor.csv

import io.mfj.textricator.extractor.TextExtractor
import io.mfj.textricator.extractor.TextExtractorFactory
import io.mfj.textricator.extractor.TextExtractorOptions

import java.io.InputStream

/**
 * [TextExtractorFactory] that produces [CsvTextExtractor]s.
 */
class CsvTextExtractorFactory: TextExtractorFactory {

    /** Create a [CsvTextExtractor] over [input]; [options] is not used. */
    override fun create(input:InputStream, options:TextExtractorOptions):TextExtractor {
        return CsvTextExtractor(input)
    }

}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/extractor/itext5/Box.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.extractor.itext5

import java.beans.Transient

/**
 * A box drawn on the page.
 *
 * NOTE(review): the coordinate names suggest upper-left (ulx, uly) and
 * lower-right (lrx, lry) corners - confirm against the code that builds boxes.
 */
internal data class Box(
    val ulx:Float,
    val uly:Float,
    val lrx:Float,
    val lry:Float,
    val color:String?
) {

    /** Horizontal extent, computed as lrx - ulx; the getter is marked [Transient]. */
    @get:Transient
    val width:Float
        get() = lrx - ulx

    /** Vertical extent, computed as lry - uly; the getter is marked [Transient]. */
    @get:Transient
    val height:Float
        get() = lry - uly

}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/extractor/itext5/Boxtricator.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.extractor.itext5

import com.itextpdf.text.BaseColor

import com.itextpdf.text.pdf.*
import com.itextpdf.text.pdf.parser.*
import io.mfj.textricator.extractor.itext5.Itext5TextExtractor.Companion.color

import org.slf4j.LoggerFactory

// adapted from
// https://github.com/mkl-public/testarea-itext5/blob/master/src/test/java/mkl/testarea/itext5/extract/ExtractPaths.java
/**
 * Collects the filled rectangles drawn on a page of [reader] as [Box]es.
 *
 * @param reader PDF to inspect.
 * @param ignoreBoxColors Fill colors (as formatted by [color]) whose paths are skipped.
 */
internal class Boxtricator(private val reader:PdfReader,
    private val ignoreBoxColors:Set<String> ) {

  companion object {
    private val log = LoggerFactory.getLogger( Boxtricator::class.java )
  }

  /**
   * Get the filled rectangles on a page.
   *
   * @param pageNumber Page to process.
   * @param pageHeight Height of the page, used to convert y values to "from the top".
   * @return The boxes found, in the order they were painted.
   */
  fun getBoxes( pageNumber:Int, pageHeight:Float):List<Box> {

    val boxes:MutableList<Box> = mutableListOf()

    val parser = PdfReaderContentParser(reader)
    parser.processContent(pageNumber,
        MyExtRenderListener(boxes, pageHeight, ignoreBoxColors))
    return boxes.ifEmpty { emptyList() }
  }

  /**
   * Render listener that buffers path-construction events and, when a path is painted with a
   * fill, turns each RECT segment of that path into a [Box] appended to [boxes].
   */
  private class MyExtRenderListener(val boxes:MutableList<Box>, val pageHeight:Float,
      val ignoreColors:Set<String> ): ExtRenderListener {
    // construction events seen since the last renderPath() call
    private val pathInfos:MutableList<PathConstructionRenderInfo?> = mutableListOf()
    override fun beginTextBlock() {}
    override fun renderText(renderInfo:TextRenderInfo?) {}
    override fun endTextBlock() {}
    override fun renderImage(renderInfo:ImageRenderInfo?) {}
    override fun modifyPath(renderInfo:PathConstructionRenderInfo?) {
      pathInfos.add(renderInfo)
    }
    override fun renderPath(renderInfo:PathPaintingRenderInfo):Path? {
      val graphicsState:GraphicsState = renderInfo.gs

      val ctm = graphicsState.ctm

      val fill = (renderInfo.operation and PathPaintingRenderInfo.FILL) != 0

      if ( fill ) {
        // format the fill color once; it is used for both the ignore check and the Box
        val fillHex = color(graphicsState.fillColor)

        if ( ! ignoreColors.contains( fillHex ) ) {

          log.debug("\tthe path:")

          pathInfos.forEach { pathConstructionRenderInfo ->
            when ( pathConstructionRenderInfo?.operation ) {
              PathConstructionRenderInfo.MOVETO -> {
                log.debug("move to {} ", transform(ctm, pathConstructionRenderInfo.segmentData))
              }
              PathConstructionRenderInfo.CLOSE -> {
                log.debug("\tclose {} ", transform(ctm, pathConstructionRenderInfo.segmentData))
              }
              PathConstructionRenderInfo.CURVE_123 -> {
                log.debug("\tcurve123 {} ", transform(ctm, pathConstructionRenderInfo.segmentData))
              }
              PathConstructionRenderInfo.CURVE_13 -> {
                log.debug("\tcurve13 {} ", transform(ctm, pathConstructionRenderInfo.segmentData))
              }
              PathConstructionRenderInfo.CURVE_23 -> {
                log.debug("\tcurve23 {} ", transform(ctm, pathConstructionRenderInfo.segmentData))
              }
              PathConstructionRenderInfo.LINETO -> {
                log.debug("\tline to {} ", transform(ctm, pathConstructionRenderInfo.segmentData))
              }
              PathConstructionRenderInfo.RECT -> {
                val matrix = transform(ctm, expandRectangleCoordinates(pathConstructionRenderInfo.segmentData))
                if ( matrix == null || matrix.size < 8 ) {
                  // expandRectangleCoordinates() yields an empty list for a segment with fewer
                  // than 4 values; skip it instead of failing on the index lookups below.
                  log.debug("rectangle {} ", matrix)
                } else {
                  log.debug("rectangle {} ", matrix)

                  val box = Box(ulx = matrix[0 /* or 6 */], uly = calcY(matrix[5 /* or 7 */]),
                      lrx = matrix[4 /* or 2 */], lry = calcY(matrix[1 /* or 3 */]), color = fillHex)

                  log.debug( "box: $box" )
                  boxes.add( box )
                }
              }
              /*
              else -> {
                throw Exception( "\t\tunhandled ${pathConstructionRenderInfo?.operation}" )
              }
              */
            }
          }
        }
      }

      // the buffered construction events belong to the path that was just painted
      pathInfos.clear()
      return null
    }

    override fun clipPath(rule:Int) {}

    /**
     * Apply [ctm] to a flat list of x,y pairs and return the transformed x,y pairs.
     * A trailing unpaired value is ignored. Returns null when [coordinates] is null.
     */
    private fun transform(ctm:Matrix, coordinates:List<Float>? ):List<Float>? {
      if ( coordinates == null ) return null
      val result:MutableList<Float> = mutableListOf()
      var i = 0
      while ( i < coordinates.size-1) {
        var vector = Vector(coordinates[i], coordinates[i + 1], 1f)
        vector = vector.cross(ctm)
        result.add(vector.get(Vector.I1))
        result.add(vector.get(Vector.I2))
        i+=2
      }
      return result
    }

    /**
     * Expand an (x, y, width, height) rectangle into its four corners as a flat list of
     * eight values. Returns an empty list when fewer than four values are supplied.
     */
    private fun expandRectangleCoordinates(rectangle:List<Float> ):List<Float> {
      if (rectangle.size < 4) return emptyList()

      return listOf(
          rectangle[0], // x (left)
          rectangle[1], // y (bottom) (from bottom)
          rectangle[0] + rectangle[2], // x (right)
          rectangle[1], // y (bottom) (from bottom)
          rectangle[0] + rectangle[2], // x (right)
          rectangle[1] + rectangle[3], // y (top) (from bottom)
          rectangle[0], // x (left)
          rectangle[1] + rectangle[3] // y (top) (from bottom)
      )
    }

    private fun BaseColor?.format(): String = if (this == null) "DEFAULT" else "${red},${green},${blue}"

    // in iText, if y is positive, it is from the bottom, if y is negative, it is from the top.
    // We calculate from the top.
    fun calcY(y:Float):Float {
      return if (y >= 0) {
        pageHeight - y
      } else {
        y * -1
      }
    }
  }

}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/extractor/itext5/Buffer.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.
This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.extractor.itext5

/**
 * Buffer for accumulating values when parsing.
 *
 * [lrx] and [lry] are mutable so they can be moved as more text is appended to [content];
 * all other values are fixed when the buffer is created.
 */
internal data class Buffer (
    val pageNumber:Int,
    val ulx:Float,
    val uly:Float,
    var lrx:Float,
    var lry:Float,
    val font:String,
    val fontSize:Float,
    val fontColor:String?,
    // accumulated text; starts empty unless a buffer is supplied
    val content:StringBuffer = StringBuffer()
)

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/extractor/itext5/Itext5TextExtractor.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.extractor.itext5

import io.mfj.textricator.extractor.TextExtractor
import io.mfj.textricator.text.Text

import com.itextpdf.text.BaseColor
import com.itextpdf.text.pdf.*
import com.itextpdf.text.pdf.parser.ContentOperator
import com.itextpdf.text.pdf.parser.FilteredTextRenderListener
import com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy
import com.itextpdf.text.pdf.parser.PdfContentStreamProcessor
import com.itextpdf.text.pdf.parser.PdfTextExtractor
import com.itextpdf.text.pdf.parser.Vector

import java.io.InputStream

import kotlin.math.min
import kotlin.math.max

import org.slf4j.LoggerFactory

/**
 * Class to extract text from a PDF.
 *
 * Create an instance and call [extract] for each page.
 *
 * @constructor Create an instance for the supplied PDF.
 * @param input PDF content.
 * @param boxPrecision Amount a text segment is shrunk by before testing whether it lies inside
 *   a box; null is treated as 0.
 * @param boxIgnoreColors Box fill colors to ignore; null is treated as an empty set.
 */
class Itext5TextExtractor(input:InputStream, boxPrecision:Float?, boxIgnoreColors:Set<String>? ):
    TextExtractor {

  private val reader = PdfReader(input)

  private val boxPrecision = boxPrecision?: 0f

  private val boxtricator = Boxtricator(reader, boxIgnoreColors ?: emptySet())

  /** Number of pages in the PDF */
  private val pageCount:Int = reader.numberOfPages
  private val pageRange = (1..pageCount)

  override fun getPageCount():Int {
    return pageCount
  }

  companion object {
    private val log = LoggerFactory.getLogger(Itext5TextExtractor::class.java)

    /** Format a color as `#rrggbb`, or `"default"` when [baseColor] is null. */
    internal fun color( baseColor:BaseColor? ): String {
      if ( baseColor == null ) return "default"
      return "#%02x%02x%02x".format( baseColor.red, baseColor.green, baseColor.blue )
    }

    /** A URI link and its rectangle on the page (y values measured from the top). */
    data class Link( val url:String, val lrx:Float, val lry:Float, val ulx:Float, val uly:Float)
  }

  override fun close() {
    reader.close()
  }

  /**
   * Get the size, in points, of the specified page, as a width/height pair.
   *
   * @param pageNumber Page number
   */
  private fun getPageSize(pageNumber:Int):Size {
    val rect = reader.getPageSize(pageNumber)
    return Size(rect.width, rect.height)
  }

  /**
   * Extract the text segments of one page.
   *
   * @param pageNumber Page to extract text from
   * @return The segments found, sorted by upper-left y and then upper-left x.
   * @throws IllegalArgumentException if [pageNumber] is not a page of this PDF.
   */
  override fun extract(pageNumber:Int):List<Text> {

    if ( ! pageRange.contains( pageNumber ) ) {
      throw IllegalArgumentException( "Invalid page number: $pageNumber. Valid pages are $pageRange" )
    }

    val pageHeight = getPageSize(pageNumber).height

    // in iText, if y is positive, it is from the bottom, if y is negative, it is from the top.
    // We calculate from the top.
    fun calcY(y:Float):Float {
      return if (y >= 0) {
        pageHeight - y
      } else {
        y * -1
      }
    }

    val boxes:List<Box> = boxtricator.getBoxes(pageNumber,pageHeight)

    val links = reader.getLinks(pageNumber)
        .mapNotNull { annotation ->
          // Found a file where /A is a com.itextpdf.text.pdf.PRIndirectReference.
          // Did not need to get links from that document, so did not figure out what to do;
          // such links are skipped.
          val action = annotation.parameters[PdfName.A] as? PdfDictionary
          // Actions without a /URI string (e.g. GoTo) are not URL links; skip them rather
          // than failing on the cast.
          val uri = ( action?.get(PdfName.URI) as? PdfString )?.toUnicodeString()
          if ( uri == null ) {
            null
          } else {
            val rect = annotation.rect.map { o -> ( o as PdfNumber ).floatValue() }
            Link(
                url = uri,
                ulx = rect[0],
                uly = calcY(rect[3]),
                lrx = rect[2],
                lry = calcY(rect[1])
            )
          }
        }

    // Convert a finished buffer to a Text, attaching the first link that contains it and the
    // background color of the box it lies in.
    fun Buffer.toText():Text {
      val trimmed = content.toString().trim()

      val link = links
          .firstOrNull { link ->
            ulx >= link.ulx
                && uly >= link.uly
                && lrx <= link.lrx
                && lry <= link.lry
          }
          ?.url

      return Text(
          content = trimmed,
          pageNumber = pageNumber,
          ulx = ulx,
          uly = uly,
          lrx = lrx,
          lry = lry,
          font = font,
          fontSize = fontSize,
          color= fontColor,
          backgroundColor = getBackground(boxes, this, trimmed),
          link = link )
    }

    val texts:MutableList<Text> = mutableListOf()

    val strategy = FilteredTextRenderListener(LocationTextExtractionStrategy())

    var buffer:Buffer? = null

    // start a new text segment
    fun start(x:Float, y:Float, font:String, fontSize:Float, fontColor:String?) {
      if (buffer != null) throw Exception("Forgot to call flush(). Text: $buffer")
      buffer = Buffer(pageNumber, x, y, x, y, font, fontSize, fontColor)
    }

    // append text to an existing segment
    // font and fontSize are ignored unless somebody forgot to call start()
    fun append(x:Float, y:Float, text:String, font:String, fontSize:Float, fontColor:String? ) {
      if ( buffer == null ) {
        // ' or " without Tj - I have seen this with some PDFs modified with qoppa's PDFStudio.
        log.warn("Forgot to call start()")
        start(x,y,font,fontSize,fontColor)
      }
      buffer!!.content.append(text)
      buffer!!.lrx = x
      buffer!!.lry = y
    }

    // Move the buffered segment, if any, into the result list and clear the buffer.
    fun flush() {
      if (buffer != null) {
        texts.add( buffer!!.toText() )
        buffer = null
      }
    }

    // Define operators to capture text from the content stream

    // capture strings - start a new segment
    val stringOperator = ContentOperator { processor:PdfContentStreamProcessor, operator:PdfLiteral?, operands:ArrayList<PdfObject> ->
      val string = operands[0] as PdfString
      val matrix = processor.textMatrix
      val x = matrix[6]
      val y = calcY( matrix[7] )
      val bytes = string.bytes
      val gs = processor.gs()
      val text = gs.font.decode(bytes, 0, bytes.size)
      val fontSize = getFontSize(processor)
      val width = gs.font.getWidthPoint(text, fontSize)
      val fontColor = color(gs.fillColor)

      flush()

      log.debug("{${pageNumber}} string [ ${x}, $y ] $text")
      start(x, y, gs.font.postscriptFontName, fontSize, fontColor)
      append(x + width, y, text, gs.font.postscriptFontName, fontSize, fontColor )
    }

    // capture continuation of previous string - append to existing segment
    val stringOperatorContinue = ContentOperator { processor:PdfContentStreamProcessor, operator:PdfLiteral?, operands:ArrayList<PdfObject> ->
      val string = operands[0] as PdfString
      val matrix = processor.textMatrix
      val x = matrix[6]
      val y = calcY( matrix[7] )
      val bytes = string.bytes
      val gs = processor.gs()
      val text = gs.font.decode(bytes, 0, bytes.size)
      val fontSize = getFontSize(processor)
      val width = gs.font.getWidthPoint(text, fontSize)
      val fontColor = color(gs.fillColor)

      // continuation from stringOperator
      log.debug("{${pageNumber}} continue [ ${x}, $y ] $text")
      append(x + width, y, " $text", gs.font.postscriptFontName, fontSize, fontColor)
    }

    // capture pdfarray - this is all one segment
    val arrayOperator = ContentOperator { processor:PdfContentStreamProcessor, operator:PdfLiteral?, operands:ArrayList<PdfObject> ->
      val array = operands[0] as PdfArray
      val matrix = processor.textMatrix
      val x = matrix[6]
      val y = calcY( matrix[7] )
      val gs = processor.gs()
      val fontSize = getFontSize(processor)

      // combine all PdfStrings in the PdfArray to one String
      val text = array.asSequence().filter { it is PdfString }.map { it as PdfString }.map { pdfString ->
        // PdfString.toUnicodeString() does not work in all cases.
        val bytes = pdfString.bytes
        gs.font.decode(bytes, 0, bytes.size)
      }.joinToString(separator = "")
      val width = gs.font.getWidthPoint(text, fontSize)
      val fontColor = color(gs.fillColor)

      flush()

      log.debug("{${pageNumber}} array [ $x , $y ] $text")
      start(x, y, gs.font.postscriptFontName, fontSize, fontColor)
      append(x + width, y, text, gs.font.postscriptFontName, fontSize, fontColor)
      flush()
    }

    // This is setting up what PdfContentStreamProcesses does internally.
    val twOperator = SetTextWordSpacing()
    val tcOperator = SetTextCharacterSpacing()
    val tdOperator = TextMoveStartNextLine()
    val tstarOperator = TextMoveNextLine(tdOperator)
    val tickOperator = MoveNextLineAndShowText(tstarOperator, stringOperatorContinue)
    val quoteOperator = MoveNextLineAndShowTextWithSpacing(twOperator, tcOperator,
        tickOperator)

    // This results in calls to the content operators defined above. We do not care about its return value.
    PdfTextExtractor.getTextFromPage(reader, pageNumber, strategy,
        mapOf("Tj" to stringOperator, "TJ" to arrayOperator, "'" to tickOperator, "\"" to quoteOperator))

    // emit whatever segment is still buffered at the end of the page
    flush()

    return texts
        .sortedWith( compareBy( {it.uly}, {it.ulx} ) )
  }

  /**
   * Color of the last box in [all] that contains [buffer], or null when there is none.
   * [content] is not used.
   */
  private fun getBackground(all:List<Box>?,buffer:Buffer,content:String): String? {
    if ( all == null ) return null
    val boxes = all.filter { box -> isBufferInBox( buffer, box ) }
    val box:Box? = boxes.lastOrNull()
    return box?.color
  }

  private fun isBufferInBox( buffer:Buffer, box:Box): Boolean {
    // shrink the buffer by the precision, but do not shrink width or height to negative.
    return box.ulx <= min( buffer.ulx + boxPrecision, buffer.lrx )
        && box.uly <= min( buffer.uly + boxPrecision, buffer.lry )
        && box.lrx >= max( buffer.lrx - boxPrecision, buffer.ulx )
        && box.lry >= max( buffer.lry - boxPrecision, buffer.uly )
  }

  /** Font size from the graphics state, transformed by the text matrix and the CTM. */
  private fun getFontSize( processor:PdfContentStreamProcessor ):Float {
    val gs = processor.gs()
    val effMatrix = processor.textMatrix.multiply( gs.ctm )
    val fontSize = Vector(0f,gs.fontSize,0f).cross(effMatrix)[1]
    return fontSize
  }

}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/extractor/itext5/Itext5TextExtractorFactory.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.extractor.itext5

import io.mfj.textricator.extractor.TextExtractor
import io.mfj.textricator.extractor.TextExtractorFactory
import io.mfj.textricator.extractor.TextExtractorOptions

import java.io.InputStream

/** [TextExtractorFactory] that builds an [Itext5TextExtractor]. */
class Itext5TextExtractorFactory:TextExtractorFactory {

  /**
   * Create an [Itext5TextExtractor] for [input], passing through the box precision and
   * ignored box colors from [options].
   */
  override fun create(input:InputStream, options:TextExtractorOptions):TextExtractor {
    return Itext5TextExtractor( input, options.boxPrecision, options.boxIgnoreColors )
  }

}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/extractor/itext5/Shenanigans.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.extractor.itext5

import com.itextpdf.text.pdf.*
import com.itextpdf.text.pdf.parser.*

// These are copies of a bunch of stuff that is private in iText's PdfContentStreamProcessor and GraphicsState
// DO NOT USE THIS!!! (except in Itext5TextExtractor.kt)

/**
 * A content operator implementation (Td).
27 | */ 28 | class TextMoveStartNextLine:ContentOperator { 29 | override fun invoke(processor:PdfContentStreamProcessor, operator:PdfLiteral?, operands:ArrayList) { 30 | val tx = (operands[0] as PdfNumber).floatValue() 31 | val ty = (operands[1] as PdfNumber).floatValue() 32 | 33 | val translationMatrix = Matrix(tx, ty) 34 | processor.textMatrix = translationMatrix.multiply(processor.textLineMatrix) 35 | processor.textLineMatrix = processor.textMatrix 36 | } 37 | 38 | } 39 | 40 | /** 41 | * A content operator implementation (T*). 42 | */ 43 | class TextMoveNextLine(private val moveStartNextLine:TextMoveStartNextLine):ContentOperator { 44 | override fun invoke(processor:PdfContentStreamProcessor, operator:PdfLiteral?, operands:ArrayList) { 45 | val tdoperands:ArrayList = ArrayList(2) 46 | tdoperands.add(0, PdfNumber(0)) 47 | tdoperands.add(1, PdfNumber(-processor.gs().leading)) 48 | moveStartNextLine.invoke(processor, null, tdoperands) 49 | } 50 | } 51 | 52 | /** 53 | * A content operator implementation ('). 54 | */ 55 | class MoveNextLineAndShowText(private val textMoveNextLine:TextMoveNextLine, private val showText:ContentOperator): 56 | ContentOperator { 57 | override fun invoke(processor:PdfContentStreamProcessor, operator:PdfLiteral?, operands:ArrayList) { 58 | textMoveNextLine.invoke(processor, null, ArrayList(0)) 59 | showText.invoke(processor, null, operands) 60 | } 61 | } 62 | 63 | /** 64 | * A content operator implementation (Tw). 65 | */ 66 | class SetTextWordSpacing:ContentOperator { 67 | override fun invoke(processor:PdfContentStreamProcessor, operator:PdfLiteral?, operands:ArrayList) { 68 | val wordSpace = operands[0] as PdfNumber 69 | processor.gs().setWordSpacing(wordSpace.floatValue()) 70 | } 71 | } 72 | 73 | /** 74 | * A content operator implementation (Tc). 
75 | */ 76 | class SetTextCharacterSpacing:ContentOperator { 77 | override fun invoke(processor:PdfContentStreamProcessor, operator:PdfLiteral?, operands:ArrayList) { 78 | val charSpace = operands[0] as PdfNumber 79 | processor.gs().setCharacterSpacing(charSpace.floatValue()) 80 | } 81 | } 82 | 83 | /** 84 | * A content operator implementation ("). 85 | */ 86 | class MoveNextLineAndShowTextWithSpacing(private val setTextWordSpacing:SetTextWordSpacing, 87 | private val setTextCharacterSpacing:SetTextCharacterSpacing, 88 | private val moveNextLineAndShowText:MoveNextLineAndShowText):ContentOperator { 89 | 90 | override fun invoke(processor:PdfContentStreamProcessor, operator:PdfLiteral?, operands:ArrayList) { 91 | 92 | val aw:PdfNumber = operands[0] as PdfNumber 93 | val ac:PdfNumber = operands[1] as PdfNumber 94 | val string:PdfString = operands[2] as PdfString 95 | 96 | val twOperands:ArrayList = ArrayList(1) 97 | twOperands.add(0, aw) 98 | setTextWordSpacing.invoke(processor, null, twOperands) 99 | 100 | val tcOperands:ArrayList = ArrayList(1) 101 | tcOperands.add(0, ac) 102 | setTextCharacterSpacing.invoke(processor, null, tcOperands) 103 | 104 | val tickOperands:ArrayList = ArrayList(1) 105 | tickOperands.add(0, string) 106 | moveNextLineAndShowText.invoke(processor, null, tickOperands) 107 | } 108 | } 109 | 110 | /** 111 | * private/protected things we need to access 112 | */ 113 | var PdfContentStreamProcessor.textMatrix:Matrix 114 | get() = javaClass.getDeclaredField("textMatrix").let { 115 | it.isAccessible = true 116 | it.get(this) as Matrix 117 | } 118 | set(textMatrix) = javaClass.getDeclaredField("textMatrix").let { 119 | it.isAccessible = true 120 | it.set(this, textMatrix) 121 | } 122 | 123 | /// accessors for private itext stuff 124 | 125 | var PdfContentStreamProcessor.textLineMatrix:Matrix 126 | get() = javaClass.getDeclaredField("textLineMatrix").let { 127 | it.isAccessible = true 128 | it.get(this) as Matrix 129 | } 130 | set(textLineMatrix) = 
javaClass.getDeclaredField("textLineMatrix").let { 131 | it.isAccessible = true 132 | it.set(this, textLineMatrix) 133 | } 134 | 135 | fun GraphicsState.setWordSpacing(wordSpacing:Float) { 136 | javaClass.getDeclaredField("wordSpacing").let { 137 | it.isAccessible = true 138 | it.set(this, wordSpacing) 139 | } 140 | } 141 | 142 | fun GraphicsState.setCharacterSpacing(wordSpacing:Float) { 143 | javaClass.getDeclaredField("characterSpacing").let { 144 | it.isAccessible = true 145 | it.set(this, wordSpacing) 146 | } 147 | } 148 | 149 | val PathPaintingRenderInfo.gs:GraphicsState 150 | get() = javaClass.getDeclaredField("gs").let { 151 | it.isAccessible = true 152 | it.get(this) as GraphicsState 153 | } 154 | -------------------------------------------------------------------------------- /src/main/java/io/mfj/textricator/extractor/itext5/Size.kt: -------------------------------------------------------------------------------- 1 | /* 2 | This file is part of Textricator. 3 | Copyright 2018 Measures for Justice Institute. 4 | 5 | This program is free software: you can redistribute it and/or modify it under 6 | the terms of the GNU Affero General Public License version 3 as published by the 7 | Free Software Foundation. 8 | 9 | This program is distributed in the hope that it will be useful, but WITHOUT ANY 10 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 11 | PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. 12 | 13 | You should have received a copy of the GNU Affero General Public License along 14 | with this program. If not, see . 15 | */ 16 | 17 | package io.mfj.textricator.extractor.itext5 18 | 19 | /** Width and height, in points. 
*/ 20 | data class Size(val width:Float, val height:Float) 21 | -------------------------------------------------------------------------------- /src/main/java/io/mfj/textricator/extractor/itext7/Itext7TextExtractor.kt: -------------------------------------------------------------------------------- 1 | /* 2 | This file is part of Textricator. 3 | Copyright 2018 Measures for Justice Institute. 4 | 5 | This program is free software: you can redistribute it and/or modify it under 6 | the terms of the GNU Affero General Public License version 3 as published by the 7 | Free Software Foundation. 8 | 9 | This program is distributed in the hope that it will be useful, but WITHOUT ANY 10 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 11 | PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. 12 | 13 | You should have received a copy of the GNU Affero General Public License along 14 | with this program. If not, see . 15 | */ 16 | 17 | package io.mfj.textricator.extractor.itext7 18 | import io.mfj.textricator.extractor.TextExtractor 19 | import io.mfj.textricator.text.Text 20 | 21 | import java.io.InputStream 22 | 23 | import com.itextpdf.kernel.colors.* 24 | import com.itextpdf.kernel.geom.Vector 25 | import com.itextpdf.kernel.pdf.* 26 | import com.itextpdf.kernel.pdf.canvas.parser.EventType 27 | import com.itextpdf.kernel.pdf.canvas.parser.PdfTextExtractor 28 | import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData 29 | import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo 30 | import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy 31 | 32 | /** 33 | * Extract text using iText 7. 34 | * 35 | * This gets accurate locations, and does not attempt to group characters based on proximity or structure. 
36 | */ 37 | class Itext7TextExtractor(input:InputStream):TextExtractor { 38 | 39 | private val reader = PdfReader(input) 40 | private val doc = PdfDocument(reader) 41 | 42 | /** Number of pages in the PDF */ 43 | private val pageCount:Int = doc.numberOfPages 44 | 45 | override fun getPageCount():Int { 46 | return pageCount 47 | } 48 | 49 | override fun close() { 50 | reader.close() 51 | } 52 | 53 | 54 | /** 55 | * Extract text from the PDF, calling the callback for each text block. 56 | * 57 | * @param pageNumber Page to extract text from 58 | * 59 | */ 60 | override fun extract(pageNumber:Int):List { 61 | 62 | val page = doc.getPage(pageNumber) 63 | 64 | val pageHeight = page.pageSize.height 65 | 66 | val links:List = page.annotations 67 | .filter { anno -> 68 | anno.subtype == PdfName.Link 69 | } 70 | .map { anno -> 71 | val aObj = anno.pdfObject[PdfName.A] as PdfDictionary 72 | val uriObj = (aObj[PdfName.URI] ?: aObj[PdfName.URL]) as PdfString 73 | val uri = uriObj.value 74 | // get the bounding box 75 | val rect = (anno.pdfObject[PdfName.Rect] as PdfArray).map { (it as PdfNumber).floatValue() } 76 | Link( 77 | url = uri, 78 | ulx = rect[0], 79 | uly = calcY(pageHeight, rect[3]), 80 | lrx = rect[2], 81 | lry = calcY(pageHeight, rect[1]) 82 | ) 83 | } 84 | 85 | val strategy = Strategy(pageNumber, pageHeight, links) 86 | 87 | PdfTextExtractor.getTextFromPage(page,strategy) 88 | 89 | return strategy.texts 90 | .sortedWith( compareBy( {it.uly}, {it.ulx} ) ) 91 | } 92 | 93 | private class Strategy( private val pageNumber:Int, private val pageHeight:Float, private val links:List): 94 | LocationTextExtractionStrategy() { 95 | val texts = mutableListOf() 96 | 97 | override fun eventOccurred(data:IEventData?, type:EventType?) 
{ 98 | if ( type == EventType.RENDER_TEXT ) { 99 | val ri = data as TextRenderInfo 100 | val content = ri.text 101 | 102 | val matrix = ri.textMatrix 103 | val font = ri.font.fontProgram.fontNames.fontName 104 | 105 | val ulx = matrix[6] 106 | val lry = calcY( pageHeight, matrix[7] ) 107 | val width = ri.font.getWidth(content,ri.fontSize) 108 | val height = ri.font.getAscent(content, ri.fontSize) 109 | val uly = lry - height 110 | val lrx = ulx + width 111 | 112 | val color = ri.fillColor?.getHexColor() 113 | 114 | val link = links 115 | .firstOrNull { link -> 116 | ulx >= link.ulx 117 | && uly >= link.uly 118 | && lrx <= link.lrx 119 | && lry <= link.lry 120 | } 121 | ?.url 122 | 123 | val effMatrix = ri.textMatrix.multiply( ri.graphicsState.ctm ) 124 | val fontSize = Vector(0f,ri.fontSize,0f).cross(effMatrix)[1] 125 | 126 | val text = Text(content = content, backgroundColor = null, pageNumber = pageNumber, 127 | fontSize = fontSize, font = font, color = color, 128 | ulx = ulx, uly = uly, lrx = lrx, lry = lry, 129 | link = link ) 130 | 131 | texts.add( text ) 132 | } 133 | super.eventOccurred(data, type) 134 | } 135 | 136 | private fun Color.getHexColor(): String? = 137 | when ( this ) { 138 | is DeviceRgb -> getRgb( this ) 139 | is DeviceCmyk -> getRgb( Color.convertCmykToRgb( this ) ) 140 | else -> null 141 | } 142 | 143 | private fun getRgb( color:DeviceRgb ): String { 144 | val value = color.colorValue 145 | val r = ( value[0] * 255 ).toInt() 146 | val g = ( value[1] * 255 ).toInt() 147 | val b = ( value[2] * 255 ).toInt() 148 | return "#%02x%02x%02x".format( r, g, b ) 149 | } 150 | 151 | } 152 | 153 | companion object { 154 | data class Link(val url:String, val lrx:Float, val lry:Float, val ulx:Float, val uly:Float) 155 | 156 | // in iText, if y is positive, it is from the bottom, if y is negative, it is from the top. 157 | // We calculate from the top. 
158 | fun calcY(pageHeight:Float, y:Float):Float { 159 | return if (y >= 0) { 160 | pageHeight - y 161 | } else { 162 | y * -1 163 | } 164 | } 165 | 166 | } 167 | 168 | } 169 | -------------------------------------------------------------------------------- /src/main/java/io/mfj/textricator/extractor/itext7/Itext7TextExtractorFactory.kt: -------------------------------------------------------------------------------- 1 | /* 2 | This file is part of Textricator. 3 | Copyright 2018 Measures for Justice Institute. 4 | 5 | This program is free software: you can redistribute it and/or modify it under 6 | the terms of the GNU Affero General Public License version 3 as published by the 7 | Free Software Foundation. 8 | 9 | This program is distributed in the hope that it will be useful, but WITHOUT ANY 10 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 11 | PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. 12 | 13 | You should have received a copy of the GNU Affero General Public License along 14 | with this program. If not, see . 15 | */ 16 | 17 | package io.mfj.textricator.extractor.itext7 18 | 19 | import io.mfj.textricator.extractor.TextExtractor 20 | import io.mfj.textricator.extractor.TextExtractorFactory 21 | import io.mfj.textricator.extractor.TextExtractorOptions 22 | 23 | import java.io.InputStream 24 | 25 | class Itext7TextExtractorFactory:TextExtractorFactory { 26 | 27 | override fun create(input:InputStream, options:TextExtractorOptions):TextExtractor = Itext7TextExtractor( 28 | input) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/io/mfj/textricator/extractor/json/JsonTextExtractor.kt: -------------------------------------------------------------------------------- 1 | /* 2 | This file is part of Textricator. 3 | Copyright 2018 Measures for Justice Institute. 
/**
 * [TextExtractor] that reads a JSON array of [Text] objects.
 *
 * The complete stream is deserialized in the initializer and the texts are indexed by
 * their page number, so [extract] is a plain map lookup afterwards.
 */
class JsonTextExtractor(input:InputStream):TextExtractor {

  /** Texts per page number, in the order they appeared in the input. */
  private val texts:Map<Int,List<Text>>

  /** Highest page number found in the input, or 0 when the input held no texts. */
  private val pageCount:Int

  init {
    val mapper = ObjectMapper(JsonFactory()).registerKotlinModule()

    // Describe "LinkedList of Text" to Jackson so the array elements are bound to Text.
    val elementType = mapper.constructType(Text::class.java)
    val listType = mapper.typeFactory.constructCollectionType(LinkedList::class.java, elementType)

    val parsed:List<Text> = mapper.readValue(input, listType)

    // groupBy keeps encounter order for both the pages and the texts within each page.
    texts = parsed.groupBy { it.pageNumber }
    pageCount = texts.keys.maxOrNull() ?: 0
  }

  override fun getPageCount():Int = pageCount

  /** Returns the texts recorded for [pageNumber], or an empty list for an unknown page. */
  override fun extract(pageNumber:Int):List<Text> = texts[pageNumber] ?: emptyList()

  /** No-op: this class holds no resources after construction. */
  override fun close() {}

}
/**
 * [TextExtractorFactory] that produces [JsonTextExtractor] instances.
 */
class JsonTextExtractorFactory: TextExtractorFactory {

  /**
   * Creates an extractor reading from [input]. The [options] are not consulted.
   */
  override fun create(input:InputStream, options:TextExtractorOptions):TextExtractor {
    return JsonTextExtractor(input)
  }

}
/**
 * [TextExtractor] that reads a PDF with Apache PDFBox.
 *
 * The document is loaded when the extractor is constructed and stays open until [close].
 */
class PdfboxTextExtractor(input:InputStream):TextExtractor {

  private val doc = PDDocument.load(input)

  private val stripper = TextBoxPdfTextStripper()

  /**
   * Extracts the words of a single page.
   *
   * @param pageNumber page to extract; passed to the stripper as both start and end page.
   * @return the page's words ordered by upper-left y, then upper-left x; empty if the
   *   stripper produced nothing for the page.
   */
  override fun extract(pageNumber:Int):List<Text> {
    // The stripper appends to its per-page list on every run. Drop anything left over
    // for this page (an earlier call, or a run that failed part-way) so that extracting
    // the same page again does not return every word twice.
    stripper.wordList.remove(pageNumber)

    stripper.startPage = pageNumber
    stripper.endPage = pageNumber
    stripper.getText(doc)

    // Take the result out of the stripper instead of leaving it there: otherwise the
    // words of every page ever extracted stay referenced for the extractor's lifetime.
    val words = stripper.wordList.remove(pageNumber) ?: return emptyList()

    return words.sortedWith( compareBy( Text::uly, Text::ulx ) )
  }

  override fun getPageCount():Int = doc.numberOfPages

  /** Closes the underlying PDF document. */
  override fun close() {
    doc.close()
  }

}
/**
 * [TextExtractorFactory] that produces [PdfboxTextExtractor] instances.
 */
class PdfboxTextExtractorFactory:TextExtractorFactory {

  /**
   * Creates an extractor reading the PDF from [pdf]. The [options] are not consulted.
   */
  override fun create(pdf:InputStream, options:TextExtractorOptions):TextExtractor {
    return PdfboxTextExtractor(pdf)
  }

}
/**
 * [PDFTextStripper] that, instead of producing plain text, splits every chunk handed to
 * [writeString] into words and records each word as a [Text] box in [wordList], keyed by
 * the page being processed.
 */
internal class TextBoxPdfTextStripper : PDFTextStripper() {

  companion object {
    const val NON_BREAKING_SPACE:String = "\u00A0"

    // Kept for compatibility only; expandNonPrintableUnicode no longer uses it. The
    // pattern stops at U+0019 and, because '.' does not match line terminators, fails to
    // match some strings that contain control characters.
    val NON_PRINTABLE:Pattern = Pattern.compile(".*[\\u0000-\\u0019]+.*")
  }

  /** Words collected so far, per page number. Entries are appended to, never replaced. */
  internal val wordList: MutableMap<Int,MutableList<Text>> = mutableMapOf()

  /**
   * used to group words decoded at the same time.
   */
  private var chunkIndex:Long = 0

  /**
   * Splits one chunk of glyphs into words and stores them in [wordList].
   *
   * A word ends at a glyph whose text ends with a space or non-breaking space (the glyph
   * itself is discarded). A new word starts, keeping the current glyph, when
   *  - the gap to the previous glyph is at least the previous glyph's width-of-space,
   *  - the glyph overlaps the previous one by more than half its width, or
   *  - the font or font size changes.
   *
   * @param text the chunk's text as assembled by PDFBox (not used; the glyphs are).
   * @param textPositions the glyphs of the chunk, in order. May be empty.
   */
  @Throws(IOException::class)
  override fun writeString(text:String, textPositions:List<TextPosition> ) {
    chunkIndex++

    // Nothing to split; the previous code threw on textPositions.get(0) here.
    if (textPositions.isEmpty()) return

    val sb = StringBuilder()
    var wordStart = -1
    // For the first glyph "previous" is the glyph itself.
    var previous:TextPosition = textPositions[0]
    var maxHeight = 0f

    textPositions.forEachIndexed { i, current ->
      val unicode = current.unicode
      val separation = current.x - (previous.x + previous.width)
      val sameFont = current.font == previous.font && current.fontSize == previous.fontSize

      val endsWithSpace = unicode.endsWith(" ") || unicode.endsWith(NON_BREAKING_SPACE)
      val wideGap = wordStart != -1 && separation >= previous.widthOfSpace
      // split in cases where words overlay (eg in excel print outs!)
      // TODO : this should be a configurable param
      // have to exclude i = 0 as previous is the first glyph itself
      val overlaps = i > 0 && separation < -previous.width / 2

      when {
        endsWithSpace -> {
          addWord(sb, wordStart, i, textPositions, maxHeight)
          maxHeight = 0f
          wordStart = -1
        }
        wideGap || overlaps || !sameFont -> {
          // The current glyph is real content and opens the next word. (The wide-gap
          // case used to share the whitespace branch, which silently dropped the glyph.)
          addWord(sb, wordStart, i, textPositions, maxHeight)
          maxHeight = 0f
          wordStart = i
          sb.append(unicode)
        }
        else -> {
          if (wordStart == -1) {
            wordStart = i
          }
          sb.append(unicode)
        }
      }

      // The graphics-state colour lookups that used to sit here were never used and
      // toRGB() can throw, so they are gone.
      maxHeight = Math.max(maxHeight, current.height)
      previous = current
    }

    addWord(sb, wordStart, textPositions.size, textPositions, maxHeight)
  }

  /**
   * Replaces every character below U+0020 with a literal `\xNN` escape.
   *
   * @return [string] itself when it contains no such character.
   */
  private fun expandNonPrintableUnicode(string:String) :String {
    // Same threshold as the escape loop below, so detection and escaping cannot disagree.
    if (string.none { it < ' ' }) return string

    val sb = StringBuilder()
    string.indices.forEach { i ->
      val cp = string.codePointAt(i)
      if (cp < 0x20) {
        sb.append(String.format("\\x%02x", cp))
      } else {
        sb.append(string[i])
      }
    }
    return sb.toString()
  }

  /**
   * Stores the pending word in [sb] as a [Text] on the current page and empties [sb].
   * Does nothing when [sb] is empty.
   *
   * @param startIndex index in [textPositions] of the word's first glyph.
   * @param endIndex index one past the word's last glyph.
   * @param maxHeight tallest glyph seen for the word (currently not used for the box).
   */
  private fun addWord(sb: StringBuilder, startIndex:Int, endIndex:Int,
      textPositions: List<TextPosition>, maxHeight:Float) {
    if (sb.isEmpty()) return

    val first = textPositions[startIndex]
    val last = textPositions[endIndex - 1]

    val expanded = expandNonPrintableUnicode(sb.toString())

    // The box's vertical extent and font come from the first glyph; its right edge from
    // the last glyph.
    val textBox = Text(content = expanded, pageNumber = currentPageNo, ulx = first.x,
        uly = first.y - first.height, lrx = last.x + last.width, lry = first.y, font = first.font?.name ?: "",
        fontSize = first.fontSize, color = null, // TODO
        backgroundColor = null)

    wordList.getOrPut(textBox.pageNumber) { mutableListOf() }.add(textBox)
    sb.setLength(0)
  }

}

// The definition below shares this line range in the flattened source; it belongs to
// src/main/java/io/mfj/textricator/form/FormParseEventListener.kt (package io.mfj.textricator.form).

/** Listener for both the finite-state-machine events and the record-parser events. */
interface FormParseEventListener: FsmEventListener, RecordParserEventListener
/**
 * Callbacks reporting what the form finite-state machine does with each [Text].
 *
 * The implementations in this package ([LoggingEventListener], [WriterEventListener])
 * turn every callback into one line of trace output.
 */
interface FsmEventListener {

  /** A text is about to be handled. */
  fun onText( text:Text )

  /** The text is part of the page header. */
  fun onHeader( text:Text )

  /** The text is part of the page footer. */
  fun onFooter( text:Text )

  /** The text is part of the left margin. */
  fun onLeftMargin( text:Text )

  /** The text is part of the right margin. */
  fun onRightMargin( text:Text )

  /** The text was excluded by the named [condition]. */
  fun onExclude( text:Text, condition:String )

  /** A transition from [currentState] to [nextState], guarded by [condition], is being checked. */
  fun onCheckTransition( currentState:String, condition:String, nextState:String )

  /** Result of a transition check: [match] is the outcome, [message] an optional remark. */
  fun onCheckTransition( currentState:String, condition:String, nextState:String, match:Boolean, message:String? )

  /** No previous value was available for [source]. */
  fun onNoPrevious( source:String )

  /** A single condition was evaluated; [description] describes it and [match] is its result. */
  fun onCheckCondition( source:String, description:String, match:Boolean )

  /** The state was reset to [state] for [page]. */
  fun onPageStateChange( page:Int, state:String )

  /** The state changed to [state] on [page]. */
  fun onStateChange( page:Int, state:String )

  /** Variable [name] was set to [value] (possibly null) while in [currentState]. */
  fun onVariableSet( currentState:String, name:String, value:String? )

  /** The state machine has finished. */
  fun onFsmEnd()

}
/**
 * [FormParseEventListener] that writes every event to the slf4j logger at debug level.
 */
object LoggingEventListener: FormParseEventListener {

  private val log = LoggerFactory.getLogger( LoggingEventListener::class.java )

  override fun onText(text:Text) {
    // Skip building the multi-line dump entirely when debug output is off.
    if ( !log.isDebugEnabled ) return

    log.debug("============================")
    log.debug("text: \"${text.content}\"")
    log.debug("\tpageNumber: ${text.pageNumber} ul:[ ${text.ulx} , ${text.uly} ] lr: [ ${text.lrx} , ${text.lry} ]")
    log.debug("\tfont: ${text.font} - ${text.fontSize}")
    log.debug("\tbgcolor: ${text.backgroundColor}")
    log.debug("\tlink: ${text.link}")
  }

  // --- texts that are skipped before reaching the state machine ---

  override fun onHeader(text:Text) = log.debug("\tpart of header. skip")

  override fun onFooter(text:Text) = log.debug("\tpart of footer. skip")

  override fun onLeftMargin(text:Text) = log.debug("\tpart of left gutter. skip")

  override fun onRightMargin(text:Text) = log.debug("\tpart of right gutter. skip")

  override fun onExclude(text: Text, condition: String) =
      log.debug("\texcluded by condition \"${condition}\"")

  // --- transition and condition checks ---

  override fun onCheckTransition(currentState:String, condition:String, nextState:String) =
      log.debug("\tcheck transition \"${condition}\" (\"${currentState}\" -> \"${nextState}\")...")

  override fun onCheckTransition(currentState:String, condition:String, nextState:String, match:Boolean, message:String?) {
    val suffix = if (message != null) " (${message})" else ""
    log.debug("\t\t${match} ${suffix}")
  }

  override fun onNoPrevious(source:String) = log.debug("\t\tno previous [${source}]")

  override fun onCheckCondition(source:String, description:String, match:Boolean) =
      log.debug("\t\tcheck condition [${source}] ${description} : ${match}" )

  // --- state and variable changes ---

  override fun onPageStateChange(page:Int, state:String) =
      log.debug( "State = ${state} (page:${page} reset)" )

  override fun onStateChange(page:Int, state:String) =
      log.debug( "State = ${state} (page:${page})" )

  override fun onVariableSet(currentState:String, name:String, value:String?) {
    val shown = if ( value != null ) "\"${value}\"" else "null"
    log.debug( "Variable ${name} = ${shown}" )
  }

  override fun onFsmEnd() = log.debug( "fsm end" )

  // --- record parser events ---

  override fun onRecordsEnd() = log.debug( "records end" )

  override fun onStateValue(sv:StateValue) =
      log.debug( "parse StateValue: ${sv.stateId} : ${sv.values}" )

  override fun onNewRecord(typeId:String) = log.debug( "\tnew ${typeId} record" )

  override fun onRecordAppend(typeId:String) = log.debug( "\tadd to ${typeId} record" )

}
/**
 * Callbacks reporting the progress of the record parser.
 */
interface RecordParserEventListener {

  /** Record parsing has finished. */
  fun onRecordsEnd()

  /** The given [StateValue] is being parsed. */
  fun onStateValue( sv:StateValue )

  /** A new record of type [typeId] was started. */
  fun onNewRecord( typeId:String )

  /** Data was added to an existing record of type [typeId]. */
  fun onRecordAppend( typeId:String )

}
// This line range of the flattened source holds four definitions from four files; they
// straddle the physical lines and are therefore kept together here.

// ---- io/mfj/textricator/form/StateValue.kt ----

/**
 * The values collected while the state machine was in one state.
 *
 * @param pageNumber The pageNumber that the state started on.
 */
data class StateValue( val source:String?, val pageNumber:Int, val stateId:String, val state:State, val values:List<Value>,
    val splitContinuation:Boolean=false )

// ---- io/mfj/textricator/form/WriterEventListener.kt ----

/**
 * [FormParseEventListener] that writes one line per event to the supplied [Writer].
 *
 * The writer is flushed every 100 lines and when the state machine or the record parser
 * reports its end. It is never closed here; that is left to whoever created it.
 * I/O failures are logged and otherwise ignored, so tracing cannot abort a parse.
 */
class WriterEventListener(private val w:Writer): FormParseEventListener {

  // Was created for LoggingEventListener (copy-paste), which attributed this class's
  // I/O errors to the wrong logger.
  private val log = LoggerFactory.getLogger(WriterEventListener::class.java)

  override fun onText(text:Text) {
    write("============================")
    write("text: \"${text.content}\"")
    write("\tpageNumber: ${text.pageNumber} ul:[ ${text.ulx} , ${text.uly} ] lr: [ ${text.lrx} , ${text.lry} ]")
    write("\tfont: ${text.font} - ${text.fontSize}")
    write("\tbgcolor: ${text.backgroundColor}")
    // Same line LoggingEventListener emits; it was missing from this listener.
    write("\tlink: ${text.link}")
  }

  override fun onHeader(text:Text) {
    write("\tpart of header. skip")
  }

  override fun onFooter(text:Text) {
    write("\tpart of footer. skip")
  }

  override fun onLeftMargin(text:Text) {
    write("\tpart of left gutter. skip")
  }

  override fun onRightMargin(text:Text) {
    write("\tpart of right gutter. skip")
  }

  override fun onExclude(text: Text, condition: String) {
    write("\texcluded by condition \"${condition}\"")
  }

  override fun onCheckTransition(currentState:String, condition:String, nextState:String) {
    write("\tcheck transition \"${condition}\" (\"${currentState}\" -> \"${nextState}\")...")
  }

  override fun onCheckTransition(currentState:String, condition:String, nextState:String, match:Boolean, message:String?) {
    write("\t\t${match} ${if (message != null) " (${message})" else "" }")
  }

  override fun onNoPrevious(source:String) {
    write("\t\tno previous [${source}]")
  }

  override fun onCheckCondition(source:String, description:String, match:Boolean) {
    write("\t\tcheck condition [${source}] ${description} : ${match}" )
  }

  override fun onPageStateChange(page:Int, state:String) {
    write( "State = ${state} (page:${page} reset)" )
  }

  override fun onStateChange(page:Int, state:String) {
    write( "State = ${state} (page:${page})" )
  }

  override fun onVariableSet(currentState:String, name:String, value:String?) {
    write( "Variable ${name} = ${if ( value != null ) "\"${value}\"" else "null" }" )
  }

  override fun onFsmEnd() {
    write( "fsm end" )
    flush()
  }

  override fun onRecordsEnd() {
    write( "records end" )
    flush()
  }

  override fun onStateValue(sv:StateValue) {
    write( "parse StateValue: ${sv.stateId} : ${sv.values}" )
  }

  override fun onNewRecord(typeId:String) {
    write( "\tnew ${typeId} record" )
  }

  override fun onRecordAppend(typeId:String) {
    write( "\tadd to ${typeId} record" )
  }

  /** Number of lines written so far; drives the periodic flush. */
  var writes:Long = 0

  /** Appends [s] as one line and flushes on every 100th line (including the first). */
  private fun write( s:String ) {
    try {
      w.appendLine(s)
      if ( writes++ % 100L == 0L ) w.flush()
    } catch ( e:Exception ) {
      log.error(e.message,e)
    }
  }

  /**
   * Pushes buffered lines out at the end of a phase. Without this, up to 99 trailing
   * lines stay in the writer's buffer until the owner closes it.
   */
  private fun flush() {
    try {
      w.flush()
    } catch ( e:Exception ) {
      log.error(e.message,e)
    }
  }

}

// ---- io/mfj/textricator/form/config/DefaultAndPages.kt ----

/**
 * A class to contain a default value along with page specific values which override the default on a given page.
 */
// NOTE(review): the type arguments of `pages` were lost in this copy of the source;
// Map<Int,String> (page number -> value) is assumed. Confirm against the repository.
data class DefaultAndPages(val default: String? = null,val pages: Map<Int,String> = mapOf())

// ---- io/mfj/textricator/form/config/FormParseConfig.kt ----

/**
 * Defines the state machine
 */
// NOTE(review): generic type arguments below were lost in this copy of the source and
// are reconstructed from usage and imports. Confirm against the repository.
class FormParseConfig(
    var states: MutableMap<String,State> = mutableMapOf(),
    var stateDefaults: State? = null,
    var conditions: MutableMap<String,String> = mutableMapOf(),
    var initialState: String = "INITIAL_STATE",
    var newPageState: String? = null,
    var header:DefaultAndPages = DefaultAndPages(),
    var footer:DefaultAndPages = DefaultAndPages(),
    var left:DefaultAndPages = DefaultAndPages(),
    var right:DefaultAndPages = DefaultAndPages(),
    override var rootRecordType: String = "root",
    override var recordTypes: Map<String,RecordType> = emptyMap(),
    override var valueTypes: Map<String,ValueType> = emptyMap(),
    /** Any [Text]s that match any of these condition names (the key in [conditions])
     * are excluded; never processed by the finite-state machine. */
    var excludeConditions: List<String> = emptyList(),
    extractor:String? = null,
    pages:String? = null,
    maxRowDistance:Float = 0f,
    boxPrecision:Float =0f,
    boxIgnoreColors:MutableSet<String> = mutableSetOf()
): TextExtractorOptions(
    extractor = extractor,
    boxPrecision = boxPrecision,
    boxIgnoreColors = boxIgnoreColors,
    maxRowDistance = maxRowDistance,
    pages = pages ), RecordModel
// This line range of the flattened source holds four definitions from four files of
// package io.mfj.textricator.form.config; they straddle the physical lines and are
// therefore kept together here.

// ---- FormParseConfigUtil.kt ----

/**
 * Reads a [FormParseConfig] from YAML.
 */
object FormParseConfigUtil {

  private val mapper = ObjectMapper(YAMLFactory()).registerKotlinModule()

  /** Reads the whole [file] as text and parses it. */
  fun parseYaml(file: File):FormParseConfig {
    val yaml = file.readText()
    return parseYaml(yaml)
  }

  /** Parses YAML read from [input]. */
  fun parseYaml(input:InputStream):FormParseConfig {
    return mapper.readValue<FormParseConfig>(input)
  }

  /** Parses YAML held in [text]. */
  fun parseYaml(text:String):FormParseConfig {
    return mapper.readValue<FormParseConfig>(text)
  }

}

// ---- State.kt ----

/**
 * A state in the state machine
 */
// NOTE(review): the element types of the list properties were lost in this copy of the
// source and are reconstructed from the sibling classes. Confirm against the repository.
data class State(
    /** If true, remove text in this state before record processing. */
    val skip:Boolean = false,
    /** If false, do not create records for this state, but still include it in record processing
     * so it can split adjacent states and affect if the next sibling starts a record. */
    val include:Boolean = true,
    val transitions: MutableList<Transition> = ArrayList(),
    val startRecord: Boolean = false,
    val startRecordRequiredState: String? = null,
    val startRecordForEachValue:Boolean = false,
    val valueTypes: MutableList<String>? = null,
    val combineLimit: Float? = null,
    val setVariables: MutableList<VariableSet> = mutableListOf()
)

// ---- Transition.kt ----

/**
 * Defines a transition, consisting of a condition ID, the id of the next state and an
 * optional message.
 */
data class Transition(val condition: String, val nextState: String, val message:String? = null)

// ---- VariableSet.kt ----

/** A variable name together with the value to assign to it. */
data class VariableSet( val name:String? = null, var value:String? = null )
*/

package io.mfj.textricator.record

import java.beans.Transient

/**
 * A parsed record: its type, the page it came from, its values keyed by value type id,
 * and its child records.
 *
 * Equality and hashing ignore [source].
 */
// cannot be a data class, because used as a map key in RecordParser.
class Record(
    val source:String?=null,
    val pageNumber:Int,
    val typeId:String,
    /** Values of this record, keyed by value type id. */
    val values:MutableMap<String,Value> = mutableMapOf(),
    // NOTE(review): key type assumed to be the child record type id (String) - confirm against RecordParser.
    val children:MutableMap<String,MutableList<Record>> = mutableMapOf()
) {

  /** Convenience constructor for a record with no source. */
  constructor(
      pageNumber:Int,
      typeId:String,
      values:MutableMap<String,Value> = mutableMapOf(),
      children:MutableMap<String,MutableList<Record>> = mutableMapOf()
  ) : this(null,pageNumber,typeId,values,children)

  /** True when every child list is empty (or there are no child lists at all). */
  val isLeaf:Boolean
    @Transient
    get() = children.values.all { childList -> childList.isEmpty() }

  /**
   * Look up the value stored under [valueTypeId] and return the requested [attribute] of it
   * (see [Value.getValue]); null when no value is stored under that id.
   */
  fun getValue( valueTypeId:String, attribute:String? ): String? = values[valueTypeId]?.getValue(attribute)

  // Hash on cheap fields only (page, type, map sizes); equals() does the full comparison.
  override fun hashCode():Int = pageNumber + typeId.hashCode() + 17 * values.size + 27 * children.size

  override fun equals(other:Any?):Boolean = ( other != null ) && ( other is Record) &&
      pageNumber == other.pageNumber && typeId == other.typeId &&
      values.size == other.values.size && children.size == other.children.size &&
      values == other.values && children == other.children

}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/record/RecordFilter.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.record

import io.mfj.expr.*

/**
 * Filters records based on [RecordType.filter].
 *
 * @param config record model supplying the record types (with their filter expressions) and the value types
 */
class RecordFilter( private val config:RecordModel) {

  companion object {
    /** Data type used for value types that do not declare one. */
    private val DEFAULT_TYPE:ExDataType = ExDataType.STRING
  }

  /** Lazily filter [seq], dropping records whose filter rejects them and pruning rejected children. */
  fun filter( seq:Sequence<Record> ): Sequence<Record> =
      seq.mapNotNull { record ->
        filter( record )
      }

  /** Map of value type id to ExDataType */
  // Must be initialised before recordTypeToExpr: building the expressions may look types up here.
  private val valueTypeMap = config.valueTypes.map { (id,member) ->
    id to ( member.type?.let { ExDataType.valueOf( it.uppercase() ) } ?: DEFAULT_TYPE)
  }.toMap()

  /** Get the ExDataType for the specified member. */
  private fun getExDataType( valueTypeId:String ) = valueTypeMap[valueTypeId] ?: DEFAULT_TYPE

  /** Map of record type ID to the Expr to filter it, or null if no filter. */
  private val recordTypeToExpr:Map<String,Expr?> =
      config.recordTypes.entries
          .associate { (recordTypeId, recordType) ->
            recordTypeId to buildExpr(recordType)
          }

  /**
   * Build Expr for the specified record type. Null if no filter.
   *
   * Only the value types listed on the record type are visible to the expression as variables.
   */
  private fun buildExpr( type:RecordType): Expr? {
    val filter = type.filter
    return if ( filter != null ) {
      val vtp:VarTypeProvider = object:VarTypeProvider {
        override fun contains(varName:String):Boolean = type.valueTypes.contains(varName)
        override fun get(varName:String):ExDataType =
            if ( type.valueTypes.contains(varName) ) {
              getExDataType( varName )
            } else {
              throw IllegalArgumentException( "No such var \"${varName}\"" )
            }
        override fun getKnownVars():Map<String,ExDataType> =
            type.valueTypes.associateWith { getExDataType(it) }
      }
      ExprParser.parseToExpr( filter, vtp )
    } else {
      null
    }
  }

  /**
   * Filter the supplied record.
   * If the record's type's filter passes, return the record with the children also filtered.
   * If the record's type's filter does not pass, return null.
   */
  private fun filter( record:Record): Record? {
    return if ( evalFilter( record ) ) {
      filterChildren( record )
      record
    } else {
      null
    }
  }

  /**
   * Filter the children of the supplied record.
   * Rejected children are removed in place; surviving children are filtered recursively.
   */
  private fun filterChildren( record:Record) {
    record.children.values.forEach { children ->
      children.retainAll { child ->
        evalFilter( child )
      }
      children.forEach { child ->
        filterChildren( child )
      }
    }
  }

  /**
   * Evaluate the filter for the specified record.
   * Variables resolve to the text of the record's values, converted to the value type's data type.
   */
  private fun evalFilter( record:Record): Boolean =
      recordTypeToExpr[record.typeId]
          ?.value(
              object:VarProvider {
                override fun contains(varName:String):Boolean = record.values.contains( varName )
                override fun get(varName:String):Any? {
                  val exDataType = getExDataType(varName)
                  val s = record.values[varName]?.text // TODO filter based on other attributes
                  return ExConvert.convertStr( s, exDataType )
                }
                override fun getKnownVars():Set<String> = record.values.keys
              }
          )
          ?: true // accept if no filter

}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/record/RecordModel.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.record

/**
 * Defines the record structure: the root record type, all record types by id and all value types by id.
 */
interface RecordModel {
  val rootRecordType: String
  val recordTypes: Map<String,RecordType>
  val valueTypes: Map<String,ValueType>
}

data class RecordType(
    var label: String, // user defined name
    var children: List<String> = emptyList(), // List of child RecordType's ids
    var valueTypes: List<String> = emptyList(), // List of the data and cell leaf states at this level.
    var pagePriority: Int = 0, // for output, use the page from the type with the highest priority
    var filter:String? = null
)

open class ValueType(
    val label:String? = null,

    /** If true, remove duplicate values. */
    val unrepeat:Boolean = false,

    /** Separator when combining values. Values are combined after unrepeating */
    val separator:String = " ",

    /** Replacements to make. Replacements are made after values are unrepeated and combined. */
    val replacements:MutableList<PatternReplacement>? = null,

    /** If false, do not include in output.
     * Useful when the same data is repeated for each child record and you need it to mark new records. */
    val include:Boolean = true,

    /** If set, use this attribute of [Value] instead of [Value.text]. */
    val attribute:String? =null,

    /** Data type (used for [RecordType.filter]). */
    val type:String? = null // ExDataType

) {

  /**
   * Combine [values] into one [Value]: texts are unrepeated (when enabled), joined with [separator]
   * and passed through [replacements]; the link is the first non-null link among [values].
   */
  fun calcValue( values:List<Value> ): Value {
    val text = values
        .map(Value::text)
        .unrepeat()
        .joinToString(separator)
        .replace()
    val link = values
        .asSequence()
        .mapNotNull(Value::link)
        .firstOrNull()
    return Value(text,link)
  }

  /**
   * When [unrepeat] is on: find the shortest prefix that is immediately followed by an identical copy
   * of itself, keep only that prefix, and apply the same rule to the prefix again.
   */
  private fun List<String>.unrepeat(): List<String> {
    if ( ! unrepeat ) return this
    (1..size).forEach { i ->
      if ( isRepeated( this, i ) ) {
        return this.subList(0,i).unrepeat()
      }
    }
    return this
  }

  /** True if the first [pos] elements of [list] are equal to the [pos] elements that follow them. */
  private fun isRepeated( list:List<String>, pos:Int ): Boolean {
    if ( list.size < pos*2 ) return false
    (0 until pos).forEach { i ->
      if ( list[i] != list[pos+i] ) {
        return false
      }
    }
    return true
  }

  /**
   * Apply the first replacement whose pattern is found in the string; later replacements are not tried.
   * Returns the original string when there are no replacements or none matches.
   */
  private fun String.replace(): String {
    val match = replacements
        ?.firstOrNull { r -> r.regexPattern.containsMatchIn( this ) }
        ?: return this
    return match.regexPattern.replace(this, match.replacement)
  }

}

/** Join texts with [separator] and take the first non-null link. */
private fun List<Value>.join(separator:String):Value {
  val text = map(Value::text).joinToString(separator)
  val link = asSequence().mapNotNull(Value::link).firstOrNull()
  return Value(text,link)
}

/** Combine [values] using the value type's rules, or a plain space-join when there is no value type. */
fun ValueType?.calculateValue( values:List<Value> ): Value =
    this?.calcValue(values) ?: values.join(" ")

data class PatternReplacement(val pattern:String, val replacement:String ) {
  // lazy property to make regex from pattern
  val regexPattern by lazy {
    Regex(pattern)
  }
}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/record/Value.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.record

/**
 * A piece of extracted text together with an optional link.
 */
data class Value(
    val text:String,
    val link:String? = null
) {
  /**
   * Return the named attribute of this value.
   *
   * @param attribute null for [text], "link" for [link]
   * @throws IllegalArgumentException for any other attribute name
   */
  fun getValue( attribute:String? ): String? {
    if ( attribute == null ) return text
    if ( attribute == "link" ) return link
    throw IllegalArgumentException("Unhandled attribute \"${attribute}\"." )
  }
}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/record/output/CsvRecordOutput.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.record.output

import org.apache.commons.csv.*

import io.mfj.textricator.record.Record
import io.mfj.textricator.record.RecordModel
import io.mfj.textricator.record.RecordType

import java.io.BufferedWriter
import java.io.OutputStream
import java.io.OutputStreamWriter

import kotlinx.coroutines.channels.ReceiveChannel

/**
 * Writes records as CSV: one header row, then one row per leaf record, with the values of the
 * leaf's ancestors repeated on every row.
 *
 * @param config record model describing the record and value types
 * @param output destination stream; it is closed by [close]
 * @param includeSource if true, emit a leading "source" column
 */
class CsvRecordOutput(private val config:RecordModel, output:OutputStream,
    private val includeSource:Boolean=false ):RecordOutput {

  // NOTE(review): no charset is given, so the platform default encoding is used - confirm this is intended.
  private val w = BufferedWriter(OutputStreamWriter(output))

  private val valueTypes = config.valueTypes

  val p = CSVPrinter(w, CSVFormat.DEFAULT.withRecordSeparator("\n"))

  /** Number of data rows printed so far; drives the periodic flush in printRow. */
  var rowCount = 0

  override fun write( seq:Sequence<Record> ) {
    printHeader(p)
    seq.forEach { rec -> printRows(p,rec) }
  }

  override suspend fun write(channel:ReceiveChannel<Record>) {
    printHeader(p)
    for ( rec in channel ) {
      printRows(p,rec)
    }
  }

  override fun close() {
    p.flush()
    // do NOT close p, the caller is responsible for closing w, and that is all that p.close() does.
    //p.close()
    w.close()
  }

  /**
   * Print the header row: optional "source", then "page", then one label per included value type,
   * walking the record types depth-first from the root.
   */
  private fun printHeader(p:CSVPrinter) {

    if ( includeSource ) p.print("source")
    p.print("page")

    fun printType( recordType:RecordType) {
      recordType.valueTypes.forEach { valueTypeId ->
        val valueType = valueTypes[valueTypeId]
        // value types missing from the model are included and labelled with their id
        if ( valueType?.include ?: true ) {
          val label = valueType?.label ?: valueTypeId
          p.print( label )
        }
      }
      recordType.children
          .map { config.recordTypes[it] ?: throw Exception("missing type ${it}") }
          .forEach { childRecordType ->
            printType( childRecordType )
          }
    }
    val rootRecordType = config.recordTypes[config.rootRecordType] ?: throw Exception("missing type ${config.rootRecordType}")
    printType(rootRecordType)

    p.println()
  }

  /**
   * Walk [root] depth-first, tracking the record currently active for each record type,
   * and print one row for every leaf reached.
   */
  private fun printRows(p:CSVPrinter,root:Record) {

    val map:MutableMap<RecordType,Record> = mutableMapOf()

    fun pr(rec:Record) {
      val type = config.recordTypes[rec.typeId] ?: throw Exception( "Missing type ${rec.typeId}" )
      map[type] = rec

      if ( rec.isLeaf ) {
        printRow(p,map)
      } else {
        rec.children.values.forEach { it.forEach { pr(it) } }
      }

      map.remove(type)
    }

    pr(root)
  }

  /**
   * Print a single row from the records in [map] (record type to the active record of that type).
   * Record types with no active record contribute empty cells.
   */
  private fun printRow( p:CSVPrinter, map:Map<RecordType,Record> ) {

    val cells:MutableList<String> = mutableListOf()

    var pageNumber:Int? = null
    var pageNumberPriority:Int = -1

    fun printType( recordType:RecordType) {
      val rec = map[recordType]

      // The first record found supplies the page; a later one replaces it only with a strictly higher priority.
      if ( ( rec != null ) && ( pageNumber == null || recordType.pagePriority > pageNumberPriority ) ) {
        pageNumber = rec.pageNumber
        pageNumberPriority = recordType.pagePriority
      }

      recordType.valueTypes.forEach { valueTypeId ->
        val valueType = valueTypes[valueTypeId]
        if ( valueType?.include ?: true ) {
          val attribute = valueType?.attribute
          val value = rec?.getValue(valueTypeId,attribute) ?: ""
          cells.add( value )
        }
      }
      recordType.children
          .map { config.recordTypes[it] ?: throw Exception("missing type ${it}") }
          .forEach { childRecordType ->
            printType( childRecordType )
          }
    }
    val rootRecordType = config.recordTypes[config.rootRecordType] ?: throw Exception("missing type ${config.rootRecordType}")

    printType(rootRecordType)

    if ( includeSource ) p.print( map[rootRecordType]!!.source )
    p.print( pageNumber )
    p.printRecord( cells )

    // flush on the first row and every 100th row after it
    if ( rowCount++ % 100 == 0 ) p.flush()
  }


}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/record/output/JsonFlatRecordOutput.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.record.output

import io.mfj.textricator.record.Record
import io.mfj.textricator.record.RecordModel
import io.mfj.textricator.record.RecordType
import io.mfj.textricator.record.ValueType

import java.io.BufferedWriter
import java.io.OutputStream
import java.io.OutputStreamWriter

import java.util.concurrent.atomic.AtomicBoolean

import com.fasterxml.jackson.core.JsonGenerator
import com.fasterxml.jackson.databind.ObjectMapper
import kotlinx.coroutines.channels.ReceiveChannel

/**
 * Writes records as a JSON array of flat objects: an optional header object, then one object per
 * leaf record carrying the values of the leaf and its ancestors keyed by value type id.
 *
 * @param config record model describing the record and value types
 * @param output destination stream; it is closed by [close]
 * @param includeSource if true, each object carries a "source" entry
 */
class JsonFlatRecordOutput(private val config:RecordModel, output:OutputStream, private val includeSource:Boolean):
    RecordOutput {

  // JSON is exchanged as UTF-8; do not depend on the platform default charset.
  private val w = BufferedWriter(OutputStreamWriter(output, Charsets.UTF_8))

  private val valueTypes = config.valueTypes

  private val writer = ObjectMapper()
      .configure(JsonGenerator.Feature.AUTO_CLOSE_TARGET,false) // otherwise closes w
      .writer().withDefaultPrettyPrinter()

  /** If true, the first array element maps each key to its column label. */
  var printHeader = true

  override fun write( seq:Sequence<Record> ) {

    w.append( "[" )

    // becomes true once any element has been written, so later elements are preceded by a comma
    val notFirst = AtomicBoolean(false)

    if ( printHeader ) {
      printHeader()
      notFirst.set(true)
    }

    seq.forEach { rec ->

      printRows(rec,notFirst)

    }

    w.append( "]" )
  }

  override suspend fun write(channel:ReceiveChannel<Record>) {
    w.append( "[" )

    // becomes true once any element has been written, so later elements are preceded by a comma
    val notFirst = AtomicBoolean(false)

    if ( printHeader ) {
      printHeader()
      notFirst.set(true)
    }

    for ( rec in channel ) {

      printRows(rec,notFirst)

    }

    w.append( "]" )
  }

  override fun close() {
    w.close()
  }

  /**
   * Write the header object: "source" (optional) and "page" map to themselves, every included
   * value type id maps to its label (or to the id when it has no label).
   */
  private fun printHeader() {

    val row:MutableMap<String,String> = mutableMapOf()
    if ( includeSource ) row["source"] = "source"
    row["page"] = "page"

    fun print( recordType:RecordType) {
      recordType.valueTypes.forEach { valueTypeId ->
        val valueType:ValueType? = valueTypes[valueTypeId]
        if ( valueType?.include ?: true ) {
          val label = valueType?.label ?: valueTypeId
          row[valueTypeId] = label
        }
      }
      recordType.children
          .map { config.recordTypes[it] ?: throw Exception("missing type ${it}") }
          .forEach { childRecordType ->
            print( childRecordType )
          }
    }
    val rootRecordType = config.recordTypes[config.rootRecordType] ?: throw Exception("missing type ${config.rootRecordType}")
    print(rootRecordType)

    writer.writeValue(w,row)
  }

  /**
   * Walk [root] depth-first, tracking the record currently active for each record type,
   * and write one object for every leaf reached.
   */
  private fun printRows(root:Record,notFirst:AtomicBoolean) {

    val map:MutableMap<RecordType,Record> = mutableMapOf()

    fun pr(rec:Record) {
      val recType = config.recordTypes[rec.typeId] ?: throw Exception( "Missing type ${rec.typeId}" )
      map[recType] = rec

      if ( rec.isLeaf ) {
        if ( notFirst.getAndSet(true) ) w.write(",")
        printRow(map)
      } else {
        rec.children.values.forEach { it.forEach { pr(it) } }
      }

      map.remove(recType)
    }

    pr(root)
  }

  /**
   * Write a single object from the records in [map] (record type to the active record of that type).
   * Record types with no active record contribute empty strings.
   */
  private fun printRow( map:Map<RecordType,Record> ) {

    val row:MutableMap<String,String> = mutableMapOf()

    var pageNumber:Int? = null
    var pageNumberPriority:Int = -1

    fun printType( type:RecordType) {
      val rec = map[type]

      // Same rule as CsvRecordOutput: the first record found supplies the page AND its priority;
      // a later one replaces it only with a strictly higher priority. (Previously the page was
      // pre-assigned without its priority, so the two outputs disagreed for negative priorities.)
      if ( ( rec != null ) && ( pageNumber == null || type.pagePriority > pageNumberPriority ) ) {
        pageNumber = rec.pageNumber
        pageNumberPriority = type.pagePriority
      }

      type.valueTypes.forEach { valueTypeId ->
        val valueType = valueTypes[valueTypeId]
        if ( valueType?.include ?: true ) {
          val attribute = valueType?.attribute
          val value = rec?.getValue(valueTypeId,attribute) ?: ""
          row[valueTypeId] = value
        }
      }
      type.children
          .map { config.recordTypes[it] ?: throw Exception("missing type ${it}") }
          .forEach { childRecordType ->
            printType( childRecordType )
          }
    }
    val rootRecordType = config.recordTypes[config.rootRecordType] ?: throw Exception("missing type ${config.rootRecordType}")

    printType(rootRecordType)

    if ( includeSource ) {
      row["source"] = map[rootRecordType]!!.source ?: ""
    }
    row["page"] = pageNumber.toString()

    writer.writeValue(w,row)
  }


}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/record/output/JsonRecordOutput.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.
See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.record.output

import com.fasterxml.jackson.core.JsonGenerator
import com.fasterxml.jackson.databind.ObjectMapper
import io.mfj.textricator.record.Record
import io.mfj.textricator.record.RecordModel

import java.io.BufferedWriter
import java.io.OutputStream
import java.io.OutputStreamWriter
import java.io.Writer

import kotlinx.coroutines.channels.ReceiveChannel

/**
 * Writes records as a JSON array, serializing each [Record] (with its children) as one element.
 *
 * @param config not used by this output
 * @param output destination stream; it is closed by [close]
 */
class JsonRecordOutput(config:RecordModel,output:OutputStream):RecordOutput {

  // JSON is exchanged as UTF-8; do not depend on the platform default charset.
  private val w = BufferedWriter(OutputStreamWriter(output, Charsets.UTF_8))

  private val writer = ObjectMapper()
      .configure(JsonGenerator.Feature.AUTO_CLOSE_TARGET,false) // otherwise closes w
      .writer().withDefaultPrettyPrinter()

  override fun write( seq:Sequence<Record> ) {

    w.append( "[" )
    var first = true

    seq.forEach { rec ->

      // comma before every element except the first
      if ( first ) {
        first = false
      } else {
        w.append( "," )
      }

      writeRec( w, rec )

    }

    w.append( "]" )
  }

  override suspend fun write(channel:ReceiveChannel<Record>) {
    w.append( "[" )
    var first = true

    for ( rec in channel ) {

      // comma before every element except the first
      if ( first ) {
        first = false
      } else {
        w.append( "," )
      }

      writeRec( w, rec )

    }

    w.append( "]" )
  }

  /** Serialize one record to [w]. */
  private fun writeRec( w:Writer, rec:Record) {
    writer.writeValue(w,rec)
  }

  override fun close() {
    w.close()
  }

}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/record/output/NullOutput.kt:
--------------------------------------------------------------------------------
/*
This file is part of
Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.record.output

import io.mfj.textricator.record.Record

import kotlinx.coroutines.channels.ReceiveChannel

/**
 * Run through the records but do not do anything with them.
 */
object NullOutput:RecordOutput {
  /** Consume every record of [seq] and discard it. */
  override fun write(seq:Sequence<Record>) {
    seq.forEach {}
  }

  /** Receive from [channel] until it is closed, discarding every record. */
  override suspend fun write(channel:ReceiveChannel<Record>) {
    for ( rec in channel ) {
    }
  }

  /** Nothing to release. */
  override fun close() {}
}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/record/output/RecordOutput.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.record.output

import io.mfj.textricator.record.Record
import java.io.Closeable

import kotlinx.coroutines.channels.ReceiveChannel

/**
 * A sink for parsed records. Implementations consume either a [Sequence] or a [ReceiveChannel]
 * of records and are closed when writing is finished.
 */
interface RecordOutput: Closeable {
  /** Write every record of [seq]. */
  fun write( seq:Sequence<Record> )
  /** Write every record received from [channel]. */
  suspend fun write( channel:ReceiveChannel<Record>)
}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/record/output/XmlRecordOutput.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2021 Stephen Byrne.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.record.output

import io.mfj.textricator.record.Record
import io.mfj.textricator.record.RecordModel

import java.io.BufferedWriter
import java.io.OutputStream
import java.io.OutputStreamWriter
import javax.xml.stream.XMLOutputFactory

import kotlinx.coroutines.channels.ReceiveChannel

import com.fasterxml.jackson.annotation.JsonInclude
import com.fasterxml.jackson.databind.SerializationFeature
import com.fasterxml.jackson.dataformat.xml.XmlMapper

/**
 * Writes records as XML: a single "Records" root element containing one serialized element per record.
 * The document is opened on construction and completed by [close].
 *
 * @param config not used by this output
 * @param output destination stream; it is closed by [close]
 */
class XmlRecordOutput(config:RecordModel,output:OutputStream):RecordOutput {

  // The declaration written by writeStartDocument() names no other encoding, so readers will
  // decode the document as UTF-8; encode as UTF-8 rather than with the platform default charset.
  private val w = BufferedWriter(OutputStreamWriter(output, Charsets.UTF_8))

  // First create Stax components we need
  private val xmlOutputFactory = XMLOutputFactory.newFactory()
  private val sw = xmlOutputFactory.createXMLStreamWriter(w)
  // then Jackson components
  private val mapper = XmlMapper()
      .apply {
        enable(SerializationFeature.INDENT_OUTPUT)
        setSerializationInclusion(JsonInclude.Include.NON_EMPTY) // omit empty elements (e.g. a record's empty children map)
      }

  init {
    sw.writeStartDocument()
    sw.writeStartElement("Records")
  }

  override fun write( seq:Sequence<Record> ) {
    seq.forEach { rec->
      mapper.writeValue(sw, rec)
    }
  }

  override suspend fun write(channel:ReceiveChannel<Record>) {
    for ( rec in channel ) {
      mapper.writeValue(sw,rec)
    }
  }

  /** Close the root element and the document, then close the underlying writer even if that fails. */
  override fun close() {
    try {
      sw.writeEndElement()
      sw.writeEndDocument()
      sw.close()
    } finally {
      w.close()
    }
  }

}

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/table/Table.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see .
*/

package io.mfj.textricator.table


/**
 * Collects positioned text into table cells and returns it as rows.
 *
 * Text is stored by y-coordinate (row), column index and x-coordinate; [getRows] orders it and
 * merges rows whose y-coordinates lie within [maxRowDistance] of the row they are merged into.
 */
internal class Table(private val numberOfColumns:Int, private val maxRowDistance:Float) {

  private val xtable:XTable = mutableMapOf()

  // NOTE(review): the inclusive range allocates numberOfColumns + 1 cells per row -
  // confirm whether the extra trailing cell is intended.
  private fun createXRow():XRow =
      (0..numberOfColumns)
          .map { createXCell() }
          .toTypedArray()

  private fun createXCell():XCell = mutableMapOf()

  private fun createXValues():XValues = mutableListOf()


  /** add an object to the model */
  fun addToCell(x:Float, y:Float, columnIndex:Int, text:String) {
    val xrow = xtable.getOrPut( y, { createXRow() } )
    val xcell = xrow[columnIndex]
    val xvalues = xcell.getOrPut( x, { createXValues() } )
    xvalues.add( text )
  }

  /** Rows ordered by y-coordinate, with nearby rows merged; safe to call more than once. */
  fun getRows(): Sequence<Row> {

    return xtable
        .toSortedMap()
        .group()
        .asSequence()
        .map { xrow -> xrow.toRow() }
  }

  /**
   * Group rows that are within [maxRowDistance].
   *
   * A row starts a new group when its y-coordinate is more than [maxRowDistance] past the
   * y-coordinate of the row that started the current group; otherwise its cells are merged into
   * that group. Merging is done on copies, so the receiver's rows are left untouched and repeated
   * calls do not accumulate duplicate values.
   */
  private fun Map<Float,XRow>.group(): List<XRow> {

    val grouped:MutableList<XRow> = mutableListOf()

    var buffer:Pair<Float,XRow>? = null

    for ( ( y, row ) in entries ) {
      val current = buffer
      if ( current == null ) {
        // first row
        buffer = Pair(y,row.copyRow())
      } else if ( current.first + maxRowDistance < y ) {
        // new row
        grouped.add( current.second )
        buffer = Pair(y,row.copyRow())
      } else {
        // combine row with the buffered row
        row.forEachIndexed { index, cell ->

          // cell is a map of x-coords to values
          // values at an identical x-coord are appended to the existing list
          val bufferCell = current.second[index]

          cell.entries.forEach { (x,value) ->
            bufferCell.getOrPut( x, { createXValues() } )
                .addAll( value )
          }
        }
      }
    }

    buffer?.let { grouped.add( it.second ) }

    return grouped
  }

  /** Copy of the row whose cells and value lists can be modified without affecting the original. */
  private fun XRow.copyRow():XRow =
      Array(size) { i ->
        this[i].mapValuesTo( mutableMapOf<Float,XValues>() ) { (_,values) -> values.toMutableList() }
      }

  private fun XRow.toRow():Row = map { xcell -> xcell.toCell() }

  // sort by x-coord, then combine the xvalues (which is already sorted by y-value)
  private fun XCell.toCell():Cell = toSortedMap().values.flatten().toList()

}

///// internal data structure

// map y-coord -> row
private typealias XTable = MutableMap<Float,XRow>

// cell index -> cell
private typealias XRow = Array<XCell>

// x-coord -> values
private typealias XCell = MutableMap<Float,XValues>

private typealias XValues = MutableList<String>

--------------------------------------------------------------------------------
/src/main/java/io/mfj/textricator/table/TableParser.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.
/**
 * Extract tabular data from a PDF where there is text in each "cell" that may overflow into the next cell.
 *
 * Columns are defined by the starting x-coordinate of each column in [TableParseConfig.cols].
 * Every table row becomes one [Record] of type [ROOT_TYPE] with one value per column.
 */
class TableParser( private val config:TableParseConfig) {

  companion object {
    private val log = LoggerFactory.getLogger( TableParser::class.java )

    internal val ROOT_TYPE = "row"
  }

  /**
   * Parse the texts of [pages] into records, one per table row.
   *
   * Pages rejected by the configured page filter are skipped. Texts outside the configured
   * top/bottom/right limits, or left of the first column, are ignored.
   */
  fun parse(pages:Sequence<Page>): Sequence<Record> {

    // columns ordered left to right
    val columns = config.cols.entries.sortedBy { it.value }

    // just the start x-index of each column, sorted
    val cols:Array<Float> = columns.map { it.value }.toTypedArray()

    // Cell index of each column, keyed in declaration order. Cells are indexed by x-position,
    // which differs from the declaration order when columns are not declared left to right.
    val orderedNames = columns.map { it.key }
    val cellIndexByName:Map<String,Int> = config.cols.keys.associateWith { orderedNames.indexOf( it ) }

    val pageFilter:PageFilter = config.pages.toPageFilter()

    return pages
        .filter { page -> pageFilter(page.pageNumber) }
        .flatMap { page ->

          val pageNumber = page.pageNumber

          // position filter
          val top:Float = config.getTop(pageNumber) ?: 0f
          val bottom:Float = config.getBottom(pageNumber) ?: Float.MAX_VALUE
          val pageLeft:Float = cols[0]
          val pageRight:Float = config.getRight(pageNumber) ?: Float.MAX_VALUE
          val positionFilter = { text:Text ->
            val y = text.uly
            val x = text.ulx
            ( y >= top ) && ( y <= bottom ) && ( x >= pageLeft ) && ( x <= pageRight )
          }

          // find the column based on x-value:
          // a column spans from its own start up to (excluding) the next start, or the right limit
          fun findCol( x:Float ):Int? {
            for ( colIndex in cols.indices ) {
              val left = cols[colIndex]
              val right = if ( colIndex+1 < cols.size ) cols[colIndex+1] else pageRight
              if ( x >= left && x < right ) {
                return colIndex
              }
            }
            return null
          }

          // table to add text to
          val table = Table(config.cols.size, config.maxRowDistance)

          // Run the extraction
          page.texts
              .filter ( positionFilter )
              .forEach { text ->
                findCol(text.ulx)?.let { colIndex ->
                  log.debug( "p${pageNumber} y${text.uly} c${colIndex} ${text.content}" )
                  table.addToCell(text.ulx, text.uly, colIndex, text.content)
                }
              }

          log.debug("processed page ${pageNumber}")

          table.getRows()
              .map { row ->
                createRecord( pageNumber, row, cellIndexByName )
              }

        }

  }

  private val valueTypes = config.valueTypes

  /**
   * Create a single record for a row. It has no children, just values.
   *
   * @param cellIndexByName index into [row] of each column's cell, keyed by column name
   */
  private fun createRecord( pageNumber:Int, row:Row, cellIndexByName:Map<String,Int> ):Record {

    val record = Record(source=null, pageNumber = pageNumber, typeId = ROOT_TYPE)

    cellIndexByName.forEach { (colName, cellIndex) ->
      val cell = row[cellIndex].map { text -> Value(text) }
      val value = valueTypes[colName]!!.calcValue( cell )
      record.values[colName] = value
    }

    return record
  }

}

internal typealias Row = List<Cell>
internal typealias Cell = List<String>
// Value types are dynamically generated from [cols].

/**
 * Configuration for table parsing.
 *
 * @property top default top limit (y-coordinate), overridable per page through [pageConfig]
 * @property bottom default bottom limit (y-coordinate), overridable per page
 * @property right default right limit (x-coordinate), overridable per page
 * @property cols column name to starting x-coordinate of the column
 * @property pageConfig per-page overrides, keyed by page number
 * @property types explicit value types by column name; columns without an entry get a
 *   default [ValueType] labelled with the column name
 * @property filter filter passed to the single record type
 * @param extractor passed to [TextExtractorOptions]
 * @param maxRowDistance passed to [TextExtractorOptions]
 * @param pageFilter passed to [TextExtractorOptions] as `pages`
 */
class TableParseConfig(
    var top:Float? = null,
    var bottom:Float? = null,
    var right:Float? = null,
    var cols:Map<String,Float>,
    var pageConfig:Map<Int,ConfigPage>? = null,
    val types:Map<String,ValueType> = emptyMap(),
    val filter:String? = null,
    extractor:String? = null,
    maxRowDistance: Float,
    pageFilter:String? = null ):
    TextExtractorOptions(
        extractor = extractor,
        maxRowDistance = maxRowDistance,
        pages = pageFilter ), RecordModel
{

  /** The single record type, whose value types are the column names. */
  override val recordTypes:Map<String,RecordType>
    get() = mapOf( TableParser.ROOT_TYPE to
        RecordType(
            label = TableParser.ROOT_TYPE,
            valueTypes = cols.keys.toList(),
            filter = filter
        ) )

  override val rootRecordType:String
    get() = TableParser.ROOT_TYPE

  /** One value type per column: the entry of [types] if present, otherwise a default one. */
  override val valueTypes:Map<String,ValueType>
    get() = cols
        .mapValues { (colName,_) ->
          types[colName] ?: ValueType(colName)
        }

  /** Top limit for [page]: the page-specific value if configured, otherwise [top]. */
  fun getTop(page:Int):Float? {
    return pageConfig?.get(page)?.top ?: top
  }

  /** Bottom limit for [page]: the page-specific value if configured, otherwise [bottom]. */
  fun getBottom(page:Int):Float? {
    return pageConfig?.get(page)?.bottom ?: bottom
  }

  /** Right limit for [page]: the page-specific value if configured, otherwise [right]. */
  fun getRight(page:Int):Float? {
    return pageConfig?.get(page)?.right ?: right
  }
}

/** Per-page overrides of the limits in [TableParseConfig]. */
data class ConfigPage(
    var top:Float? = null,
    var bottom:Float? = null,
    var right:Float? = null,
    var skip:Boolean = false)
/** Reads [TableParseConfig] from YAML. */
object TableParseConfigUtil {

  private val mapper = ObjectMapper(YAMLFactory()).registerKotlinModule()

  /** Parses the YAML text of [configFile]. */
  fun parseYaml(configFile:File):TableParseConfig =
      mapper.readValue<TableParseConfig>( configFile.readText() )

  /** Parses YAML read from [input]. */
  fun parseYaml(input:InputStream):TableParseConfig =
      mapper.readValue<TableParseConfig>( input )

}

/** The texts extracted from one page. */
data class Page(
    /** Page number. */
    val pageNumber:Int,
    /** Texts on page. */
    val texts:List<Text>
)
/** Predicate over page numbers: true when the page is selected. */
typealias PageFilter = (Int)->Boolean

/** Filter that selects every page. */
val ALL_PAGES:PageFilter = { _:Int -> true }

/**
 * Parse pages text to PageFilter.
 *
 * The receiver is a comma-separated list of page numbers and inclusive ranges,
 * e.g.: `1,3-5,100-103,400`. A range without an upper bound (`100-`) extends to the last
 * page. Whitespace around numbers is ignored. A null or blank receiver selects all pages.
 *
 * @throws NumberFormatException if a page number is not a valid integer
 */
fun String?.toPageFilter():PageFilter {
  val spec = this
  if (spec == null || spec.isBlank()) return ALL_PAGES

  val ranges:List<PageFilter> = spec.split(",").map { token -> parsePageRange( token.trim() ) }

  return { page:Int -> ranges.any { it(page) } }
}

/** Parse a single list element: either a page number or a `min-max` range. */
private fun parsePageRange( token:String ):PageFilter {
  if (!token.contains("-")) {
    val only = token.toInt()
    return { page:Int -> page == only }
  }
  val bounds = token.split("-")
  val min = bounds[0].trim().toInt()
  val max = intOrEnd(bounds[1])
  return { page:Int -> page in min..max }
}

/** Upper bound of a range: [Int.MAX_VALUE] when [s] is null or blank, otherwise its integer value. */
private fun intOrEnd( s:String? ): Int =
    if ( s == null || s.isBlank() ) Int.MAX_VALUE else s.trim().toInt()
/** Filter selecting the pages accepted by both the receiver and [pages]. */
fun PageFilter.and( pages:PageFilter):PageFilter = { i -> this(i) && pages(i) }

/** Filter selecting the pages accepted by both the receiver and the parsed page list [s]. */
fun PageFilter.and(s:String?):PageFilter = this.and( s.toPageFilter() )

/**
 * Reorders texts so that the texts of one row are emitted left to right.
 *
 * Consecutive texts belong to the same row until the page number changes or a text lies more
 * than [maxRowDistance] below the previous text. Each row is emitted sorted by upper-left
 * x-coordinate; the order of the rows themselves is unchanged.
 */
class RowGrouper( private val maxRowDistance:Float ) {

  /**
   * Group [source] into rows.
   *
   * The result is lazy: a row is emitted once the first text of the next row (or the end of
   * [source]) has been reached. Every iteration of the result starts from a fresh state.
   */
  fun group( source:Sequence<Text> ): Sequence<Text> = sequence {

    // texts of the row currently being collected
    var buffer = mutableListOf<Text>()

    for ( text in source ) {
      val last = buffer.lastOrNull()
      val rowBreak = last != null &&
          ( text.pageNumber != last.pageNumber || text.uly - last.uly > maxRowDistance )
      if ( rowBreak ) {
        // page break or gap of more than maxRowDistance: flush the collected row
        yieldAll( buffer.sortedBy( Text::ulx ) )
        buffer = mutableListOf()
      }
      buffer.add( text )
    }

    // flush the final row
    yieldAll( buffer.sortedBy( Text::ulx ) )
  }

}

/** Group the texts into rows with [RowGrouper]; returns the receiver unchanged if [maxRowDistance] is null. */
fun Sequence<Text>.groupRows(maxRowDistance:Float?): Sequence<Text> =
    if ( maxRowDistance != null ) {
      RowGrouper(maxRowDistance).group(this)
    } else {
      this
    }

/** Group the texts of each page into rows; returns the receiver unchanged if [maxRowDistance] is null. */
fun Sequence<Page>.groupRowsPaged(maxRowDistance:Float?): Sequence<Page> =
    if ( maxRowDistance != null ) {
      this.map { page ->
        Page(page.pageNumber, page.texts.asSequence().groupRows(maxRowDistance).toList())
      }
    } else {
      this
    }
/**
 * A piece of text extricated from a document, with its position and styling.
 *
 * Positions and dimensions are in points.
 * x=0 is the left edge.
 * y=0 is the top edge.
 *
 * @property content The text.
 * @property pageNumber Page Number
 * @property ulx upper-left x-coordinate
 * @property uly upper-left y-coordinate
 * @property lrx lower-right x-coordinate
 * @property lry lower-right y-coordinate
 * @property font Font name
 * @property fontSize Font size, in points
 * @property color Text color
 * @property backgroundColor Background color
 * @property link link url
 *
 * @property width Width, derived as `lrx - ulx`
 * @property height Height, derived as `lry - uly`
 */
data class Text(
    val content:String,
    val pageNumber:Int,
    val ulx:Float,
    val uly:Float,
    val lrx:Float,
    val lry:Float,
    val font:String,
    val fontSize:Float,
    val color:String? = null,
    val backgroundColor:String? = null,
    val link:String? = null ) {

  // Derived from the corner coordinates; the getter is marked transient.
  @get:Transient
  val width:Float
    get() = lrx - ulx

  // Derived from the corner coordinates; the getter is marked transient.
  @get:Transient
  val height:Float
    get() = lry - uly
}
/**
 * Writes [Text]s as CSV: a header record followed by one record per text.
 *
 * @param output stream that receives the CSV
 */
class CsvTextOutput(output:OutputStream):TextOutput {

  // NOTE(review): no charset is given, so the platform default encoding is used — confirm
  // this matches what the CSV text extractor expects when reading the file back.
  private val w = OutputStreamWriter(output)
  private val p = CSVPrinter(w, CSV_FORMAT)

  companion object {
    private val log = LoggerFactory.getLogger(CsvTextOutput::class.java)
    val CSV_FORMAT = CSVFormat.DEFAULT.withRecordSeparator("\n")
  }

  /** Closes the printer and the writer; the writer is closed even if closing the printer fails. */
  override fun close() {
    try {
      p.close()
    } finally {
      w.close()
    }
  }

  /** Writes the header record, then one record for each text in [seq]. */
  override fun write(seq:Sequence<Text>) {
    writeHeader()
    writeTexts(seq)
  }

  private fun writeTexts(seq:Sequence<Text>) {
    for ( text in seq ) {
      write( text )
    }
  }

  // Column names, in the same order as the values written by write(Text).
  private fun writeHeader() {
    p.printRecord(
        "page",
        "ulx",
        "uly",
        "lrx",
        "lry",
        "width",
        "height",
        "content",
        "font",
        "fontSize",
        "fontColor",
        "bgcolor",
        "link"
    )
  }

  private fun write(text:Text) {
    // If you change this, change CsvTextExtractor.parseRec(CSVRecord) to match.
    p.printRecord(
        text.pageNumber,
        text.ulx,
        text.uly,
        text.lrx,
        text.lry,
        text.width,
        text.height,
        text.content,
        text.font,
        text.fontSize,
        text.color,
        text.backgroundColor,
        text.link
    )
  }

}
/**
 * Writes [Text]s as a JSON array of pretty-printed objects.
 *
 * @param output stream that receives the JSON; it is not closed by this class
 */
class JsonTextOutput(private val output:OutputStream):TextOutput {

  // AUTO_CLOSE_TARGET is disabled so that writing one value does not close the stream
  private val writer = ObjectMapper()
      .configure(JsonGenerator.Feature.AUTO_CLOSE_TARGET,false)
      .writer().withDefaultPrettyPrinter()

  companion object {
    private val log = LoggerFactory.getLogger(JsonTextOutput::class.java)
  }

  /** Does nothing. */
  override fun close() {}

  /** Writes `[`, the texts of [seq] separated by commas, then `]`. */
  override fun write(seq:Sequence<Text>) {

    write("[")
    seq.forEachIndexed { index, text ->
      if ( index > 0 ) write(",")
      writer.writeValue(output,text)
    }
    write("]")
  }

  /** Writes the bytes of [s] straight to the output stream. */
  private fun write(s:String) {
    output.write( s.toByteArray() )
  }

}
/** A closeable destination for extracted [Text]s. */
interface TextOutput: Closeable {
  /** Writes the texts of [seq] to this output. */
  fun write( seq:Sequence<Text> )
}
#!/usr/bin/env bash

# Runs the Textricator command line interface with the jars in the lib/ directory that sits
# next to this script. Extra JVM arguments can be passed in the JAVA_OPTS environment variable;
# all script arguments are passed on to the CLI.

# Find script directory ($SCRIPT_DIR).
# On linux (and cygwin) "SCRIPT_DIR=$(dirname $(readlink -f $0))" works,
# but macOS' readlink is very different.
# This works on linux, cygwin, and macos:
SCRIPT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) && SCRIPT_DIR=$SCRIPT_DIR/$(basename -- "$0")
# Follow symlinks until the real script file is reached.
while [[ -h "$SCRIPT_DIR" ]]; do
  DIR=$(dirname -- "$SCRIPT_DIR")
  SYM=$(readlink "$SCRIPT_DIR")
  SCRIPT_DIR=$(cd "$DIR" && cd "$(dirname -- "$SYM")" && pwd)/$(basename -- "$SYM")
done
SCRIPT_DIR=$(dirname -- "$SCRIPT_DIR")

# JAVA_OPTS is left unquoted on purpose so that it splits into separate JVM arguments.
# exec replaces the shell with the JVM, so signals and the exit status apply to java directly.
exec java -cp "${SCRIPT_DIR}/lib/*" ${JAVA_OPTS} io.mfj.textricator.cli.TextricatorCli "$@"
/**
 * Run the examples. They make pretty good tests.
 *
 * Each example is parsed and the produced CSV is compared, line by line, with the expected
 * CSV stored next to the example.
 */
@RunWith(Parameterized::class)
class ExamplesTest( private val name:String, private val type:Type) {

  enum class Type { FORM, TABLE }

  companion object {

    /**
     * Examples.
     * Map of file name (without extension) to parse type.
     * For each of these, there must be a .pdf, a .yml, and a .csv in src/test/resources/io/mfj/textricator/examples/.
     */
    val examples = mapOf(
        "rap-sheet" to Type.FORM,
        "school-employee-list" to Type.FORM
    )

    @JvmStatic
    @Parameterized.Parameters(name="{1}:{0}")
    fun data() = examples.entries.map { (name,type) -> arrayOf( name, type ) }

    /** Compare the lines of [a] (expected) with the lines of [b] (actual). */
    private fun compare( a:BufferedReader, b:BufferedReader ) = compare(
        a.lineSequence().iterator(), b.lineSequence().iterator())

    /** Fail if [b] has fewer lines than [a], more lines than [a], or any line that differs. */
    private fun compare( a:Iterator<String>, b:Iterator<String> ) {
      var line = 0
      while ( a.hasNext() ) {
        line++
        assertTrue( "Missing line @${line}" ) { b.hasNext() }
        val aline = a.next()
        val bline = b.next()
        assertEquals( aline, bline, "Incorrect line @${line}" )
      }
      line++
      assertFalse( "Extra line @${line}" ) { b.hasNext() }
    }
  }

  /** Open the resource of this example with the given file extension. */
  private fun resource( extension:String ) =
      ExamplesTest::class.java.getResourceAsStream( "${name}.${extension}" )!!

  /** Assert that [actual] has the same lines as the expected CSV of this example. */
  private fun assertMatchesExpected( actual:File ) {
    actual.bufferedReader().use { b ->
      resource( "csv" ).bufferedReader().use { a ->
        compare(a, b)
      }
    }
  }

  /** Parse [input] as a form with this example's configuration, writing record CSV to [output]. */
  private fun parseFormToCsv( input:java.io.InputStream, inputFormat:String, output:java.io.OutputStream ) {
    resource( "yml" ).use { config ->
      Textricator.parseForm(
          input = input,
          inputFormat = inputFormat,
          output = output,
          outputFormat = "csv",
          config = FormParseConfigUtil.parseYaml( config )
      )
    }
  }

  /** Extract the text of this example's PDF to [output] in [outputFormat]. */
  private fun extractPdfText( output:java.io.OutputStream, outputFormat:String ) {
    resource( "yml" ).use { config ->
      resource( "pdf" ).use { pdf ->
        Textricator.extractText(
            input = pdf,
            inputFormat = "pdf",
            output = output,
            outputFormat = outputFormat,
            textExtractorOptions = FormParseConfigUtil.parseYaml( config )
        )
      }
    }
  }

  /**
   * Test that parsing the PDF directly produces the expected CSV.
   */
  @Test
  fun test() {
    val outCsv = File.createTempFile( name, ".csv" )
    outCsv.deleteOnExit()
    try {
      // run textricator
      outCsv.outputStream().use { out ->
        when ( type ) {
          Type.FORM -> {
            resource( "pdf" ).use { pdf ->
              parseFormToCsv( pdf, "pdf", out )
            }
          }
          Type.TABLE -> {
            resource( "yml" ).use { config ->
              resource( "pdf" ).use { pdf ->
                Textricator.parseTable(
                    input = pdf,
                    inputFormat = "pdf",
                    output = out,
                    outputFormat = "csv",
                    config = TableParseConfigUtil.parseYaml( config )
                )
              }
            }
          }
        }
      }

      // make sure CSVs match
      assertMatchesExpected( outCsv )
    } finally {
      outCsv.delete()
    }

  }

  /**
   * Test that if we extract text to JSON and then use that as input, it works.
   */
  @Test
  fun testJson() {
    val textJson = File.createTempFile( name, "-text.json" )
    textJson.deleteOnExit()
    val outCsv = File.createTempFile( name, ".csv")
    outCsv.deleteOnExit()
    try {
      // pdf -> text json
      textJson.outputStream().use { out ->
        when ( type ) {
          Type.FORM -> extractPdfText( out, "json" )
          Type.TABLE -> TODO()
        }
      }

      // text json -> record csv
      outCsv.outputStream().use { csv ->
        textJson.inputStream().use { json ->
          when ( type ) {
            Type.FORM -> parseFormToCsv( json, "json", csv )
            Type.TABLE -> TODO()
          }
        }
      }

      // make sure CSVs match
      assertMatchesExpected( outCsv )
    } finally {
      outCsv.delete()
      textJson.delete()
    }
  }

  /**
   * Test that if we extract text to CSV and then use that as input, it works.
   */
  @Test
  fun testCSV() {
    val textCsv = File.createTempFile( name, "-text.csv" )
    textCsv.deleteOnExit()
    val outCsv = File.createTempFile( name, ".csv")
    outCsv.deleteOnExit()
    try {
      // pdf -> text csv
      textCsv.outputStream().use { out ->
        when ( type ) {
          Type.FORM -> extractPdfText( out, "csv" )
          Type.TABLE -> TODO()
        }
      }

      // text csv -> record csv
      outCsv.outputStream().use { csv ->
        textCsv.inputStream().use { textInput ->
          when ( type ) {
            Type.FORM -> parseFormToCsv( textInput, "csv", csv )
            Type.TABLE -> TODO()
          }
        }
      }

      // make sure CSVs match
      assertMatchesExpected( outCsv )
    } finally {
      outCsv.delete()
      textCsv.delete()
    }
  }

}

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package io.mfj.textricator.form

import io.mfj.textricator.form.config.FormParseConfig
import io.mfj.textricator.form.config.State
import io.mfj.textricator.record.PatternReplacement
import io.mfj.textricator.record.RecordType
import io.mfj.textricator.record.Value
import io.mfj.textricator.record.ValueType

import org.junit.Assert.*
import org.junit.Test

/**
 * Checks that one state (`cityState`) can feed several value types (`city` and `state`),
 * each of which applies its own pattern replacement to the same captured text.
 */
class NodeMembersTest {

  /** One flat record type; `city` and `state` are both derived from the `cityState` state. */
  val model = FormParseConfig(
      rootRecordType = "name",
      recordTypes = mutableMapOf(
          "name" to RecordType(label = "name",
              valueTypes = mutableListOf("inmateName", "city", "state", "inmateDOB", "arrestDate"))),
      valueTypes = mutableMapOf(
          "inmateName" to ValueType(label = "Inmate Name"),
          // keep what is before the comma
          "city" to ValueType(label = "City", replacements = mutableListOf(PatternReplacement("(.*),.*", "$1"))),
          // keep what is after the comma
          "state" to ValueType(label = "State", replacements = mutableListOf(PatternReplacement(".*,(.*)", "$1"))),
          "inmateDOB" to ValueType(label = "Inmate DOB"),
          "arrestDate" to ValueType(label = "Arrest Date")),
      states = mutableMapOf(
          "inmateName" to State(startRecord = true),
          "cityState" to State(valueTypes = mutableListOf("city", "state")),
          "inmateDOB" to State(),
          "arrestDate" to State()))

  val rp = RecordParser(model)

  /**
   * Builds a [StateValue] for [stateId] on page [pageNumber] holding the given text values.
   *
   * @throws Exception if [stateId] is not a state of [model].
   */
  private fun sv(pageNumber: Int, stateId: String, vararg value: String) :StateValue = StateValue(
      source = "test",
      pageNumber = pageNumber, stateId = stateId,
      state = model.states[stateId] ?: throw Exception("Missing State: ${stateId}"),
      values = value.toList().map{ Value(it) })

  @Test
  fun testNodeMembers() {
    // Two records; each starts at "inmateName".
    val stateValues: List<StateValue> = listOf(
        sv(1, "inmateName", "John Doe"),
        sv(1, "cityState", "New York,NY"),
        sv(1, "inmateDOB", "01/01/1985"),
        sv(1, "arrestDate", "02/15/2018"),
        sv(1, "inmateName", "Jane Doe"),
        sv(1, "cityState", "Rochester,NY"),
        sv(1, "inmateDOB", "12/12/1975"),
        sv(1, "arrestDate", "02/13/2018")
    )

    val records = rp.parse(stateValues.asSequence()).toList()

    assertEquals("New York", records[0].values["city"]?.text)
    assertEquals("NY", records[0].values["state"]?.text)
    assertEquals("Rochester", records[1].values["city"]?.text)
    assertEquals("NY", records[1].values["state"]?.text)

  }
}

--------------------------------------------------------------------------------
/src/test/java/io/mfj/textricator/form/PatternReplacementTest.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package io.mfj.textricator.form

import io.mfj.textricator.form.config.FormParseConfig
import io.mfj.textricator.form.config.State
import io.mfj.textricator.record.PatternReplacement
import io.mfj.textricator.record.RecordType
import io.mfj.textricator.record.Value
import io.mfj.textricator.record.ValueType

import org.junit.Assert.*
import org.junit.Test

/**
 * Exercises [PatternReplacement]s configured on value types: a single pattern with capture
 * groups, several alternative patterns, and several patterns that match the same text.
 */
class PatternReplacementTest {

  val model = FormParseConfig(
      rootRecordType = "name",
      recordTypes = mutableMapOf(
          "name" to RecordType(label = "name",
              valueTypes = mutableListOf("inmateName", "inmateAge", "inmateRace", "arrestDateTime"))),
      valueTypes = mutableMapOf(
          "inmateName" to ValueType(label = "Inmate Name"),
          "inmateAge" to ValueType(label = "Inmate Age",
              replacements = mutableListOf(PatternReplacement("30", "thirty"), PatternReplacement("40", "forty"))),
          // Both patterns match "W" on purpose: the test expects the first one to be applied.
          "inmateRace" to ValueType(label = "Inmate Race",
              replacements = mutableListOf(PatternReplacement("W", "white"), PatternReplacement("W", "black"))),
          "arrestDateTime" to ValueType(label = "Arrest Date Time", replacements = mutableListOf(
              PatternReplacement(pattern = "(.*)\\ 12/30/1899\\ (.*)", replacement = "$1 $2")))),
      states = mutableMapOf(
          "inmateName" to State(startRecord = true),
          "inmateAge" to State(),
          "inmateRace" to State(),
          "arrestDateTime" to State()))

  val rp = RecordParser(model)

  /**
   * Builds a [StateValue] for [stateId] on page [pageNumber] holding the given text values.
   *
   * @throws Exception if [stateId] is not a state of [model].
   */
  private fun sv(pageNumber: Int, stateId: String, vararg value: String) :StateValue = StateValue(
      source = "test",
      pageNumber = pageNumber, stateId = stateId,
      state = model.states[stateId] ?: throw Exception("Missing State: ${stateId}"),
      values = value.toList().map{ Value(it) })

  @Test
  fun testPatternReplacement() {
    val stateValues: List<StateValue> = listOf(
        sv(1, "inmateName", "John Doe"),
        sv(1, "inmateAge", "30"),
        sv(1, "inmateRace", "W"),
        sv(1, "arrestDateTime", "02/15/2018 12/30/1899 03:33:00 AM"),
        sv(1, "inmateName", "Jane Doe"),
        sv(1, "inmateAge", "40"),
        sv(1, "inmateRace", "W"),
        sv(1, "arrestDateTime", "02/13/2018 05:55:00 PM")
    )

    val records = rp.parse(stateValues.asSequence()).toList()

    // test that removing 12/30/1899 from arrestDateTime works
    assertEquals("02/15/2018 03:33:00 AM", records[0].values["arrestDateTime"]?.text)

    // test that nothing is removed from arrestDateTime when 12/30/1899 isn't there
    assertEquals("02/13/2018 05:55:00 PM", records[1].values["arrestDateTime"]?.text)

    // test that correct replacement is used when there are multiple patterns as options
    assertEquals("thirty", records[0].values["inmateAge"]?.text)
    assertEquals("forty", records[1].values["inmateAge"]?.text)

    // test that first replacement is used when there are multiple matching patterns
    assertEquals("white", records[0].values["inmateRace"]?.text)
    assertEquals("white", records[1].values["inmateRace"]?.text)
  }
}

--------------------------------------------------------------------------------
/src/test/java/io/mfj/textricator/form/RecordParserTest.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package io.mfj.textricator.form

import io.mfj.textricator.record.Record
import io.mfj.textricator.record.RecordType
import io.mfj.textricator.form.config.FormParseConfig
import io.mfj.textricator.form.config.State
import io.mfj.textricator.record.Value

import org.junit.Assert.*
import org.junit.Test

/**
 * Tests for [RecordParser], using a four-level record model: date > person > case > charge.
 */
class RecordParserTest {

  val model = FormParseConfig(
      rootRecordType = "date",
      recordTypes = mutableMapOf(
          "date" to RecordType(label = "Date", children = mutableListOf("person"), valueTypes = mutableListOf("date")),
          "person" to RecordType(label = "Person", children = mutableListOf("case"),
              valueTypes = mutableListOf("name", "dob")),
          "case" to RecordType(label = "Case", children = mutableListOf("charge"),
              valueTypes = mutableListOf("caseId", "court")),
          "charge" to RecordType(label = "Charge", valueTypes = mutableListOf("chargeCode", "chargeDesc"))),
      states = mutableMapOf(
          "date" to State(startRecord = true, startRecordForEachValue = true),
          "name" to State(startRecord = true),
          "dob" to State(),
          "caseId" to State(startRecord = true),
          "court" to State(),
          "chargeCode" to State(startRecord = true),
          "chargeDesc" to State()))

  val rp = RecordParser(model)

  /**
   * Builds a [StateValue] for [stateId] on page [pageNumber] holding the given text values.
   *
   * @throws Exception if [stateId] is not a state of [model].
   */
  private fun sv( pageNumber:Int, stateId:String, vararg value:String ):StateValue = StateValue(
      source = "test",
      pageNumber = pageNumber, stateId = stateId,
      state = model.states[stateId] ?: throw Exception("Missing state ${stateId}"),
      values = value.toList().map{ Value(it) })

  /**
   * Parses a stream of state values and checks that the resulting record tree only nests
   * record types the way [model] allows.
   */
  @Test
  fun testStructure() {

    val stateValues:List<StateValue> = listOf(
        sv( 1, "date", "2018-01-01" ),
        sv( 1, "date", "2018-01-02" ),
        sv( 1, "name", "John Doe" ),
        sv( 1, "dob", "1980-01-01" ),
        sv( 1, "caseId", "CASE001" ),
        sv( 1, "court", "Kangaroo" ),
        sv( 1, "chargeCode", "AGGRAV ASSLT" ),
        sv( 1, "chargeDesc", "W DEADLY WEAPON" ),
        sv( 1, "chargeCode", "AGGRAV BATTERY" ),
        sv( 1, "chargeDesc", "BODILY HARM" ),
        sv( 1, "caseId", "CASE002" ),
        sv( 1, "court", "Kangaroo" ),
        sv( 1, "chargeCode", "MOVING VIOL" ),
        sv( 1, "chargeDesc", "30+" ),
        sv( 1, "chargeCode", "NONMOVING VIOL" ),
        sv( 1, "chargeDesc", "NO LICENSE" ),
        sv( 1, "name", "Jane Doe" ),
        sv( 1, "dob", "1990-01-01" ),
        sv( 1, "caseId", "CASE001" ),
        sv( 1, "court", "Kangaroo" ),
        sv( 1, "chargeCode", "VEH THEFT" ),
        sv( 1, "chargeDesc", "GRAND 3RD" ),
        sv( 1, "chargeCode", "NONMOVING VIOL" ),
        sv( 1, "chargeDesc", "NO LICESNSE" )
    )

    val records = rp.parse( stateValues.asSequence() ).toList()

    validateRecordsAgainstModel( records )
  }

  /** Asserts that every top-level record is of the model's root type, then checks its subtree. */
  private fun validateRecordsAgainstModel( records:List<Record> ) {
    records.forEach { record ->
      assertEquals( "Root records must be of type \"${model.rootRecordType}\" - found \"${record.typeId}\"",
          model.rootRecordType, record.typeId )

      validateRecordAgainstModel( record )
    }
  }

  /** Recursively asserts that [record] only has children of the types its record type lists. */
  private fun validateRecordAgainstModel( record:Record) {
    val childAllowedTypes = model.recordTypes[record.typeId]!!.children.toSet()

    record.children.keys.forEach { childType ->
      assertTrue( "Records of Type \"${record.typeId}\" may only contain children of types ${childAllowedTypes} - found \"${childType}\"",
          childAllowedTypes.contains( childType ) )
    }
    record.children.values.forEach { children ->
      children.forEach { child ->
        validateRecordAgainstModel( child )
      }
    }
  }


  @Test
  fun testAncestorThatIsAChildOf() {
    assertEquals( "person", rp.findAncestorThatIsAChildOf("person","date") )
    assertEquals( "person", rp.findAncestorThatIsAChildOf("case","date") )
    assertEquals( "person", rp.findAncestorThatIsAChildOf("charge","date") )
    assertEquals( "case", rp.findAncestorThatIsAChildOf("case","person") )
    assertEquals( "case", rp.findAncestorThatIsAChildOf("charge","person") )
    assertEquals( "charge", rp.findAncestorThatIsAChildOf("charge","case") )
  }

  /**
   * Make sure that if a state that is required for another state to start a new record
   * ([State.startRecordRequiredState])
   * has [State.include] = false
   * that the state is still noticed for the purposes of [State.startRecordRequiredState].
   *
   * Previously, the [StateValue]s with [State.include]=false were filtered out too early and this did not work.
   */
  @Test
  fun testStartRecordIfSeenIgnoredState() {
    val model = FormParseConfig(rootRecordType = "person",
        recordTypes = mutableMapOf(
            "person" to RecordType(label = "Person", valueTypes = mutableListOf("name"))
        ),
        states = mutableMapOf(
            "name" to State(startRecord = true, startRecordRequiredState = "label"),
            "label" to State(include = false)
        )
    )
    val stateValues = listOf(
        StateValue(
            source = "test",
            pageNumber = 1,
            values = listOf( Value("Fred") ),
            state = model.states["name"]!!,
            stateId = "name"
        ),
        StateValue(
            source = "test",
            pageNumber = 1,
            values = listOf( Value("label") ),
            state = model.states["label"]!!,
            stateId = "label"
        ),
        StateValue(
            source = "test",
            pageNumber = 1,
            values = listOf( Value("Sally") ),
            state = model.states["name"]!!,
            stateId = "name"
        )
    ).asSequence()

    val records = RecordParser(model).parse( stateValues ).toList()

    assertEquals( 2, records.size )
    assertEquals( "Fred", records[0].values["name"]?.text )
    assertEquals( "Sally", records[1].values["name"]?.text )
  }

}

--------------------------------------------------------------------------------
/src/test/java/io/mfj/textricator/record/RecordFilterTest.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package io.mfj.textricator.record

import io.mfj.expr.ExDataType
import io.mfj.textricator.form.config.FormParseConfig

import org.junit.Assert.*
import org.junit.Test

/**
 * Tests for [RecordFilter] over a small a > (b, c > d) record hierarchy.
 */
class RecordFilterTest {

  /** With no filter configured, the records come back unchanged. */
  @Test
  fun testNoFilter() {

    val model = FormParseConfig(
        rootRecordType = "a",
        recordTypes = mutableMapOf(
            "a" to RecordType(label = "A", children = mutableListOf("b", "c"), valueTypes = mutableListOf("a1", "a2")),
            "b" to RecordType(label = "B", children = mutableListOf(), valueTypes = mutableListOf("b1", "b2")),
            "c" to RecordType(label = "C", children = mutableListOf("d"), valueTypes = mutableListOf("c1", "c2")),
            "d" to RecordType(label = "D", valueTypes = mutableListOf("d1", "d2"))))

    val records = listOf(
        Record(1, "a", mutableMapOf("a1" to Value("Hello"), "a2" to Value("World")), mutableMapOf(
            "b" to mutableListOf(Record(1, "b", mutableMapOf("b1" to Value("1"), "b2" to Value("2")))),
            "c" to mutableListOf(Record(1, "c", mutableMapOf("c1" to Value("1"), "c2" to Value("2")),
                mutableMapOf("d" to mutableListOf(
                    Record(1, "d", mutableMapOf("d1" to Value("1"), "d2" to Value("2")))))))))
    )

    val filtered = RecordFilter(model).filter(records.asSequence()).toList()

    assertEquals( records, filtered )

  }

  /** A root-level filter that does not match removes the whole record. */
  @Test
  fun testTopFilter() {

    val model = FormParseConfig(
        rootRecordType = "a",
        recordTypes = mutableMapOf(
            "a" to RecordType(label = "A", filter = """ a1 = "x" """, children = mutableListOf("b", "c"),
                valueTypes = mutableListOf("a1", "a2")),
            "b" to RecordType(label = "B", children = mutableListOf(), valueTypes = mutableListOf("b1", "b2")),
            "c" to RecordType(label = "C", children = mutableListOf("d"), valueTypes = mutableListOf("c1", "c2")),
            "d" to RecordType(label = "D", valueTypes = mutableListOf("d1", "d2"))))

    val records = listOf(
        Record(1, "a", mutableMapOf("a1" to Value("Hello"), "a2" to Value("World")), mutableMapOf(
            "b" to mutableListOf(Record(1, "b", mutableMapOf("b1" to Value("1"), "b2" to Value("2")))),
            "c" to mutableListOf(Record(1, "c", mutableMapOf("c1" to Value("1"), "c2" to Value("2")),
                mutableMapOf("d" to mutableListOf(
                    Record(1, "d", mutableMapOf("d1" to Value("1"), "d2" to Value("2")))))))))
    )

    val filtered = RecordFilter(model).filter(records.asSequence()).toList()

    assertTrue( filtered.isEmpty() )

  }

  /** A child filter that matches no "c" record leaves the parent with an empty "c" list. */
  @Test
  fun testChildFilter() {

    val model = FormParseConfig(
        rootRecordType = "a",
        recordTypes = mutableMapOf(
            "a" to RecordType(label = "A", filter = """ a1 = "Hello" and a2 = "World" """,
                children = mutableListOf("b", "c"), valueTypes = mutableListOf("a1", "a2")),
            "b" to RecordType(label = "B", children = mutableListOf(), valueTypes = mutableListOf("b1", "b2")),
            "c" to RecordType(label = "C", filter = """ c1 = "2" """, children = mutableListOf("d"),
                valueTypes = mutableListOf("c1", "c2")),
            "d" to RecordType(label = "D", valueTypes = mutableListOf("d1", "d2"))))

    val records = listOf(
        Record(1, "a", mutableMapOf("a1" to Value("Hello"), "a2" to Value("World")), mutableMapOf(
            "b" to mutableListOf(Record(1, "b", mutableMapOf("b1" to Value("1"), "b2" to Value("2")))),
            "c" to mutableListOf(Record(1, "c", mutableMapOf("c1" to Value("1"), "c2" to Value("2")),
                mutableMapOf("d" to mutableListOf(
                    Record(1, "d", mutableMapOf("d1" to Value("1"), "d2" to Value("2")))))))))
    )

    val expected = listOf(
        Record(1, "a", mutableMapOf("a1" to Value("Hello"), "a2" to Value("World")), mutableMapOf(
            "b" to mutableListOf(Record(1, "b", mutableMapOf("b1" to Value("1"), "b2" to Value("2")))),
            "c" to mutableListOf()))
    )

    val filtered = RecordFilter(model).filter(records.asSequence()).toList()

    assertEquals( expected, filtered )

  }

  /** A child filter that matches only one of two "c" records keeps just that one. */
  @Test
  fun testChildFilterSome() {

    val model = FormParseConfig(
        rootRecordType = "a",
        recordTypes = mutableMapOf(
            "a" to RecordType(label = "A", filter = """ a1 = "Hello" and a2 = "World" """,
                children = mutableListOf("b", "c"), valueTypes = mutableListOf("a1", "a2")),
            "b" to RecordType(label = "B", children = mutableListOf(), valueTypes = mutableListOf("b1", "b2")),
            "c" to RecordType(label = "C", filter = """ c1 = "2" """, children = mutableListOf("d"),
                valueTypes = mutableListOf("c1", "c2")),
            "d" to RecordType(label = "D", valueTypes = mutableListOf("d1", "d2"))))

    val records = listOf(
        Record(1, "a", mutableMapOf("a1" to Value("Hello"), "a2" to Value("World")), mutableMapOf(
            "b" to mutableListOf(Record(1, "b", mutableMapOf("b1" to Value("1"), "b2" to Value("2")))),
            "c" to mutableListOf(Record(1, "c", mutableMapOf("c1" to Value("1"), "c2" to Value("2")),
                mutableMapOf("d" to mutableListOf(
                    Record(1, "d", mutableMapOf("d1" to Value("1"), "d2" to Value("2")))))),
                Record(1, "c", mutableMapOf("c1" to Value("2"), "c2" to Value("3")), mutableMapOf(
                    "d" to mutableListOf(
                        Record(1, "d", mutableMapOf("d1" to Value("1"), "d2" to Value("2")))))))))
    )

    val expected = listOf(
        Record(1, "a", mutableMapOf("a1" to Value("Hello"), "a2" to Value("World")), mutableMapOf(
            "b" to mutableListOf(Record(1, "b", mutableMapOf("b1" to Value("1"), "b2" to Value("2")))),
            "c" to mutableListOf(Record(1, "c", mutableMapOf("c1" to Value("2"), "c2" to Value("3")),
                mutableMapOf("d" to mutableListOf(
                    Record(1, "d", mutableMapOf("d1" to Value("1"), "d2" to Value("2")))))))))
    )

    val filtered = RecordFilter(model).filter(records.asSequence()).toList()

    assertEquals( expected, filtered )

  }

  /**
   * Filters on members declared as string, as number, with no type set, and with no
   * value type declared at all; only the first record satisfies every clause.
   */
  @Test
  fun testTypes() {

    val model = FormParseConfig(
        rootRecordType = "a",
        recordTypes = mutableMapOf("a" to RecordType(label = "A", filter = """
          str = "a string"
          and
          3 < int < 5
          and
          1.1 < dbl < 1.3
          and
          none = "member type not set"
          and
          undef = "member not defined"
          """, valueTypes = mutableListOf("str", "int", "dbl", "none", "undef"), children = mutableListOf())),
        valueTypes = mutableMapOf(
            "str" to ValueType(type = ExDataType.STRING.name.lowercase()),
            "int" to ValueType(type = ExDataType.NUMBER.name.lowercase()),
            "dbl" to ValueType(type = ExDataType.NUMBER.name.lowercase()),
            "none" to ValueType( /* type not set */)))

    val records = listOf(
        // 1: every member satisfies the filter
        Record(1, "a",
            mutableMapOf("str" to Value("a string"), "int" to Value("4"), "dbl" to Value("1.2"), "none" to Value("member type not set"),
                "undef" to Value("member not defined"))),
        // 2: str differs
        Record(2, "a",
            mutableMapOf("str" to Value("wrong"), "int" to Value("4"), "dbl" to Value("1.2"), "none" to Value("member type not set"),
                "undef" to Value("member not defined"))),
        // 3: int out of range
        Record(3, "a",
            mutableMapOf("str" to Value("a string"), "int" to Value("6"), "dbl" to Value("1.2"), "none" to Value("member type not set"),
                "undef" to Value("member not defined"))),
        // 4: dbl out of range
        Record(4, "a",
            mutableMapOf("str" to Value("a string"), "int" to Value("4"), "dbl" to Value("1.7"), "none" to Value("member type not set"),
                "undef" to Value("member not defined"))),
        // 5: none differs
        Record(5, "a",
            mutableMapOf("str" to Value("a string"), "int" to Value("4"), "dbl" to Value("1.2"), "none" to Value("wrong"),
                "undef" to Value("member not defined"))),
        // 6: undef differs
        Record(6, "a",
            mutableMapOf("str" to Value("a string"), "int" to Value("4"), "dbl" to Value("1.2"), "none" to Value("member type not set"),
                "undef" to Value("wrong")))
    )

    val filtered = RecordFilter(model).filter(records.asSequence()).toList()

    assertEquals( 1, filtered.size )
    assertEquals( records[0], filtered[0] )

  }
}

--------------------------------------------------------------------------------
/src/test/java/io/mfj/textricator/record/output/CsvOutputTest.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package io.mfj.textricator.record.output

import io.mfj.textricator.record.Record
import io.mfj.textricator.form.config.FormParseConfigUtil
import io.mfj.textricator.record.Value

import java.io.ByteArrayOutputStream

import org.junit.*
import org.junit.Assert.*

/**
 * Writes a nested person > (case > fee, address) record through [CsvRecordOutput] and
 * compares the text it produces with the expected CSV.
 */
class CsvOutputTest {

  @Test
  fun test() {
    // NOTE(review): the nesting of this YAML was rebuilt from its key structure (the flattened
    // source carried no indentation) - confirm against the original file.
    val model = FormParseConfigUtil.parseYaml( """
rootRecordType: person
recordTypes:
  person:
    label: "Person"
    valueTypes:
      - name
      - dob
    children:
      - case
      - address
  case:
    label: "Case"
    valueTypes:
      - caseId
      - date
    children:
      - fee
  fee:
    label: "Fee"
    valueTypes:
      - desc
      - amount
  address:
    label: "Address"
    valueTypes:
      - city
      - state
valueTypes:
  name:
    label: "Name"
  dob:
    label: "Date of Birth"
  caseId:
    label: "Case ID"
  date:
    label: "Date"
  desc:
    label: "Fee Description"
  amount:
    label: "Fee Amount"
states:
  name: {}
  dob: {}
  caseId: {}
  date: {}
  desc: {}
  amount: {}
  city: {}
""" )

    // One person with three cases (carrying 0, 1 and 2 fees) and two addresses.
    val root = Record(1, "person", mutableMapOf("name" to Value("John Doe"), "dob" to Value("1/1/1980")),
        mutableMapOf(
            "case" to mutableListOf(
                Record(1, "case", mutableMapOf("caseId" to Value("1"), "date" to Value("1/1/2018"))),
                Record(1, "case", mutableMapOf("caseId" to Value("2"), "date" to Value("3/3/2018")),
                    mutableMapOf("fee" to mutableListOf(
                        Record(1, "fee", mutableMapOf("desc" to Value("court fee"), "amount" to Value("99.95")))))),
                Record(1, "case", mutableMapOf("caseId" to Value("3"), "date" to Value("7/1/2018")),
                    mutableMapOf("fee" to mutableListOf(
                        Record(1, "fee", mutableMapOf("desc" to Value("court fee"), "amount" to Value("10"))),
                        Record(1, "fee", mutableMapOf("desc" to Value("jail fee"), "amount" to Value("5.99"))))))),
            "address" to mutableListOf(
                Record(1, "address", mutableMapOf("city" to Value("Rochester"), "state" to Value("NY"))),
                Record(1, "address", mutableMapOf("city" to Value("Pittsburgh"), "state" to Value("PA"))))))

    val buffer = ByteArrayOutputStream()
    CsvRecordOutput(model,buffer).use { it.write(sequenceOf(root) ) }

    // Rows start at column 0: only the ends of the whole text are trimmed before comparing,
    // so leading whitespace on an inner row would be compared literally.
    val expected = """
page,Name,Date of Birth,Case ID,Date,Fee Description,Fee Amount,city,state
1,John Doe,1/1/1980,1,1/1/2018,,,,
1,John Doe,1/1/1980,2,3/3/2018,court fee,99.95,,
1,John Doe,1/1/1980,3,7/1/2018,court fee,10,,
1,John Doe,1/1/1980,3,7/1/2018,jail fee,5.99,,
1,John Doe,1/1/1980,,,,,Rochester,NY
1,John Doe,1/1/1980,,,,,Pittsburgh,PA
"""

    assertEquals( expected.trim(), String(buffer.toByteArray()).trim())

  }

}

--------------------------------------------------------------------------------
/src/test/java/io/mfj/textricator/record/output/XmlOutputTest.kt:
--------------------------------------------------------------------------------
/*
This file is part of Textricator.
Copyright 2018 Measures for Justice Institute.
Copyright 2021 Stephen Byrne.

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */ 17 | 18 | package io.mfj.textricator.record.output 19 | 20 | import io.mfj.textricator.record.Record 21 | import io.mfj.textricator.form.config.FormParseConfigUtil 22 | import io.mfj.textricator.record.Value 23 | 24 | import java.io.ByteArrayOutputStream 25 | 26 | import com.fasterxml.jackson.dataformat.xml.XmlMapper 27 | 28 | import org.junit.* 29 | import org.junit.Assert.* 30 | 31 | class XmlOutputTest { 32 | 33 | @Test 34 | fun test() { 35 | val model = FormParseConfigUtil.parseYaml( """ 36 | rootRecordType: person 37 | recordTypes: 38 | person: 39 | label: "Person" 40 | valueTypes: 41 | - name 42 | - dob 43 | children: 44 | - case 45 | - address 46 | case: 47 | label: "Case" 48 | valueTypes: 49 | - caseId 50 | - date 51 | children: 52 | - fee 53 | fee: 54 | label: "Fee" 55 | valueTypes: 56 | - desc 57 | - amount 58 | address: 59 | label: "Address" 60 | valueTypes: 61 | - city 62 | - state 63 | valueTypes: 64 | name: 65 | label: "Name" 66 | dob: 67 | label: "Date of Birth" 68 | caseId: 69 | label: "Case ID" 70 | date: 71 | label: "Date" 72 | desc: 73 | label: "Fee Description" 74 | amount: 75 | label: "Fee Amount" 76 | states: 77 | name: {} 78 | dob: {} 79 | caseId: {} 80 | date: {} 81 | desc: {} 82 | amount: {} 83 | city: {} 84 | """ ) 85 | 86 | val root = Record(1, "person", mutableMapOf("name" to Value("John Doe"), "dob" to Value("1/1/1980")), 87 | mutableMapOf("case" to mutableListOf( 88 | Record(1, "case", mutableMapOf("caseId" to Value("1"), "date" to Value("1/1/2018"))), 89 | Record(1, "case", mutableMapOf("caseId" to Value("2"), "date" to Value("3/3/2018")), 90 | mutableMapOf("fee" to mutableListOf(Record(1, "fee", 91 | mutableMapOf("desc" to Value("court fee"), "amount" to Value("99.95")))))), 92 | Record(1, "case", mutableMapOf("caseId" to Value("3"), "date" to Value("7/1/2018")), 93 | mutableMapOf("fee" to mutableListOf( 94 | Record(1, "fee", mutableMapOf("desc" to Value("court fee"), "amount" to Value("10"))), 95 | Record(1, "fee", 96 | 
mutableMapOf("desc" to Value("jail fee"), "amount" to Value("5.99"))))))), "address" to mutableListOf( 97 | Record(1, "address", mutableMapOf("city" to Value("Rochester"), "state" to Value("NY"))), 98 | Record(1, "address", mutableMapOf("city" to Value("Pittsburgh"), "state" to Value("PA")))))) 99 | 100 | val buffer = ByteArrayOutputStream() 101 | XmlRecordOutput(model,buffer).use { it.write(sequenceOf(root) ) } 102 | val str = String(buffer.toByteArray()).trim() 103 | val normalized = normalize(str) 104 | 105 | val expected = normalize( """ 106 | 107 | 108 | 1 109 | person 110 | 111 | 112 | John Doe 113 | 114 | 115 | 1/1/1980 116 | 117 | 118 | 119 | 120 | 1 121 | case 122 | 123 | 124 | 1 125 | 126 | 127 | 1/1/2018 128 | 129 | 130 | 131 | 132 | 1 133 | case 134 | 135 | 136 | 2 137 | 138 | 139 | 3/3/2018 140 | 141 | 142 | 143 | 144 | 1 145 | fee 146 | 147 | 148 | court fee 149 | 150 | 151 | 99.95 152 | 153 | 154 | 155 | 156 | 157 | 158 | 1 159 | case 160 | 161 | 162 | 3 163 | 164 | 165 | 7/1/2018 166 | 167 | 168 | 169 | 170 | 1 171 | fee 172 | 173 | 174 | court fee 175 | 176 | 177 | 10 178 | 179 | 180 | 181 | 182 | 1 183 | fee 184 | 185 | 186 | jail fee 187 | 188 | 189 | 5.99 190 | 191 | 192 | 193 | 194 | 195 |
196 | 1 197 | address 198 | 199 | 200 | Rochester 201 | 202 | 203 | NY 204 | 205 | 206 |
207 |
208 | 1 209 | address 210 | 211 | 212 | Pittsburgh 213 | 214 | 215 | PA 216 | 217 | 218 |
219 |
220 |
221 |
222 | """ ) 223 | 224 | assertEquals( expected, normalized ) 225 | 226 | } 227 | 228 | /** 229 | * Normalize the XML so we can compare 2 documents: parse [input] into a tree with [XmlMapper] and serialize that tree back to a string. 230 | */ 231 | private fun normalize( input:String ):String { 232 | val mapper = XmlMapper() 233 | val tree = mapper.readTree(input) 234 | val normalized = mapper.writeValueAsString(tree) 235 | return normalized 236 | } 237 | 238 | } 239 | -------------------------------------------------------------------------------- /src/test/resources/io/mfj/textricator/examples/probes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/measuresforjustice/textricator/9c9a5cdeba4ac093b399e95d302856dec68c106f/src/test/resources/io/mfj/textricator/examples/probes.pdf -------------------------------------------------------------------------------- /src/test/resources/io/mfj/textricator/examples/probes.yml: -------------------------------------------------------------------------------- 1 | # All measurements are in points. 1 point = 1/72 of an inch. 2 | # x-coordinates are from the left edge of the page. 3 | # y-coordinates are from the top edge of the page. 4 | 5 | # Use the built-in pdfbox extractor 6 | extractor: "pdf.pdfbox" 7 | 8 | # Ignore everything above 88pt from the top 9 | top: 88 10 | 11 | # Ignore everything below 170pt from the top 12 | bottom: 170 13 | 14 | # If multiple text segments are within 2pt vertically, consider them in the same row.
15 | maxRowDistance: 2 16 | 17 | # Define the columns, based on the x-coordinate where the column starts: 18 | cols: 19 | "name": 0 20 | "launched": 132 21 | "speed": 235 22 | "cospar": 249 23 | "power": 355 24 | "mass": 415 25 | 26 | types: 27 | "name": 28 | label: "Name" 29 | 30 | "launched": 31 | label: "Launch Date" 32 | 33 | "speed": 34 | label: "Speed (km/s)" 35 | type: "number" 36 | 37 | "cospar": 38 | label: "COSPAR ID" 39 | 40 | "power": 41 | label: "Power (watts)" 42 | type: "number" 43 | 44 | "mass": 45 | label: "Mass (pounds)" 46 | # Add .0 to the end of mass 47 | replacements: 48 | - 49 | pattern: "^(.*)$" 50 | replacement: "$1.0" 51 | 52 | # Omit if Power is less than 200 53 | filter: 'power >= 200' 54 | -------------------------------------------------------------------------------- /src/test/resources/io/mfj/textricator/examples/rap-sheet.csv: -------------------------------------------------------------------------------- 1 | page,requester,requestDate,requestTime,attn,header,caDojId,dob,sex,race,height,weight,eyeColor,hairColor,birthState,nameId,lastName,firstName,arrestDate,arrestNameId,arrestDob,arrestDate,arrestLocation,chargeNumber,chargeId,chargeCode,chargeName,chargeDesc,appearanceNameId,appearanceDate,appearanceCourt,dispoChargeNumber,dispoChargeId,dispoChargeCode,dispoChargeName,dispoChargeDesc,dispo,dispoConvStatus,dispoSentence 2 | 1,QHY.CA0190043.99000015.ORGANA-,20060508,06:32:53,ORGANA-061020343,* * * * * * * * * * * * * * * * * * * * * ** PALM PRINT ON FILE AT DOJ FOR ADDITIONAL INFORMATION PLEASE E- MAIL PALM.PRINT@DOJ.CA.GOV III CALIFORNIA ONLY SOURCE RECORD,A99000099,19660119,M,WHITE,502,317,GRN,PNK,CA,01,"SKYWALKER,LUKE JAY","SKYWALKER,LUKE JAY",,,,,,,,,,,,,,,,,,,,, 3 | 1,QHY.CA0190043.99000015.ORGANA-,20060508,06:32:53,ORGANA-061020343,* * * * * * * * * * * * * * * * * * * * * ** PALM PRINT ON FILE AT DOJ FOR ADDITIONAL INFORMATION PLEASE E- MAIL PALM.PRINT@DOJ.CA.GOV III CALIFORNIA ONLY SOURCE 
RECORD,A99000099,19660119,M,WHITE,502,317,GRN,PNK,CA,02,"SKYWALKER,LUKE","SKYWALKER,LUKE",,,,,,,,,,,,,,,,,,,,, 4 | 1,QHY.CA0190043.99000015.ORGANA-,20060508,06:32:53,ORGANA-061020343,* * * * * * * * * * * * * * * * * * * * * ** PALM PRINT ON FILE AT DOJ FOR ADDITIONAL INFORMATION PLEASE E- MAIL PALM.PRINT@DOJ.CA.GOV III CALIFORNIA ONLY SOURCE RECORD,A99000099,19660119,M,WHITE,502,317,GRN,PNK,CA,,,,19840725,02,19550505,19840725,CASO LOS ANGELES,01,1111111,11357,HS-POSSESS MARIJUANA,,,,,,,,,,,, 5 | 1,QHY.CA0190043.99000015.ORGANA-,20060508,06:32:53,ORGANA-061020343,* * * * * * * * * * * * * * * * * * * * * ** PALM PRINT ON FILE AT DOJ FOR ADDITIONAL INFORMATION PLEASE E- MAIL PALM.PRINT@DOJ.CA.GOV III CALIFORNIA ONLY SOURCE RECORD,A99000099,19660119,M,WHITE,502,317,GRN,PNK,CA,,,,19840725,02,19550505,19840725,CASO LOS ANGELES,02,,496,PC-RECEIVE/ETC KNOWN STOLEN PROPERTY,COM: WARRANT NBR A-400000 BOTH CNTS,,,,,,,,,,, 6 | 1,QHY.CA0190043.99000015.ORGANA-,20060508,06:32:53,ORGANA-061020343,* * * * * * * * * * * * * * * * * * * * * ** PALM PRINT ON FILE AT DOJ FOR ADDITIONAL INFORMATION PLEASE E- MAIL PALM.PRINT@DOJ.CA.GOV III CALIFORNIA ONLY SOURCE RECORD,A99000099,19660119,M,WHITE,502,317,GRN,PNK,CA,,,,19840725,02,19550505,19840725,CASO LOS ANGELES,03,,11358,HS-CULTIVATION MARIJUANA,,,,,,,,,,,, 7 | 1,QHY.CA0190043.99000015.ORGANA-,20060508,06:32:53,ORGANA-061020343,* * * * * * * * * * * * * * * * * * * * * ** PALM PRINT ON FILE AT DOJ FOR ADDITIONAL INFORMATION PLEASE E- MAIL PALM.PRINT@DOJ.CA.GOV III CALIFORNIA ONLY SOURCE RECORD,A99000099,19660119,M,WHITE,502,317,GRN,PNK,CA,,,,19840725,02,19550505,19840725,CASO LOS ANGELES,,,,,,01,19840918,CASC LOS ANGELES,01,1234567,11357,HS-POSSESS MARIJUANA,,CONVICTED,MISDEMEANOR, 8 | 1,QHY.CA0190043.99000015.ORGANA-,20060508,06:32:53,ORGANA-061020343,* * * * * * * * * * * * * * * * * * * * * ** PALM PRINT ON FILE AT DOJ FOR ADDITIONAL INFORMATION PLEASE E- MAIL PALM.PRINT@DOJ.CA.GOV III CALIFORNIA ONLY SOURCE 
RECORD,A99000099,19660119,M,WHITE,502,317,GRN,PNK,CA,,,,19840725,02,19550505,19840725,CASO LOS ANGELES,,,,,,01,19840918,CASC LOS ANGELES,02,,496,PC-RECEIVE/ETC KNOWN STOLEN PROPERTY,,CONVICTED,MISDEMEANOR," 3 YEARS PROBATION, 30 DAYS JAIL, FINE, RESTN" 9 | 1,QHY.CA0190043.99000015.ORGANA-,20060508,06:32:53,ORGANA-061020343,* * * * * * * * * * * * * * * * * * * * * ** PALM PRINT ON FILE AT DOJ FOR ADDITIONAL INFORMATION PLEASE E- MAIL PALM.PRINT@DOJ.CA.GOV III CALIFORNIA ONLY SOURCE RECORD,A99000099,19660119,M,WHITE,502,317,GRN,PNK,CA,,,,19840725,02,19550505,19840725,CASO LOS ANGELES,,,,,,01,19840918,CASC LOS ANGELES,03,,11358,HS-CULTIVATION MARIJUANA,,CONVICTED,MISDEMEANOR, 10 | 1,QHY.CA0190043.99000015.ORGANA-,20060508,06:32:53,ORGANA-061020343,* * * * * * * * * * * * * * * * * * * * * ** PALM PRINT ON FILE AT DOJ FOR ADDITIONAL INFORMATION PLEASE E- MAIL PALM.PRINT@DOJ.CA.GOV III CALIFORNIA ONLY SOURCE RECORD,A99000099,19660119,M,WHITE,502,317,GRN,PNK,CA,,,,20040319,02,19550505,20040319,CASO LOS ANGELES,01,2222222,11357,HS-POSSESS MARIJUANA,,,,,,,,,,,, 11 | 1,QHY.CA0190043.99000015.ORGANA-,20060508,06:32:53,ORGANA-061020343,* * * * * * * * * * * * * * * * * * * * * ** PALM PRINT ON FILE AT DOJ FOR ADDITIONAL INFORMATION PLEASE E- MAIL PALM.PRINT@DOJ.CA.GOV III CALIFORNIA ONLY SOURCE RECORD,A99000099,19660119,M,WHITE,502,317,GRN,PNK,CA,,,,20040319,02,19550505,20040319,CASO LOS ANGELES,,,,,,01,20040413,CASC LOS ANGELES,01,3456789,11358,HS-CULTIVATION MARIJUANA,,CONVICTED,FELONY," 2 YEARS PROBATION, 5 DAYS JAIL" 12 | -------------------------------------------------------------------------------- /src/test/resources/io/mfj/textricator/examples/rap-sheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/measuresforjustice/textricator/9c9a5cdeba4ac093b399e95d302856dec68c106f/src/test/resources/io/mfj/textricator/examples/rap-sheet.pdf 
-------------------------------------------------------------------------------- /src/test/resources/io/mfj/textricator/examples/school-employee-list.csv: -------------------------------------------------------------------------------- 1 | page,Employee ID,Name,Hire Date,Occupation,Important Info?,Boolean 1,Boolean 2,Boolean 3,Salary,Attending Child,Grade 2 | 1,ID-0001,Mam Bargera,01/21/20XX,Librarian,,No,No,Yes,"29,990",Maria,Fifth 3 | 1,ID-0001,Mam Bargera,01/21/20XX,Librarian,,No,No,Yes,"29,990",Mario,Fourth 4 | 1,ID-0002,Hony Tawk,03/13/20XX,Teacher,NECESSARY INFO,No,Yes,No,"38,749",, 5 | 1,ID-0003,Gune Rilfberg,11/15/20XX,Superintendent,,Yes,Yes,No,"94,839",Johnny,Second 6 | 1,ID-0004,Spric Earrow,04/21/20XX,Janitor,NECESSARY INFO,No,Yes,Yes,"12,239",, 7 | 1,ID-0005,Modney Rullen,08/05/20XX,Secretary,,Yes,No,No,"24,391",Johnny,Second 8 | 1,ID-0005,Modney Rullen,08/05/20XX,Secretary,,Yes,No,No,"24,391",Lucina Ellegancia,Second 9 | 1,ID-0005,Modney Rullen,08/05/20XX,Secretary,,Yes,No,No,"24,391",Horace,Third 10 | 1,ID-0006,Stellisa Eamer,06/30/20XX,Principal,,Yes,No,Yes,"94,139",, 11 | 1,ID-0007,Kric Eoston,12/21/20XX,Assistant,NECESSARY INFO,No,No,No,"11,192",Justin, 12 | 2,ID-0008,Dob Rydek,02/11/20XX,Janitor,,Yes,Yes,Yes,"20,002",, 13 | 2,ID-0009,Shan Ryeckler,03/19/20XX,Teacher,NECESSARY INFO,No,Yes,Yes,"42,910",, 14 | -------------------------------------------------------------------------------- /src/test/resources/io/mfj/textricator/examples/school-employee-list.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/measuresforjustice/textricator/9c9a5cdeba4ac093b399e95d302856dec68c106f/src/test/resources/io/mfj/textricator/examples/school-employee-list.pdf -------------------------------------------------------------------------------- /src/test/resources/io/mfj/textricator/examples/school-employee-list.yml: -------------------------------------------------------------------------------- 1 
| --- 2 | # Use the pdfbox parser, since it's the same one we used to originally extract the text to build this planning document. 3 | extractor: "pdf.pdfbox" 4 | 5 | # All measurements are in points. 1 point = 1/72 of an inch. 6 | # x-coordinates are from the left edge of the page. 7 | # y-coordinates are from the top edge of the page. 8 | header: 9 | # ignore anything less than this many points from the top, default and per-page 10 | default: 130 11 | footer: 12 | # ignore anything more than this many points from the top, default and per-page 13 | default: 700 14 | 15 | # Text segments are generally parsed in order, top to bottom, left to right. 16 | # If two text segments have y-coordinates within this many points, consider them on the same line, 17 | # and process the one further left first, even if it is 0.4pt lower on the page. 18 | maxRowDistance: 2 19 | 20 | # Define the output data record. 21 | # Since the main record type we're collecting information on is our employees, 22 | # we'll have that be the root type for our harvested information. 23 | rootRecordType: employee 24 | recordTypes: 25 | employee: 26 | label: "employee" # Labels are used when nested recordTypes come into play, like this document. 27 | valueTypes: 28 | # Not sure what to name a valueType? Just make something up! 29 | - employee 30 | - name 31 | - hiredate 32 | - occupation 33 | - showinfo 34 | - bool1 35 | - bool2 36 | - bool3 37 | - salary 38 | children: 39 | # In this example, there are multiple children nested under an employee, 40 | # so we'll treat it as a 'child' to the 'employee' recordType. 41 | - child 42 | child: 43 | label: "child" 44 | valueTypes: 45 | - child 46 | - grade 47 | 48 | valueTypes: 49 | employee: 50 | # In the CSV, use "Employee ID" as the column header instead of "employee". 
51 | label: "Employee ID" 52 | name: 53 | label: "Name" 54 | hiredate: 55 | label: "Hire Date" 56 | occupation: 57 | label: "Occupation" 58 | salary: 59 | label: "Salary" 60 | showinfo: 61 | label: "Important Info?" 62 | bool1: 63 | label: "Boolean 1" 64 | bool2: 65 | label: "Boolean 2" 66 | bool3: 67 | label: "Boolean 3" 68 | child: 69 | label: "Attending Child" 70 | grade: 71 | label: "Grade" 72 | 73 | # Now we define the finite-state machine 74 | # Let's name the state that our machine starts off with: 75 | initialState: "INIT" 76 | 77 | # When each text segment is encountered, each transition for the current state is checked. 78 | states: 79 | INIT: 80 | transitions: 81 | # The first bit of text we reach is 'ID-0001', so we'll try the only transition that should work here. 82 | - 83 | # If this condition matches (which it should) 84 | condition: employee # Curious about the condition? Scroll further down to the conditions section of this YAML. 85 | # Then we'll switch to the 'employee' state! 86 | nextState: employee 87 | 88 | employee: # ID number with the format 'ID-####' 89 | startRecord: true # When we enter this state, we'll create a new "employee" record. 90 | transitions: 91 | - # Now we move on to the name label. Once again, by verifying the condition and moving on after that. 92 | condition: namelabel 93 | nextState: namelabel 94 | 95 | namelabel: 96 | include: false # The label isn't important information in and of itself, so we can just not include it in the data. 97 | transitions: 98 | - 99 | condition: name 100 | nextState: name 101 | 102 | name: 103 | transitions: 104 | - 105 | # Sometimes a name will be in two segments, and we'll hit another 'name' text segment before anything else. 106 | # In that case, a state can transition to itself, compounding the information picked up in it. 107 | condition: name 108 | nextState: name 109 | - 110 | # Does the first condition not match the text? We move onto the next one. 
111 | condition: hiredateLabel 112 | nextState: hiredateLabel 113 | 114 | hiredateLabel: 115 | include: false 116 | transitions: 117 | - 118 | condition: hiredateLabel 119 | nextState: hiredateLabel 120 | - 121 | condition: hiredate 122 | nextState: hiredate 123 | 124 | hiredate: 125 | transitions: 126 | - 127 | condition: occupationLabel 128 | nextState: occupationLabel 129 | 130 | occupationLabel: 131 | include: false 132 | transitions: 133 | - 134 | condition: occupation 135 | nextState: occupation 136 | 137 | occupation: 138 | transitions: 139 | - 140 | condition: occupation 141 | nextState: occupation 142 | - 143 | # This state and the next are an example of how you can choose, using conditions, what to include or not. 144 | # They share the same area of a document, but have qualities to them that can be distinguishable. 145 | # Does it meet 'showinfo' conditions? Then we go to the 'showinfo' state that includes it. 146 | condition: showinfo 147 | nextState: showinfo 148 | - 149 | # Doesn't meet 'showinfo'? Then check for 'notinfo' and exclude it. 
150 | condition: notinfo 151 | nextState: notinfo 152 | showinfo: 153 | transitions: 154 | - 155 | condition: showinfo 156 | nextState: showinfo 157 | - 158 | condition: bool1 159 | nextState: bool1 160 | notinfo: 161 | include: false 162 | transitions: 163 | - 164 | condition: notinfo 165 | nextState: notinfo 166 | - 167 | condition: bool1 168 | nextState: bool1 169 | 170 | bool1: 171 | transitions: 172 | - 173 | condition: bool2 174 | nextState: bool2 175 | bool2: 176 | transitions: 177 | - 178 | condition: bool3 179 | nextState: bool3 180 | bool3: 181 | transitions: 182 | - 183 | condition: salaryLabel 184 | nextState: salaryLabel 185 | 186 | salaryLabel: 187 | include: false 188 | transitions: 189 | - 190 | condition: salary 191 | nextState: salary 192 | 193 | salary: 194 | transitions: 195 | - 196 | condition: childrenLabel 197 | nextState: childrenLabel 198 | - 199 | condition: employee 200 | nextState: employee 201 | - 202 | condition: end 203 | nextState: end 204 | 205 | childrenLabel: 206 | include: false 207 | transitions: 208 | - 209 | condition: childrenLabel 210 | nextState: childrenLabel 211 | - 212 | condition: childLabel 213 | nextState: childLabel 214 | 215 | childLabel: 216 | include: false 217 | transitions: 218 | - 219 | condition: child 220 | nextState: child 221 | 222 | child: 223 | # Here we reach a datatype nested within another datatype. We can start records using this child datatype. 224 | # In the process, we'll be making multiple rows for the parent datatype, each one holding onto its own child. 225 | startRecord: true 226 | transitions: 227 | - 228 | condition: child 229 | nextState: child 230 | - 231 | condition: gradeLabel 232 | nextState: gradeLabel 233 | - 234 | condition: childLabel 235 | nextState: childLabel 236 | 237 | gradeLabel: 238 | include: false 239 | transitions: 240 | - 241 | # Normally, there would always be an instance of a grade appearing right after the label. 
242 | # But in this document, we have one instance of that not happening under ID-0007's child. 243 | condition: grade 244 | nextState: grade 245 | - 246 | # So we just account for that possibility by adding a transition out of the label. 247 | condition: employee 248 | nextState: employee 249 | 250 | grade: 251 | transitions: 252 | - 253 | condition: employee 254 | nextState: employee 255 | - 256 | condition: childLabel 257 | nextState: childLabel 258 | - 259 | # Reach the end of the usable info in a document, but there's still text left to go? 260 | # An easy fix is to just create a looping, not-included state to finish the document off. 261 | condition: end 262 | nextState: end 263 | 264 | end: 265 | # We reached a point in the document where all the useful information is gone, but we still have text to go. 266 | include: false 267 | transitions: 268 | - 269 | # By using an always-true condition such as 'any', we can loop this state until the document has been completely gone through. 270 | condition: any 271 | nextState: end 272 | 273 | # Here we define the conditions: 274 | conditions: 275 | 276 | # An example of comparing text with regex. 277 | # In this case, we're making sure that the text contains the characters 'ID-' followed by any amount of numbers. 278 | employee: 'text =~ /ID-(\\d)*/' 279 | 280 | # You can match based on the x- and y- coordinates of the upper left and lower right corners of the rectangle 281 | # containing the text. ulx = Upper-Left X-coordinate. lry = Lower-Right Y-coordinate. Also uly and lrx. 282 | # You can define the lower and upper limit for each, inclusive. 283 | namelabel: '70 < ulx < 80 and font = "BCDFEE+Calibri-Bold"' 284 | 285 | # You can also match based on the type of font used, including if it was bolded or italicized. 
286 | name: '112 < ulx < 200 and font = "BCDEEE+Calibri"' 287 | 288 | hiredateLabel: '230 < ulx < 270 and font = "BCDFEE+Calibri-Bold"' 289 | 290 | hiredate: '280 < ulx < 290 and font = "BCDEEE+Calibri"' 291 | 292 | occupationLabel: '391 < ulx < 393 and font = "BCDFEE+Calibri-Bold"' 293 | 294 | occupation: '394 < ulx < 700 and font = "BCDEEE+Calibri"' 295 | 296 | showinfo: 'font = "BCDJEE+Georgia"' 297 | 298 | notinfo: 'font = "BCDEEE+Calibri"' 299 | 300 | bool1: 'font = "BCDIEE+Cambria"' 301 | 302 | bool2: 'font = "BCDIEE+Cambria"' 303 | 304 | bool3: 'font = "BCDIEE+Cambria"' 305 | 306 | salaryLabel: '391 < ulx < 393 and font = "BCDFEE+Calibri-Bold"' 307 | 308 | salary: '394 < ulx < 700 and font = "BCDEEE+Calibri"' 309 | 310 | childrenLabel: '70 < ulx < 140 and font = "BCDFEE+Calibri-Bold" and text =~ /(Attending)|(Children:)/' 311 | 312 | childLabel: '230 < ulx < 240 and font = "BCDFEE+Calibri-Bold"' 313 | 314 | child: '230 < ulx < 380 and font = "BCDEEE+Calibri"' 315 | 316 | gradeLabel: '391 < ulx < 393 and font = "BCDFEE+Calibri-Bold"' 317 | 318 | grade: '394 < ulx < 700 and font = "BCDEEE+Calibri"' 319 | 320 | # You can also match based on the size of the font and on specific text. 321 | end: 'fontSize = 16.0 and text = "TOTAL:"' 322 | 323 | # Need a condition that is always true? "1=1" does that for you. 
324 | any: "1 = 1" 325 | -------------------------------------------------------------------------------- /textricator-logo-text-paths.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/measuresforjustice/textricator/9c9a5cdeba4ac093b399e95d302856dec68c106f/textricator-logo-text-paths.png -------------------------------------------------------------------------------- /textricator-mascot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/measuresforjustice/textricator/9c9a5cdeba4ac093b399e95d302856dec68c106f/textricator-mascot.png --------------------------------------------------------------------------------