├── .gitignore ├── LICENSE.txt ├── README.md ├── build.sbt ├── check_for_author_tags.py ├── conf ├── base.conf ├── fast.conf ├── first_folio_init_font.conf ├── first_folio_lm_train.conf └── first_folio_main.conf ├── lib ├── PDFRenderer-0.9.1.jar ├── junit-4.12.jar └── murphy.jar ├── make-readme-options.py ├── make_jar.sh ├── make_run_script.sh ├── options_lists.txt ├── project └── plugins.sbt ├── publish_jar_with_dependencies.sh ├── replace ├── latin.txt ├── nahuatl.txt └── spanish.txt ├── sample_images ├── advertencias │ ├── pl_blac_047_00039-800.jpg │ ├── pl_blac_047_00039-800.txt │ ├── pl_blac_047_00040-800.jpg │ ├── pl_blac_047_00040-800.txt │ ├── pl_blac_047_00041-800.jpg │ └── pl_blac_047_00041-800.txt └── english │ ├── 184101040058.jpg │ ├── 184101040060.jpg │ └── 184101040062.jpg ├── sbt-launch-0.13.8.jar └── src ├── main └── java │ └── edu │ └── berkeley │ └── cs │ └── nlp │ └── ocular │ ├── data │ ├── Document.java │ ├── FirstFolioRawImageLoader.java │ ├── LazyRawImageDocument.java │ ├── LazyRawImageLoader.java │ ├── LazyRawPdfImageDocument.java │ ├── LazyRawSingleImageDocument.java │ ├── PdfImageReader.java │ ├── RawImageLoader.java │ ├── TextAndLineImagesLoader.java │ └── textreader │ │ ├── BasicTextReader.java │ │ ├── BlacklistCharacterSetTextReader.java │ │ ├── CharIndexer.java │ │ ├── Charset.java │ │ ├── ConvertLongSTextReader.java │ │ ├── FlipUVTextReader.java │ │ ├── RemoveAllDiacriticsTextReader.java │ │ ├── ReplaceSomeTextReader.java │ │ ├── TextReader.java │ │ └── WhitelistCharacterSetTextReader.java │ ├── eval │ ├── AlignedFormPair.java │ ├── BasicMultiDocumentTranscriber.java │ ├── BasicSingleDocumentEvaluatorAndOutputPrinter.java │ ├── ErrorSampler.java │ ├── EvalPrinter.java │ ├── Evaluator.java │ ├── Form.java │ ├── Glyph.java │ ├── LmPerplexity.java │ ├── MarkovEditDistanceComputer.java │ ├── ModelTranscriptions.java │ ├── MultiDocumentTranscriber.java │ ├── Operation.java │ └── SingleDocumentEvaluatorAndOutputPrinter.java │ ├── font │ 
└── Font.java │ ├── gsm │ ├── BasicGlyphSubstitutionModel.java │ ├── GlyphChar.java │ ├── GlyphSubstitutionModel.java │ └── NoSubGlyphSubstitutionModel.java │ ├── image │ ├── FontRenderer.java │ ├── ImageUtils.java │ └── Visualizer.java │ ├── lm │ ├── BasicCodeSwitchLanguageModel.java │ ├── CodeSwitchLanguageModel.java │ ├── CorpusCounter.java │ ├── CountDb.java │ ├── CountDbBig.java │ ├── CountDbSimple.java │ ├── CountType.java │ ├── InterpolatingSingleLanguageModel.java │ ├── LanguageModel.java │ ├── LongArrWrapper.java │ ├── LongNgram.java │ ├── Ngram.java │ ├── NgramCounts.java │ ├── NgramLanguageModel.java │ ├── NgramWrapper.java │ ├── SingleLanguageModel.java │ └── UniformLanguageModel.java │ ├── main │ ├── ExtractLinesOnly.java │ ├── FonttrainTranscribeShared.java │ ├── InitializeFont.java │ ├── InitializeGlyphSubstitutionModel.java │ ├── InitializeLanguageModel.java │ ├── LineExtractionOptions.java │ ├── NoDocumentsFoundException.java │ ├── NoDocumentsToProcessException.java │ ├── OcularRunnable.java │ ├── TrainFont.java │ ├── Transcribe.java │ └── gui │ │ ├── GridLayout2.java │ │ ├── InitializeFontGUI.java │ │ ├── TrainLanguageModelGUI.java │ │ └── TranscribeOrTrainFontGUI.java │ ├── model │ ├── CharacterTemplate.java │ ├── DecodeState.java │ ├── DecoderEM.java │ ├── TransitionStateType.java │ ├── em │ │ ├── BeamingSemiMarkovDP.java │ │ ├── CUDAInnerLoop.java │ │ ├── DefaultInnerLoop.java │ │ ├── DenseBigramTransitionModel.java │ │ ├── EmissionCacheInnerLoop.java │ │ ├── EmptyBeamException.java │ │ └── JOCLInnerLoop.java │ ├── emission │ │ ├── CachingEmissionModel.java │ │ ├── CachingEmissionModelExplicitOffset.java │ │ └── EmissionModel.java │ └── transition │ │ ├── CharacterNgramTransitionModel.java │ │ ├── CharacterNgramTransitionModelMarkovOffset.java │ │ ├── CodeSwitchTransitionModel.java │ │ └── SparseTransitionModel.java │ ├── output │ ├── AltoOutputWriter.java │ └── HtmlOutputWriter.java │ ├── preprocessing │ ├── Binarizer.java │ ├── Cropper.java │ 
├── LineExtractor.java │ ├── ManualCropper.java │ ├── ManualStackCropperPrep.java │ ├── Straightener.java │ ├── Test.java │ ├── VerticalModel.java │ └── VerticalProfile.java │ ├── train │ ├── FontTrainer.java │ ├── ModelPathMaker.java │ └── TrainingRestarter.java │ └── util │ ├── ArrayHelper.java │ ├── CollectionHelper.java │ ├── FileHelper.java │ ├── FileUtil.java │ ├── StringHelper.java │ ├── Tuple2.java │ └── Tuple3.java └── test ├── java └── edu │ └── berkeley │ └── cs │ └── nlp │ └── ocular │ ├── data │ └── textreader │ │ ├── BasicTextReaderTests.java │ │ ├── BlacklistCharacterSetTextReaderTests.java │ │ ├── CharIndexerTests.java │ │ ├── CharsetTests.java │ │ ├── ConvertLongSTextReaderTests.java │ │ ├── RemoveAllDiacriticsTextReaderTests.java │ │ ├── ReplaceSomeTextReaderTests.java │ │ └── WhitelistCharacterSetTextReaderTests.java │ ├── eval │ └── LmPerplexityTests.java │ ├── gsm │ └── BasicGlyphSubstitutionModelTests.java │ ├── lm │ └── LanguageTransitionPriorsTests.java │ ├── model │ ├── FontTrainEMTests.java │ └── PostViterbiTests.java │ └── util │ ├── ArrayHelperTests.java │ ├── CollectionHelperTests.java │ ├── FileUtilTests.java │ └── StringHelperTests.java └── resources ├── .gitignore ├── doc.jpg ├── doc.txt ├── doc_normalized.txt ├── extracted_lines └── doc-line_extract_jpg │ ├── line00.jpg │ ├── line01.jpg │ └── line02.jpg ├── initialize_font.sh ├── initialize_lm.sh └── train_font.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | /font 3 | /lm 4 | /gsm 5 | /data 6 | /extracted_lines 7 | /train_output 8 | /transcribe_output 9 | /cs_train_lineex/ 10 | /cs_train_output/ 11 | /cs_train_output.txt 12 | /cs_transcribe_lineex/ 13 | /cs_transcribe_output/ 14 | /cs_transcribe_output.txt 15 | /sample_images/ 16 | /texts 17 | /replace 18 | 19 | *.class 20 | *.log 21 | 22 | /ocular-*.jar 23 | /lib 24 | 25 | # sbt specific 26 | dist/* 27 | target/ 28 | lib_managed/ 29 | src_managed/ 30 | project/boot/ 
31 | project/plugins/project/ 32 | 33 | # Scala-IDE specific 34 | .scala_dependencies 35 | .project 36 | .classpath 37 | .cache 38 | .cache-main 39 | .cache-tests 40 | .settings/ 41 | .worksheet 42 | .pydevproject 43 | 44 | /bin/ 45 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import com.typesafe.sbt.SbtStartScript 2 | 3 | import com.github.retronym.SbtOneJar._ 4 | 5 | name := "ocular" 6 | 7 | organization := "edu.berkeley.cs.nlp" 8 | 9 | version := "0.3-SNAPSHOT" 10 | 11 | scalaVersion := "2.12.1" 12 | 13 | javacOptions ++= Seq("-source", "1.6", "-target", "1.6") 14 | 15 | Seq(SbtStartScript.startScriptForClassesSettings: _*) 16 | 17 | SbtStartScript.stage in Compile := Unit 18 | 19 | oneJarSettings 20 | 21 | mainClass in oneJar := None 22 | 23 | 24 | libraryDependencies ++= Seq( 25 | // "org.apache.commons" % "commons-lang3" % "3.4", //to escape HTML special characters 26 | "org.swinglabs" % "pdf-renderer" % "1.0.5", 27 | "junit" % "junit" % "4.12" % "test", 28 | "com.novocode" % "junit-interface" % "0.10" % "test") 29 | -------------------------------------------------------------------------------- /check_for_author_tags.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def has_author(f): 4 | for line in f: 5 | line = line.split() 6 | if '@author' in line: 7 | return True 8 | if 'class' in line or 'interface' in line: 9 | return False 10 | assert False, 'No class found...' 
11 | 12 | for (folder,dirs,files) in os.walk("."): 13 | for fn in files: 14 | fn = '%s/%s' % (folder, fn) 15 | if fn.endswith('.java'): 16 | with open(fn) as f: 17 | if not has_author(f): 18 | print fn 19 | 20 | -------------------------------------------------------------------------------- /conf/base.conf: -------------------------------------------------------------------------------- 1 | inputPath ./test 2 | outputPath ./output 3 | outputFontPath ./font/learned.fontser 4 | lmPath ./lm/nyt.lmser 5 | initFontPath ./font/init.fontser 6 | 7 | binarizeThreshold 0.12 8 | 9 | paddingMinWidth 1 10 | paddingMaxWidth 5 11 | 12 | markovVerticalOffset true 13 | beamSize 20 14 | learnFont true 15 | numEMIters 4 16 | 17 | emissionEngine DEFAULT 18 | cudaDeviceID 0 19 | numMstepThreads 8 20 | numEmissionCacheThreads 8 21 | numDecodeThreads 4 22 | decodeBatchSize 16 23 | -------------------------------------------------------------------------------- /conf/fast.conf: -------------------------------------------------------------------------------- 1 | inputPath ./test 2 | outputPath ./output 3 | outputFontPath ./font/learned.fontser 4 | lmPath ./lm/nyt.lmser 5 | initFontPath ./font/init.fontser 6 | 7 | binarizeThreshold 0.12 8 | 9 | paddingMinWidth 1 10 | paddingMaxWidth 5 11 | 12 | markovVerticalOffset false 13 | beamSize 10 14 | learnFont true 15 | numEMIters 4 16 | 17 | emissionEngine DEFAULT 18 | cudaDeviceID 0 19 | numMstepThreads 8 20 | numEmissionCacheThreads 8 21 | numDecodeThreads 4 22 | decodeBatchSize 16 23 | -------------------------------------------------------------------------------- /conf/first_folio_init_font.conf: -------------------------------------------------------------------------------- 1 | inputLmPath /Users/tberg/Desktop/ob-longs-uv-4gm-4pow.lmser 2 | outputFontPath /Users/tberg/Desktop/init.fontser 3 | spaceMinWidthFraction 0.0 -------------------------------------------------------------------------------- /conf/first_folio_lm_train.conf: 
-------------------------------------------------------------------------------- 1 | lmPath /Users/tberg/Desktop/ob-longs-uv-4gm-4pow.lmser 2 | textPath /Users/tberg/Desktop/big-lm.txt 3 | insertLongS true 4 | allowUVFlip true 5 | charN 4 6 | power 4.0 -------------------------------------------------------------------------------- /conf/first_folio_main.conf: -------------------------------------------------------------------------------- 1 | inputPath /Users/tberg/Desktop/F-tem/seg_extraction 2 | usePrebuiltLM true 3 | lmPath /Users/tberg/Desktop/ob-longs-uv-4gm-4pow.lmser 4 | lmTextPath /Users/tberg/git/first_folio_attr/data/txt/F-tem 5 | lmOrder 4 6 | lmPower 4.0 7 | initFontPath /Users/tberg/Desktop/init.fontser 8 | learnFont true 9 | outputPath /Users/tberg/Desktop/F-tem-output-ob-uv-4pow-nospace-git 10 | outputFontPath /Users/tberg/Desktop/F-tem-output-ob-uv-4pow-nospace-git/learned.fontser 11 | emissionEngine CUDA 12 | cudaDeviceID 1 13 | -------------------------------------------------------------------------------- /lib/PDFRenderer-0.9.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/lib/PDFRenderer-0.9.1.jar -------------------------------------------------------------------------------- /lib/junit-4.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/lib/junit-4.12.jar -------------------------------------------------------------------------------- /lib/murphy.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/lib/murphy.jar -------------------------------------------------------------------------------- /make_jar.sh: 
-------------------------------------------------------------------------------- 1 | cp lib/JCuda-All-0.6.0-bin-linux-x86_64/* lib/ 2 | cp lib/JCuda-All-0.6.0-bin-apple-x86_64/* lib/ 3 | 4 | 5 | java -Dfile.encoding=UTF8 -Xmx1536M -Xss1M -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=256m -jar sbt-launch-*.jar "one-jar" 6 | JARPATH=`expr target/scala-*/ocular_*-*-one-jar.jar` 7 | FILENAME=$(basename $JARPATH) 8 | VERSION=${FILENAME:12} 9 | VERSION=${VERSION::${#VERSION}-12} 10 | JARNAME="ocular-${VERSION}-with_dependencies.jar" 11 | TEMPDIR=${FILENAME::${#FILENAME}-4} 12 | mkdir $TEMPDIR 13 | mv $JARPATH $TEMPDIR 14 | cd $TEMPDIR 15 | jar -xf $FILENAME 16 | rm $FILENAME 17 | cp ../lib/*.jar lib/ 18 | cp ../lib/JCuda-*/* lib/ 19 | jar cmf META-INF/MANIFEST.MF ../$JARNAME * 20 | cd .. 21 | rm -r $TEMPDIR 22 | 23 | -------------------------------------------------------------------------------- /make_run_script.sh: -------------------------------------------------------------------------------- 1 | cp lib/JCuda-All-0.6.0-bin-linux-x86_64/* lib/ 2 | cp lib/JCuda-All-0.6.0-bin-apple-x86_64/* lib/ 3 | 4 | 5 | java -Dfile.encoding=UTF8 -Xmx1536M -Xss1M -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=256m -jar sbt-launch-*.jar "start-script" 6 | -------------------------------------------------------------------------------- /options_lists.txt: -------------------------------------------------------------------------------- 1 | ### InitializeLanguageModel 2 | ##### Required 3 | inputTextPath 4 | outputLmPath 5 | ##### Additional Options 6 | minCharCount 7 | insertLongS 8 | charNgramLength 9 | alternateSpellingReplacementPaths 10 | ##### Rarely Used Options 11 | removeDiacritics 12 | pKeepSameLanguage 13 | languagePriors 14 | lmPower 15 | explicitCharacterSet 16 | lmCharCount 17 | 18 | 19 | 20 | ### InitializeFont 21 | ##### Required 22 | inputLmPath 23 | outputFontPath 24 | ##### Additional Options 25 | allowedFontsPath 26 | ##### Rarely Used Options 27 | numFontInitThreads 
28 | spaceMaxWidthFraction 29 | spaceMinWidthFraction 30 | templateMaxWidthFraction 31 | templateMinWidthFraction 32 | 33 | 34 | 35 | ### TrainFont 36 | ##### Main Options 37 | inputDocPath 38 | inputDocListPath 39 | inputFontPath 40 | inputLmPath 41 | numDocs 42 | numDocsToSkip 43 | numEMIters 44 | continueFromLastCompleteIteration 45 | outputPath 46 | outputFormats 47 | outputFontPath 48 | ##### Additional Options 49 | extractedLinesPath 50 | updateDocBatchSize 51 | These options affect the speed of font training 52 | emissionEngine 53 | beamSize 54 | markovVerticalOffset 55 | ##### Glyph Substitution Model Options 56 | Glyph substitution is the feature that allows Ocular to use a probabilistic mapping from modern orthography (as used in the language model training text) to the orthography seen in the documents. If the glyph substitution feature is used, Ocular will jointly produce dual transcriptions: one that is an exact transcription of the document, and one that is a normalized version of the text. 
57 | allowGlyphSubstitution 58 | inputGsmPath 59 | updateGsm 60 | outputGsmPath 61 | ##### Language Model Training Options 62 | updateLM 63 | outputLmPath 64 | ##### Line Extraction Options 65 | binarizeThreshold 66 | crop 67 | ##### Evaluate During Training 68 | evalInputDocPath 69 | evalNumDocs 70 | evalExtractedLinesPath 71 | evalFreq 72 | evalBatches 73 | ##### Rarely Used Options 74 | allowLanguageSwitchOnPunct 75 | cudaDeviceID 76 | decodeBatchSize 77 | gsmElideAnything 78 | gsmElisionSmoothingCountMultiplier 79 | gsmNoCharSubPrior 80 | gsmPower 81 | gsmSmoothingCount 82 | paddingMaxWidth 83 | paddingMinWidth 84 | uniformLineHeight 85 | numDecodeThreads 86 | numEmissionCacheThreads 87 | numMstepThreads 88 | 89 | 90 | 91 | 92 | 93 | ### Transcribe 94 | ##### Main Options 95 | inputDocPath 96 | inputDocListPath 97 | inputFontPath 98 | inputLmPath 99 | numDocs 100 | numDocsToSkip 101 | skipAlreadyTranscribedDocs 102 | outputPath 103 | outputFormats 104 | ##### Additional Options 105 | extractedLinesPath 106 | failIfAllDocsAlreadyTranscribed 107 | These options affect the speed of transcription 108 | emissionEngine 109 | beamSize 110 | markovVerticalOffset 111 | ##### Glyph Substitution Model Options 112 | Glyph substitution is the feature that allows Ocular to use a probabilistic mapping from modern orthography (as used in the language model training text) to the orthography seen in the documents. If the glyph substitution feature is used, Ocular will jointly produce dual transcriptions: one that is an exact transcription of the document, and one that is a normalized version of the text. 
113 | allowGlyphSubstitution 114 | inputGsmPath 115 | ##### Model Updating Options 116 | updateDocBatchSize 117 | For updating the font model 118 | updateFont 119 | outputFontPath 120 | For updating the glyph substitution model 121 | updateGsm 122 | outputGsmPath 123 | For updating the language model 124 | updateLM 125 | outputLmPath 126 | ##### Line Extraction Options 127 | binarizeThreshold 128 | crop 129 | ##### Evaluate During Training 130 | evalInputDocPath 131 | evalNumDocs 132 | evalBatches 133 | evalExtractedLinesPath 134 | ##### Rarely Used Options 135 | allowLanguageSwitchOnPunct 136 | cudaDeviceID 137 | decodeBatchSize 138 | gsmElideAnything 139 | gsmElisionSmoothingCountMultiplier 140 | gsmNoCharSubPrior 141 | gsmPower 142 | gsmSmoothingCount 143 | paddingMaxWidth 144 | paddingMinWidth 145 | uniformLineHeight 146 | numDecodeThreads 147 | numEmissionCacheThreads 148 | numMstepThreads 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.typesafe.sbt" % "sbt-start-script" % "0.10.0") 2 | 3 | addSbtPlugin("org.scala-sbt.plugins" % "sbt-onejar" % "0.8") 4 | 5 | -------------------------------------------------------------------------------- /publish_jar_with_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | ./make_jar.sh 6 | scp ocular-0.3-SNAPSHOT-with_dependencies.jar k:public_html/maven-repository/snapshots/edu/berkeley/cs/nlp/ocular/0.3-SNAPSHOT/ 7 | 8 | -------------------------------------------------------------------------------- /replace/latin.txt: -------------------------------------------------------------------------------- 1 | an \~a 4 2 | en \~e 4 3 | in \~i 4 4 | on \~o 4 5 | un \~u 4 6 | u v 4 7 | v u 5 8 | ae æ 2 9 | 
-------------------------------------------------------------------------------- /replace/nahuatl.txt: -------------------------------------------------------------------------------- 1 | \'q \~q 1 2 | u v 4 3 | v u 5 4 | -------------------------------------------------------------------------------- /replace/spanish.txt: -------------------------------------------------------------------------------- 1 | \`a a 1 2 | \`e e 1 3 | \`i i 1 4 | \`o o 1 5 | \`u u 1 6 | que \~q 4 7 | per \~p 4 8 | ci zi 4 9 | ce ze 4 10 | x j 4 11 | j x 5 12 | an \~a 4 13 | en \~e 4 14 | in \~i 4 15 | on \~o 4 16 | un \~u 4 17 | h 5 18 | be ve 5 19 | u v 4 20 | v u 5 21 | \'a a 4 22 | \'e e 4 23 | \'i i 4 24 | \'o o 4 25 | \'u u 4 26 | ae æ 2 27 | oracion o\~ron 5 28 | -------------------------------------------------------------------------------- /sample_images/advertencias/pl_blac_047_00039-800.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/sample_images/advertencias/pl_blac_047_00039-800.jpg -------------------------------------------------------------------------------- /sample_images/advertencias/pl_blac_047_00039-800.txt: -------------------------------------------------------------------------------- 1 | los confeſſores 9 2 | 3 | quod quando quis fcit in confeſsione pecca- 4 | tum alicuius, cum quo pænitens dicit ſe pec 5 | caſſe, quando ille venerit ad confitendum po 6 | teſt eum de tali peccato interrogare, & inge 7 | nere, & inſpecie: ſi tale eſt peccatum de quo 8 | ſolent confeſſores interrogare pęnitentes. Et 9 | hoc dum modo talis non poſſet habere ſuſpì 10 | tionem, quod is, cum quo peccanit, fuerit de 11 | illo peccato confeſſus. 12 | ¶Algunas viejas y viejos ſe vienem a con- 6 13 | feſſar y a reconciliar que apenas puede el c\~o 14 | feſſor juzgar, ſi es peccado venial lo que di- 15 | zen. 
Sino los abſuelue van deſconſoladas, 16 | y ſi las abſuelue, queda con ſcrupulo de auer 17 | abſuelto ſin materia ſufficiente. Para eſto ab 18 | ſueluale deſta manera. Si vere peccato habes 19 | & confeſſus es. Ego te abſoluo, ſi n\~o habes 20 | non. Aſsi lo enſe\~na el Maeſtro fray Bartho- 21 | lome de Medina en ſu ſumma cap. 12. Vega 22 | lib. I. caſo. 353. 23 | ¶ En algunas partes ſe hazen ya tan perezo- 7 24 | ſos los naturales para venir a confeſſarſe la- 25 | Quareſma, que ſino ſe tiene gran cuenta en 26 | preuenirlos deſde el Domingo antes de la ſe 27 | mana que vengan por ſus varrios a confeſſar 28 | ſe, no vienen. Y ſi los miniſtros a premian a 29 | C 1 los 30 | 31 | -------------------------------------------------------------------------------- /sample_images/advertencias/pl_blac_047_00040-800.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/sample_images/advertencias/pl_blac_047_00040-800.jpg -------------------------------------------------------------------------------- /sample_images/advertencias/pl_blac_047_00040-800.txt: -------------------------------------------------------------------------------- 1 | 2 | Advertencias para 3 | a los mandones a que los traygan, ellos he- 4 | chan mano, de los primeros que topan por la 5 | calle, o tianguez, y ſi les mandan aguardar 6 | para que pienſen bien ſus peccados, y ſi quie 7 | ra hagan algunos actos de contriction, ſucce- 8 | de que quando acuerda el confeſſor ya ſe han 9 | ydo y nunca mas bueluen. 
Pues confeſſarlos 10 | ſin preceder deuida penitencia y dìlig\~ete exa 11 | men de ſu conſciencia, ya ſe vee lo que diz\~e 12 | los Doctores que la confeſsion del que no hi 13 | zo la deuida diligencia para examinar bien 14 | ſu conſciencia, por lo qual ſe le oluido alg\~u 15 | peccado, o peccados mortales, es invalido y 16 | neceſſario repet\~eda ſino ſino es ìn articulo mor 17 | tis, que tunc excuſatur pœ ſi conſitea- 18 | tur fine pręuia examìnatìone. Para eſto diſ- 19 | pongale el confeſſor lo mejor que pudiere y 20 | ſupiere, por que ſuppletur deffectus examinis 21 | per interrogationem prudentis confeſſarij: 22 | præſertim vrg\~ete cauſa, & qu\~ado ruſtici eti\~a 23 | moniti neſciunt præmeditari peccata, como 24 | dize Nauarro y otros que alega Henrico H\~e 25 | riquez tom. I. lib. 2. de pęnit\~ecia cap. 5. §. I. 26 | ¶ Muchas vezes va vn ſacerdote por vn ca- 8 27 | mino, y llamanle a confeſſar a vn indio que 28 | eſta malo, y no ſabe el ſacerdote mucha len- 29 | gua 30 | 31 | -------------------------------------------------------------------------------- /sample_images/advertencias/pl_blac_047_00041-800.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/sample_images/advertencias/pl_blac_047_00041-800.jpg -------------------------------------------------------------------------------- /sample_images/advertencias/pl_blac_047_00041-800.txt: -------------------------------------------------------------------------------- 1 | los confeſſores 10 2 | 3 | gua, que hara Confieſſele no hauiendo muy 4 | cerca otro ſacerdote que lo pueda confeſſar, 5 | vieno ſe\~nales de contricion en el : que por 6 | pocos peccados que le entienda, aunque de- 7 | xe de entender otros muchos, baſta para po- 8 | derle abſoluer, y aquella alma queda reme- 9 | diada. 
Por que ſi eſtaua attrita, con el ſacra- 10 | mento ſe haze contrita, y por el conſiguiente 11 | digna de vida eterna. Y concederle ha la in- 12 | dulgencia de la Bulla [teniendola] para \~q no 13 | ſe detenga en el purgatorio. Y eſto aunque 14 | no eſte muy enfermo, pues la experi\~ecia nos 15 | enſe\~na quanto los ſuele apreſurar la enferme 16 | dad y lleuarſe los en no nada. Pues fray Lu 17 | ys Lopez y fray Manuel Rodriguez, tom. I. 18 | cap. 61. conl. 3. num. 3 dizen que el confeſ- 19 | ſor Caſtellano que no ſabe la lenga Fr\~ace- 20 | la ſi no alguna coſa della, puede confeſſar al 21 | Frances \~q en ſu lengua ſe confieſſa con el, a\~u 22 | \~q ſea fuera del articulo de la muerte. Y no ſe 23 | qual es el ſacerdote \~q puede c\~ofeſſar en ſemejan 24 | te neceſsidad, particularmente los religioſos 25 | a quien ſiruen tambi\~e eſtos pobres naturales. 26 | ¶ Com\~umente diz\~e los doctores \~q nadie eſ 9 27 | ta obligado a c\~ofeſſarſe por interprete eſt\~ado 28 | C 2 con 29 | 30 | -------------------------------------------------------------------------------- /sample_images/english/184101040058.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/sample_images/english/184101040058.jpg -------------------------------------------------------------------------------- /sample_images/english/184101040060.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/sample_images/english/184101040060.jpg -------------------------------------------------------------------------------- /sample_images/english/184101040062.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/sample_images/english/184101040062.jpg 
-------------------------------------------------------------------------------- /sbt-launch-0.13.8.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/sbt-launch-0.13.8.jar -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/data/Document.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.data; 2 | 3 | import java.util.List; 4 | 5 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils.PixelType; 6 | 7 | /** 8 | * @author Dan Garrette (dhgarrette@gmail.com) 9 | */ 10 | public interface Document { 11 | public String baseName(); 12 | public PixelType[][][] loadLineImages(); 13 | public String[][] loadDiplomaticTextLines(); 14 | public String[][] loadNormalizedTextLines(); 15 | public List loadNormalizedText(); 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/data/FirstFolioRawImageLoader.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.data; 2 | 3 | import java.io.File; 4 | import java.io.FilenameFilter; 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils; 9 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils.ConnectedComponentProcessor; 10 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils.PixelType; 11 | import edu.berkeley.cs.nlp.ocular.preprocessing.Binarizer; 12 | import edu.berkeley.cs.nlp.ocular.preprocessing.LineExtractor; 13 | import tberg.murphy.arrays.a; 14 | import tberg.murphy.fileio.f; 15 | import tberg.murphy.threading.BetterThreader; 16 | 17 | public class FirstFolioRawImageLoader { 18 | 19 | public static class FirstFolioRawImageDocument implements Document { 20 
| private final String baseName; 21 | final PixelType[][][] observations; 22 | 23 | public FirstFolioRawImageDocument(String inputPath, String baseName, int lineHeight, double binarizeThreshold) { 24 | this.baseName = baseName; 25 | double[][] levels = ImageUtils.getLevels(f.readImage(inputPath+"/"+baseName)); 26 | ConnectedComponentProcessor ccprocBig = new ConnectedComponentProcessor() { 27 | public void process(double[][] levels, List connectedComponent) { 28 | if (connectedComponent.size() > 1000) { 29 | for (int[] pixel : connectedComponent) { 30 | levels[pixel[0]][pixel[1]] = 255.0; 31 | } 32 | } 33 | } 34 | }; 35 | ImageUtils.processConnectedComponents(levels, 50.0, ccprocBig); 36 | Binarizer.binarizeGlobal(binarizeThreshold, levels); 37 | ConnectedComponentProcessor ccprocSmall = new ConnectedComponentProcessor() { 38 | public void process(double[][] levels, List connectedComponent) { 39 | if (connectedComponent.size() < 20 || connectedComponent.size() > 1000) { 40 | for (int[] pixel : connectedComponent) { 41 | levels[pixel[0]][pixel[1]] = 255.0; 42 | } 43 | } 44 | } 45 | }; 46 | ImageUtils.processConnectedComponents(levels, 127.0, ccprocSmall); 47 | 48 | int padHeight = 0; 49 | double[][] topPadLevels = new double[levels.length][]; 50 | for (int i=0; i lines = LineExtractor.extractLines(topPadLevels); 53 | observations = new PixelType[lines.size()][][]; 54 | for (int i=0; i= 0) { 56 | observations[i] = ImageUtils.getPixelTypes(ImageUtils.resampleImage(ImageUtils.makeImage(lines.get(i)), lineHeight)); 57 | } else { 58 | observations[i] = ImageUtils.getPixelTypes(ImageUtils.makeImage(lines.get(i))); 59 | } 60 | } 61 | } 62 | 63 | public PixelType[][][] loadLineImages() { 64 | return observations; 65 | } 66 | 67 | public String[][] loadDiplomaticTextLines() { 68 | return null; 69 | } 70 | 71 | public String[][] loadNormalizedTextLines() { 72 | return null; 73 | } 74 | 75 | public List loadNormalizedText() { 76 | return null; 77 | } 78 | 79 | public String 
package edu.berkeley.cs.nlp.ocular.data;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import edu.berkeley.cs.nlp.ocular.model.CharacterTemplate;
import edu.berkeley.cs.nlp.ocular.util.FileUtil;

/**
 * Entry point for lazily loading image documents (single image files or
 * individual pdf pages) from one or more input directories.  Documents are
 * discovered and ordered up front, but their pixel data is only read when
 * first requested.
 *
 * NOTE(review): reconstructed from a mangled extraction; generic type
 * parameters and the import list were inferred from usage -- verify against
 * the upstream repository.
 */
public class LazyRawImageLoader {

	/** Load with default settings: uniform line height, 0.12 binarization threshold, no cropping. */
	public static List<Document> loadDocuments(String inputPath, String extractedLinesPath, int numDocs, int numDocsToSkip) {
		return loadDocuments(inputPath, extractedLinesPath, numDocs, numDocsToSkip, true, 0.12, false);
	}

	/** Single-directory convenience overload; delegates to the multi-directory version. */
	public static List<Document> loadDocuments(String inputPath, String extractedLinesPath, int numDocs, int numDocsToSkip, boolean uniformLineHeight, double binarizeThreshold, boolean crop) {
		return loadDocuments(Arrays.asList(inputPath), extractedLinesPath, numDocs, numDocsToSkip, uniformLineHeight, binarizeThreshold, crop);
	}

	/** Load with default settings from several directories. */
	public static List<Document> loadDocuments(List<String> inputPaths, String extractedLinesPath, int numDocs, int numDocsToSkip) {
		return loadDocuments(inputPaths, extractedLinesPath, numDocs, numDocsToSkip, true, 0.12, false);
	}

	/**
	 * Discover all documents under the given paths, then keep up to `numDocs`
	 * of them after skipping the first `numDocsToSkip` (numDocs <= 0 means
	 * "no limit").
	 */
	public static List<Document> loadDocuments(List<String> inputPaths, String extractedLinesPath, int numDocs, int numDocsToSkip, boolean uniformLineHeight, double binarizeThreshold, boolean crop) {
		List<Document> discovered = new ArrayList<Document>();
		for (String inputPath : inputPaths) {
			discovered.addAll(loadDocumentsFromDir(inputPath, extractedLinesPath, uniformLineHeight, binarizeThreshold, crop));
		}

		// Clamp both counts so a short document list never causes an out-of-bounds access.
		int actualNumDocsToSkip = Math.min(discovered.size(), numDocsToSkip);
		int actualNumDocsToUse = Math.min(discovered.size() - actualNumDocsToSkip, numDocs <= 0 ? Integer.MAX_VALUE : numDocs);
		System.out.println("Using "+actualNumDocsToUse+" documents (skipping "+actualNumDocsToSkip+")");
		for (int docNum = 0; docNum < actualNumDocsToSkip; ++docNum) {
			System.out.println(" Skipping the first "+numDocsToSkip+" documents: " + discovered.get(docNum).baseName());
		}

		List<Document> documents = new ArrayList<Document>();
		for (int docNum = actualNumDocsToSkip; docNum < actualNumDocsToSkip + actualNumDocsToUse; ++docNum) {
			Document lazyDoc = discovered.get(docNum);
			System.out.println(" Using " + lazyDoc.baseName());
			documents.add(lazyDoc);
		}
		return documents;
	}

	/**
	 * Recursively collect image and pdf files under `inputPath` and wrap each
	 * file (or each pdf page) in a lazy Document.  Results are sorted by
	 * base name so document order is deterministic.
	 */
	private static List<Document> loadDocumentsFromDir(String inputPath, String extractedLinesPath, boolean uniformLineHeight, double binarizeThreshold, boolean crop) {
		// A negative height tells the document to keep each line's natural height.
		int lineHeight = uniformLineHeight ? CharacterTemplate.LINE_HEIGHT : -1;

		File dir = new File(inputPath);
		System.out.println("Reading data from [" + dir + "], which " + (dir.exists() ? "exists" : "does not exist"));

		List<Document> lazyDocs = new ArrayList<Document>();
		for (File f : FileUtil.recursiveFiles(dir)) {
			String name = f.getName();
			if (name.endsWith(".txt")) {
				continue; // transcription files are not page images
			}
			if (name.endsWith(".pdf")) {
				// One lazy document per pdf page (pages are 1-indexed).
				int numPages = PdfImageReader.numPagesInPdf(f);
				for (int pageNumber = 1; pageNumber <= numPages; ++pageNumber) {
					lazyDocs.add(new LazyRawPdfImageDocument(f, pageNumber, inputPath, lineHeight, binarizeThreshold, crop, extractedLinesPath));
				}
			}
			else {
				lazyDocs.add(new LazyRawSingleImageDocument(f, inputPath, lineHeight, binarizeThreshold, crop, extractedLinesPath));
			}
		}

		Collections.sort(lazyDocs, new Comparator<Document>() {
			public int compare(Document o1, Document o2) {
				return o1.baseName().compareTo(o2.baseName());
			}
		});

		return lazyDocs;
	}
}
package edu.berkeley.cs.nlp.ocular.data;

import java.awt.image.BufferedImage;
import java.io.File;

import edu.berkeley.cs.nlp.ocular.util.FileUtil;

/**
 * A lazily-loaded document backed by a single page of a pdf file.  The page
 * image is rendered only when first requested (the superclass caches the
 * result for later use).
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class LazyRawPdfImageDocument extends LazyRawImageDocument {
	private final File pdfFile;
	private final int pageNumber; // starts at one!

	public LazyRawPdfImageDocument(File pdfFile, int pageNumber, String inputPath, int lineHeight, double binarizeThreshold, boolean crop, String extractedLinesPath) {
		super(inputPath, lineHeight, binarizeThreshold, crop, extractedLinesPath);
		this.pdfFile = pdfFile;
		this.pageNumber = pageNumber;
	}

	/** Render this document's pdf page to an image (invoked once, lazily). */
	protected BufferedImage doLoadBufferedImage() {
		System.out.println("Extracting text line images from " + pdfFile + ", page " + pageNumber);
		return PdfImageReader.readPdfPageAsImage(pdfFile, pageNumber);
	}

	protected File file() { return pdfFile; }

	/** Filename stem used when writing extracted line images. */
	protected String preext() { return new File(baseName()).getName(); }

	/** Extracted line images are always written as png. */
	protected String ext() { return "png"; }

	/** Unique name: the pdf path (sans extension) plus a zero-padded page suffix. */
	public String baseName() {
		return FileUtil.withoutExtension(pdfFile.getPath()) + "_pdf_page" + String.format("%05d", pageNumber);
	}
}
package edu.berkeley.cs.nlp.ocular.data;

import java.awt.image.BufferedImage;
import java.io.File;

import edu.berkeley.cs.nlp.ocular.util.FileUtil;
import tberg.murphy.fileio.f;

/**
 * A lazily-loaded document backed by a single image file.  The image is read
 * from disk only when first requested (the superclass caches the result for
 * later use).
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class LazyRawSingleImageDocument extends LazyRawImageDocument {
	private final File file;

	public LazyRawSingleImageDocument(File file, String inputPath, int lineHeight, double binarizeThreshold, boolean crop, String extractedLinesPath) {
		super(inputPath, lineHeight, binarizeThreshold, crop, extractedLinesPath);
		this.file = file;
	}

	/** Read the backing image file (invoked once, lazily). */
	protected BufferedImage doLoadBufferedImage() {
		System.out.println("Extracting text line images from " + file);
		return f.readImage(file.getPath());
	}

	protected File file() { return file; }

	/** Filename stem used when writing extracted line images. */
	protected String preext() { return FileUtil.withoutExtension(file.getName()); }

	/** Extracted lines keep the source image's own extension. */
	protected String ext() { return FileUtil.extension(file.getName()); }

	/** The image's path identifies the document. */
	public String baseName() {
		return file.getPath();
	}
}
package edu.berkeley.cs.nlp.ocular.data;

import java.awt.Graphics2D;
import java.awt.Image;
import java.awt.Rectangle;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;

import com.sun.pdfview.PDFFile;
import com.sun.pdfview.PDFPage;

/**
 * Renders pdf pages to BufferedImages via the PDFRenderer library.
 *
 * Fixes over the previous version: the RandomAccessFile is now closed in a
 * finally block (it used to leak the file handle whenever pdf parsing or
 * rendering threw), the thrice-duplicated open/map/parse sequence is factored
 * into a single helper, and the Graphics2D used for rendering is disposed so
 * its native resources are released.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class PdfImageReader {

	/** Memory-map the whole file and parse it as a pdf. */
	private static PDFFile openPdf(RandomAccessFile raf) throws IOException {
		FileChannel channel = raf.getChannel();
		ByteBuffer buf = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size());
		return new PDFFile(buf);
	}

	/**
	 * @param pdfFile Path to the pdf file.
	 * @return the number of pages in the pdf
	 * @throws RuntimeException wrapping any IOException
	 */
	public static int numPagesInPdf(File pdfFile) {
		try {
			RandomAccessFile raf = new RandomAccessFile(pdfFile, "r");
			try {
				return openPdf(raf).getNumPages();
			}
			finally {
				raf.close(); // fix: close even when pdf parsing throws
			}
		}
		catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Render every page of the pdf, in page order.
	 *
	 * @param pdfFile Path to the pdf file.
	 * @return one rendered image per page
	 */
	public static List<BufferedImage> readPdfAsImages(File pdfFile) {
		try {
			RandomAccessFile raf = new RandomAccessFile(pdfFile, "r");
			try {
				PDFFile pdf = openPdf(raf);
				List<BufferedImage> images = new ArrayList<BufferedImage>();
				for (int pageNumber = 1; pageNumber <= pdf.getNumPages(); ++pageNumber) {
					images.add(readPage(pdf, pageNumber));
				}
				return images;
			}
			finally {
				raf.close(); // fix: close even when rendering throws
			}
		}
		catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * @param pdfFile
	 *          Path to the pdf file.
	 * @param pageNumber
	 *          One-based page number to read
	 * @return the rendered page image
	 * @throws RuntimeException if pageNumber is less than 1, or wrapping any IOException
	 */
	public static BufferedImage readPdfPageAsImage(File pdfFile, int pageNumber) {
		if (pageNumber < 1)
			throw new RuntimeException("page numbering starts with 1; '" + pageNumber + "' given");
		try {
			RandomAccessFile raf = new RandomAccessFile(pdfFile, "r");
			try {
				return readPage(openPdf(raf), pageNumber);
			}
			finally {
				raf.close(); // fix: close even when rendering throws
			}
		}
		catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

	/** Render one page into an RGB buffered image, scaled up 2.5x. */
	private static BufferedImage readPage(PDFFile pdf, int pageNumber) {
		double scale = 2.5; // because otherwise the image comes out really tiny
		PDFPage page = pdf.getPage(pageNumber);
		Rectangle rect = new Rectangle(0, 0, (int) page.getBBox().getWidth(), (int) page.getBBox().getHeight());
		BufferedImage bufferedImage = new BufferedImage((int)(rect.width * scale), (int)(rect.height * scale), BufferedImage.TYPE_INT_RGB);
		Image image = page.getImage((int)(rect.width * scale), (int)(rect.height * scale), rect, null, true, true);
		Graphics2D bufImageGraphics = bufferedImage.createGraphics();
		try {
			bufImageGraphics.drawImage(image, 0, 0, null);
		}
		finally {
			bufImageGraphics.dispose(); // fix: release native graphics resources
		}
		return bufferedImage;
	}
}
(int)(rect.height * scale), BufferedImage.TYPE_INT_RGB); 88 | Image image = page.getImage((int)(rect.width * scale), (int)(rect.height * scale), rect, null, true, true); 89 | Graphics2D bufImageGraphics = bufferedImage.createGraphics(); 90 | bufImageGraphics.drawImage(image, 0, 0, null); 91 | return bufferedImage; 92 | } 93 | } 94 | 95 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/data/RawImageLoader.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.data; 2 | 3 | import tberg.murphy.fileio.f; 4 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils; 5 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils.PixelType; 6 | 7 | import java.io.File; 8 | import java.io.FilenameFilter; 9 | import java.util.Arrays; 10 | import java.util.List; 11 | 12 | import edu.berkeley.cs.nlp.ocular.preprocessing.Binarizer; 13 | import edu.berkeley.cs.nlp.ocular.preprocessing.Cropper; 14 | import edu.berkeley.cs.nlp.ocular.preprocessing.LineExtractor; 15 | import edu.berkeley.cs.nlp.ocular.preprocessing.Straightener; 16 | import tberg.murphy.threading.BetterThreader; 17 | 18 | /** 19 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 20 | */ 21 | public class RawImageLoader { 22 | 23 | public static class RawImageDocument implements Document { 24 | private final String baseName; 25 | final PixelType[][][] observations; 26 | 27 | public RawImageDocument(String inputPath, String baseName, int lineHeight, double binarizeThreshold) { 28 | this.baseName = baseName; 29 | double[][] levels = ImageUtils.getLevels(f.readImage(inputPath+"/"+baseName)); 30 | double[][] rotLevels = Straightener.straighten(levels); 31 | double[][] cropLevels = Cropper.crop(rotLevels, binarizeThreshold); 32 | Binarizer.binarizeGlobal(binarizeThreshold, cropLevels); 33 | List lines = LineExtractor.extractLines(cropLevels); 34 | observations = new 
PixelType[lines.size()][][]; 35 | for (int i=0; i= 0) { 37 | observations[i] = ImageUtils.getPixelTypes(ImageUtils.resampleImage(ImageUtils.makeImage(lines.get(i)), lineHeight)); 38 | } else { 39 | observations[i] = ImageUtils.getPixelTypes(ImageUtils.makeImage(lines.get(i))); 40 | } 41 | } 42 | } 43 | 44 | public PixelType[][][] loadLineImages() { 45 | return observations; 46 | } 47 | 48 | public String[][] loadDiplomaticTextLines() { 49 | return null; 50 | } 51 | 52 | public String[][] loadNormalizedTextLines() { 53 | return null; 54 | } 55 | 56 | public List loadNormalizedText() { 57 | return null; 58 | } 59 | 60 | public String baseName() { 61 | return baseName; 62 | } 63 | 64 | } 65 | 66 | public static List loadDocuments(final String inputPath, final int lineHeight, final double binarizeThreshold, final int numThreads) { 67 | System.out.println("Extracting text line images from dataset "+inputPath); 68 | File dir = new File(inputPath); 69 | final String[] dirList = dir.list(new FilenameFilter() { 70 | public boolean accept(File dir, String name) { 71 | if (name.startsWith(".")) { // ignore hidden files 72 | return false; 73 | } 74 | else if (!name.endsWith(".png") && !name.endsWith(".jpg")) { 75 | return false; 76 | } 77 | return true; 78 | } 79 | }); 80 | final Document[] docs = new Document[dirList.length]; 81 | BetterThreader.Function func = new BetterThreader.Function(){public void call(Integer i, Object ignore){ 82 | String baseName = dirList[i]; 83 | docs[i] = new RawImageDocument(inputPath, baseName, lineHeight, binarizeThreshold); 84 | }}; 85 | BetterThreader threader = new BetterThreader(func, numThreads); 86 | for (int i=0; i= 0) { 38 | observations[i] = ImageUtils.getPixelTypes(ImageUtils.resampleImage(f.readImage(imgPathPrefix + i + imgNameSuffix), lineHeight)); 39 | } else { 40 | observations[i] = ImageUtils.getPixelTypes(f.readImage(imgPathPrefix + i + imgNameSuffix)); 41 | } 42 | } catch (Exception e) { 43 | throw new RuntimeException("Couldn't 
read doc from: " + imgPathPrefix + i + imgNameSuffix); 44 | } 45 | } 46 | return observations; 47 | } 48 | 49 | public String[][] loadDiplomaticTextLines() { 50 | File textFile = new File(textPath); 51 | String[][] text = (!textFile.exists() ? null : f.readDocumentByCharacter(textPath, numLines)); 52 | return text; 53 | } 54 | 55 | public String[][] loadNormalizedTextLines() { 56 | return null; 57 | } 58 | 59 | public List loadNormalizedText() { 60 | return null; 61 | } 62 | 63 | public String baseName() { 64 | String[] split = imgPathPrefix.split("/"); 65 | String baseNamePlusHyphen = split[split.length-1]; 66 | return baseNamePlusHyphen.substring(0, baseNamePlusHyphen.length()-1); 67 | } 68 | 69 | public boolean useLongS() { 70 | return useLongS; 71 | } 72 | } 73 | 74 | public static List loadDocuments(String inputPath, int lineHeight) { 75 | List lines = f.readLines(inputPath); 76 | List docs = new ArrayList(); 77 | File inputFile = new File(inputPath); 78 | for (String line : lines) { 79 | if (line.trim().equals("")) continue; 80 | String[] split = line.split("\\s+"); 81 | docs.add(new TextAndLineImagesDocument(inputFile.getParentFile().getAbsolutePath()+"/"+split[0], split[1], inputFile.getParentFile().getAbsolutePath()+"/"+split[2], Boolean.parseBoolean(split[3]), Integer.parseInt(split[4]), lineHeight)); 82 | } 83 | return docs; 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/data/textreader/BasicTextReader.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.data.textreader; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * @author Dan Garrette (dhgarrette@gmail.com) 8 | */ 9 | public class BasicTextReader implements TextReader { 10 | 11 | private boolean treatBackslashAsEscape; 12 | 13 | public BasicTextReader(boolean treatBackslashAsEscape) { 14 | 
package edu.berkeley.cs.nlp.ocular.data.textreader;

import java.util.ArrayList;
import java.util.List;

/**
 * Default TextReader: normalizes quote and tab conventions, then splits a
 * line into diacritic-normalized character strings.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class BasicTextReader implements TextReader {

	// When false, backslashes in the input are escaped up front so they are
	// treated as literal characters rather than escape-sequence markers.
	private boolean treatBackslashAsEscape;

	public BasicTextReader(boolean treatBackslashAsEscape) {
		this.treatBackslashAsEscape = treatBackslashAsEscape;
	}

	/** Backslashes are treated as escapes by default. */
	public BasicTextReader() {
		this.treatBackslashAsEscape = true;
	}

	/** Read several lines; produces one character list per input line. */
	public List<List<String>> readCharacters(List<String> lines) {
		List<List<String>> characterLines = new ArrayList<List<String>>();
		for (String l : lines) {
			characterLines.add(readCharacters(l));
		}
		return characterLines;
	}

	public List<String> readCharacters(String line) {
		if (!treatBackslashAsEscape) {
			line = line.replace("\\", "\\\\"); // make every backslash literal
		}

		// Normalize LaTeX-style quotes and tabs before splitting.
		line = line.replace("``", "\"");
		line = line.replace("''", "\"");
		line = line.replace("\t", " ");

		// Split characters and convert each to its diacritic-normalized form.
		List<String> normalizedChars = new ArrayList<String>();
		for (String c : Charset.readNormalizeCharacters(line)) {
			normalizedChars.add(c);
		}
		return normalizedChars;
	}

	public String toString() {
		return "BasicTextReader(" + treatBackslashAsEscape + ")";
	}
}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import java.util.Collection;

import tberg.murphy.indexer.HashMapIndexer;
import tberg.murphy.indexer.Indexer;

/**
 * An Indexer over character strings that normalizes each character via
 * Charset before delegating to a HashMapIndexer, so equivalent encodings of
 * the same character always map to one index.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class CharIndexer implements Indexer<String> {
	private static final long serialVersionUID = 3212987272223100239L;

	private Indexer<String> delegate;

	public CharIndexer() {
		delegate = new HashMapIndexer<String>();
	}

	public boolean contains(String object) {
		return delegate.contains(Charset.normalizeChar(object));
	}

	public int getIndex(String object) {
		return delegate.getIndex(Charset.normalizeChar(object));
	}

	/** Index every element of the array (normalizing each one). */
	public void index(String[] vect) {
		for (String x : vect) {
			getIndex(x);
		}
	}

	// Pure delegation below; normalization is irrelevant for these.
	public boolean locked() { return delegate.locked(); }
	public void lock() { delegate.lock(); }
	public int size() { return delegate.size(); }
	public String getObject(int index) { return delegate.getObject(index); }
	public void forgetIndexLookup() { delegate.forgetIndexLookup(); }
	public Collection<String> getObjects() { return delegate.getObjects(); }
}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import java.util.ArrayList;
import java.util.List;

/**
 * Rewrites round 's' characters as long-s (ſ) wherever early-modern
 * typography would use one: whenever a letter follows, except that in the
 * sequence "ſsi" the middle 's' stays round.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class ConvertLongSTextReader implements TextReader {

	private TextReader delegate;

	public ConvertLongSTextReader(TextReader delegate) {
		this.delegate = delegate;
	}

	public List<String> readCharacters(String line) {
		List<String> chars = new ArrayList<String>();
		for (String c : delegate.readCharacters(line)) {
			chars.add(c);
		}

		/*
		 * Replace 's' characters with 'long-s' characters.
		 *
		 * The last character is never considered: a word-final 's' cannot be
		 * long since a long-s must be followed by a letter.
		 */
		for (int t = 0; t < chars.size() - 1; t++) {
			if (!chars.get(t).equals("s")) continue;
			String next = chars.get(t + 1);
			String nextWithoutDiacritics = Charset.removeAnyDiacriticFromChar(next);
			// After diacritic removal, only an escaped backslash may be longer than one char.
			if (nextWithoutDiacritics.length() != 1) {
				if (!nextWithoutDiacritics.equals("\\\\")) {
					throw new AssertionError("expected nextWithoutDiacritics [" + nextWithoutDiacritics + "] length() == 1");
				}
			}
			char nextWithoutDiacriticsChar = nextWithoutDiacritics.charAt(0);
			if (t > 0 && chars.get(t - 1).equals(Charset.LONG_S) && nextWithoutDiacriticsChar == 'i') {
				// "ſsi": do nothing
			}
			else if (Character.isAlphabetic(nextWithoutDiacriticsChar)) {
				chars.set(t, Charset.LONG_S);
			}
		}

		return chars;
	}

	public String toString() {
		return "ConvertLongSTextReader(" + delegate + ")";
	}
}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Randomly interchanges 'u'/'v' (and 'U'/'V') to simulate early-modern
 * printers' interchangeable use of the two letterforms.
 *
 * NOTE(review): despite the name, `flipRate` is the probability that a
 * character is KEPT as-is; with probability 1-flipRate it is swapped for its
 * partner.  The RNG is seeded with 0, so output is deterministic across runs.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class FlipUVTextReader implements TextReader {

	private double flipRate;
	private TextReader delegate;

	private Random rand = new Random(0); // fixed seed: deterministic output

	public FlipUVTextReader(double flipRate, TextReader delegate) {
		this.flipRate = flipRate;
		this.delegate = delegate;
	}

	public List<String> readCharacters(String line) {
		List<String> chars = new ArrayList<String>();
		for (String c : delegate.readCharacters(line)) {
			if (c.equals("u")) chars.add(keepOrSwap("u", "v"));
			else if (c.equals("U")) chars.add(keepOrSwap("U", "V"));
			else if (c.equals("v")) chars.add(keepOrSwap("v", "u"));
			else if (c.equals("V")) chars.add(keepOrSwap("V", "U"));
			else chars.add(c);
		}
		return chars;
	}

	/** Draw once: keep `original` with probability flipRate, else use `partner`. */
	private String keepOrSwap(String original, String partner) {
		return rand.nextDouble() < flipRate ? original : partner;
	}

	public String toString() {
		return "FlipUVTextReader(" + flipRate + ", " + delegate + ")";
	}
}
public List readCharacters(String line) { 18 | List chars = new ArrayList(); 19 | for (String c : delegate.readCharacters(line)) { 20 | chars.add(Charset.removeAnyDiacriticFromChar(c)); 21 | } 22 | return chars; 23 | } 24 | 25 | public String toString() { 26 | return "RemoveAllDiacriticsTextReader(" + delegate + ")"; 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/data/textreader/ReplaceSomeTextReader.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.data.textreader; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.Iterator; 6 | import java.util.List; 7 | 8 | import edu.berkeley.cs.nlp.ocular.util.CollectionHelper; 9 | import edu.berkeley.cs.nlp.ocular.util.Tuple2; 10 | import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2; 11 | import tberg.murphy.fileio.f; 12 | 13 | /** 14 | * @author Dan Garrette (dhgarrette@gmail.com) 15 | */ 16 | public class ReplaceSomeTextReader implements TextReader { 17 | 18 | private final List, List>, Integer>> rules; 19 | private final TextReader delegate; 20 | private final int[] occurrences; 21 | 22 | /** 23 | * @param delegate 24 | * @param rules <, each> Replace `input` by `output` every `each` occurrences 25 | */ 26 | public ReplaceSomeTextReader(List, List>, Integer>> rules, TextReader delegate) { 27 | this.rules = rules; 28 | this.delegate = delegate; 29 | this.occurrences = new int[rules.size()]; 30 | } 31 | 32 | public List readCharacters(String line) { 33 | List result = delegate.readCharacters(line); 34 | for (int i = 0; i < rules.size(); ++i) { 35 | Tuple2, List>, Integer> r = rules.get(i); 36 | List input = r._1._1; 37 | List output = r._1._2; 38 | int each = r._2; 39 | List newResult = new ArrayList(); 40 | for (int j = 0; j < input.size() - 1; ++j) { 41 | // add some buffer to the end so sliding goes to the 
package edu.berkeley.cs.nlp.ocular.data.textreader;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import edu.berkeley.cs.nlp.ocular.util.CollectionHelper;
import edu.berkeley.cs.nlp.ocular.util.Tuple2;
import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2;
import tberg.murphy.fileio.f;

/**
 * Applies replacement rules of the form "replace `input` with `output` on
 * every `each`-th occurrence".  Occurrence counts persist across calls, so
 * the replacement cadence carries over from line to line.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class ReplaceSomeTextReader implements TextReader {

	private final List<Tuple2<Tuple2<List<String>, List<String>>, Integer>> rules;
	private final TextReader delegate;
	// Running occurrence count for each rule, parallel to `rules`.
	private final int[] occurrences;

	/**
	 * @param rules ((input, output), each): replace `input` by `output` every `each` occurrences
	 * @param delegate
	 */
	public ReplaceSomeTextReader(List<Tuple2<Tuple2<List<String>, List<String>>, Integer>> rules, TextReader delegate) {
		this.rules = rules;
		this.delegate = delegate;
		this.occurrences = new int[rules.size()];
	}

	public List<String> readCharacters(String line) {
		List<String> result = delegate.readCharacters(line);
		for (int i = 0; i < rules.size(); ++i) {
			Tuple2<Tuple2<List<String>, List<String>>, Integer> rule = rules.get(i);
			List<String> input = rule._1._1;
			List<String> output = rule._1._2;
			int each = rule._2;

			// Pad the tail with nulls so the sliding window can start at every
			// original position.
			for (int j = 0; j < input.size() - 1; ++j) {
				result.add(null);
			}

			List<String> newResult = new ArrayList<String>();
			Iterator<List<String>> windows = CollectionHelper.sliding(result, input.size());
			while (windows.hasNext()) {
				List<String> window = windows.next();
				if (window.equals(input)) {
					if (occurrences[i] % each == each - 1) {
						newResult.addAll(output); // replace this occurrence of `input`
						for (int j = 0; j < input.size() - 1; ++j) {
							windows.next(); // skip the windows covering the rest of `input`
						}
					}
					else {
						newResult.add(window.get(0)); // not this occurrence: keep as-is
					}
					++occurrences[i];
				}
				else {
					newResult.add(window.get(0));
				}
			}
			result = newResult;
		}
		return result;
	}

	/** Load rules from a file of tab-separated triples: input, output, each. */
	public static List<Tuple2<Tuple2<List<String>, List<String>>, Integer>> loadRulesFromFile(String path) {
		List<Tuple2<Tuple2<List<String>, List<String>>, Integer>> result = new ArrayList<Tuple2<Tuple2<List<String>, List<String>>, Integer>>();
		for (String line : f.readLines(path)) {
			if (line.trim().isEmpty()) continue;
			String[] parts = line.split("\t");
			if (parts.length != 3) throw new RuntimeException("line does not contain 3 parts. found: " + Arrays.asList(parts));
			result.add(Tuple2(Tuple2(Charset.readNormalizeCharacters(parts[0]), Charset.readNormalizeCharacters(parts[1])), Integer.valueOf(parts[2])));
		}
		return result;
	}

	public String toString() {
		return "ReplaceSomeTextReader(rules=..., " + delegate + ")";
	}
}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import java.util.List;

/**
 * Turns raw text lines into lists of normalized character strings; concrete
 * implementations may also filter or rewrite characters along the way.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public interface TextReader {

	/**
	 * @param line A line of text, possibly containing diacritics (precomposed, composed, or escaped).
	 * @return A list of normalized characters.
	 */
	public List<String> readCharacters(String line);

}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Filters the delegate's output, keeping only whitelisted characters (space
 * is always allowed).  Optionally, a diacritic-bearing character whose base
 * form is whitelisted is also kept.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class WhitelistCharacterSetTextReader implements TextReader {

	private Set<String> allValidCharacters = new HashSet<String>();
	private boolean disregardDiacritics;
	private TextReader delegate;

	/**
	 * @param validCharacters The set of characters that are allowed.
	 * Any other character will be skipped.
	 * @param disregardDiacritics If true, then a character with a diacritic
	 * will be considered valid even if only its non-diacritic version is in
	 * the validCharcters set.
	 * @param delegate
	 */
	public WhitelistCharacterSetTextReader(Set<String> validCharacters, boolean disregardDiacritics, TextReader delegate) {
		if (validCharacters.isEmpty()) {
			throw new RuntimeException("validCharacters is empty in WhitelistCharacterSetTextReader constructor");
		}

		for (String c : validCharacters) {
			allValidCharacters.add(Charset.normalizeChar(c));
		}
		allValidCharacters.add(Charset.SPACE); // space is always valid

		this.disregardDiacritics = disregardDiacritics;
		this.delegate = delegate;
	}

	/** Diacritics are NOT disregarded by default. */
	public WhitelistCharacterSetTextReader(Set<String> validCharacters, TextReader delegate) {
		this(validCharacters, false, delegate);
	}

	public List<String> readCharacters(String line) {
		List<String> kept = new ArrayList<String>();
		for (String c : delegate.readCharacters(line)) {
			if (allValidCharacters.contains(c)) {
				kept.add(c);
			}
			else if (disregardDiacritics && allValidCharacters.contains(Charset.removeAnyDiacriticFromChar(c))) {
				kept.add(c); // note: the original diacritic-bearing form is kept
			}
		}
		return kept;
	}

	public String toString() {
		return "WhitelistCharacterSetTextReader(" + delegate + ")";
	}
}
package edu.berkeley.cs.nlp.ocular.eval;

import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2;

import java.io.File;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import java.util.Map;
import java.util.Set;

import edu.berkeley.cs.nlp.ocular.data.Document;
import edu.berkeley.cs.nlp.ocular.eval.Evaluator.EvalSuffStats;
import edu.berkeley.cs.nlp.ocular.font.Font;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphSubstitutionModel;
import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel;
import edu.berkeley.cs.nlp.ocular.main.FonttrainTranscribeShared.OutputFormat;
import edu.berkeley.cs.nlp.ocular.model.CharacterTemplate;
import edu.berkeley.cs.nlp.ocular.model.DecodeState;
import edu.berkeley.cs.nlp.ocular.model.DecoderEM;
import edu.berkeley.cs.nlp.ocular.model.em.DenseBigramTransitionModel;
import edu.berkeley.cs.nlp.ocular.train.FontTrainer;
import edu.berkeley.cs.nlp.ocular.util.Tuple2;
import tberg.murphy.indexer.Indexer;

/**
 * Transcribe all documents, write their results to files, and evaluate the
 * results.  Per-document evaluation suffix statistics are accumulated and,
 * when the input path is a directory, written to aggregate summary files.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class BasicMultiDocumentTranscriber implements MultiDocumentTranscriber {
	private List<Document> documents;
	private String inputDocPath;
	private String outputPath;
	private Set<OutputFormat> outputFormats;
	private DecoderEM decoderEM;
	private SingleDocumentEvaluatorAndOutputPrinter docOutputPrinterAndEvaluator;
	private Indexer<String> charIndexer;
	// When true, a document that throws during decoding is logged and skipped
	// instead of aborting the whole run.
	private boolean skipFailedDocs;

	public BasicMultiDocumentTranscriber(
			List<Document> documents, String inputDocPath, String outputPath, Set<OutputFormat> outputFormats,
			DecoderEM decoderEM,
			SingleDocumentEvaluatorAndOutputPrinter documentOutputPrinterAndEvaluator,
			Indexer<String> charIndexer,
			boolean skipFailedDocs) {
		this.documents = documents;
		this.inputDocPath = inputDocPath;
		this.outputPath = outputPath;
		this.outputFormats = outputFormats;
		this.decoderEM = decoderEM;
		this.docOutputPrinterAndEvaluator = documentOutputPrinterAndEvaluator;
		this.charIndexer = charIndexer;
		this.skipFailedDocs = skipFailedDocs;
	}

	/** Transcribe outside of any training loop (iteration and batch zero). */
	public void transcribe(Font font, CodeSwitchLanguageModel lm, GlyphSubstitutionModel gsm) {
		transcribe(0, 0, font, lm, gsm);
	}

	public void transcribe(int iter, int batchId, Font font, CodeSwitchLanguageModel lm, GlyphSubstitutionModel gsm) {
		int numDocs = documents.size();
		CharacterTemplate[] templates = FontTrainer.loadTemplates(font, charIndexer);
		DenseBigramTransitionModel backwardTransitionModel = new DenseBigramTransitionModel(lm);

		double totalJointLogProb = 0.0;
		List<Tuple2<String, Map<String, EvalSuffStats>>> allDiplomaticEvals = new ArrayList<Tuple2<String, Map<String, EvalSuffStats>>>();
		List<Tuple2<String, Map<String, EvalSuffStats>>> allNormalizedEvals = new ArrayList<Tuple2<String, Map<String, EvalSuffStats>>>();
		for (int docNum = 0; docNum < numDocs; ++docNum) {
			Document doc = documents.get(docNum);
			System.out.println((iter > 0 ? "Training iteration "+iter+", " : "") + (batchId > 0 ? "batch "+batchId+", " : "") + "Transcribing eval document "+(docNum+1)+" of "+numDocs+": "+doc.baseName() + " " + (new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(Calendar.getInstance().getTime())));

			try {
				// The E-step decode yields the best transcription states plus the joint log prob.
				Tuple2<DecodeState[][], Double> decodeResults = decoderEM.computeEStep(doc, false, lm, gsm, templates, backwardTransitionModel);
				final DecodeState[][] decodeStates = decodeResults._1;
				totalJointLogProb += decodeResults._2;

				// _1 holds the diplomatic eval, _2 the normalized eval; either may be null.
				Tuple2<Map<String, EvalSuffStats>, Map<String, EvalSuffStats>> evals = docOutputPrinterAndEvaluator.evaluateAndPrintTranscription(iter, batchId, doc, decodeStates, inputDocPath, outputPath, outputFormats, lm);
				if (evals._1 != null) allDiplomaticEvals.add(Tuple2(doc.baseName(), evals._1));
				if (evals._2 != null) allNormalizedEvals.add(Tuple2(doc.baseName(), evals._2));
			} catch (RuntimeException e) {
				if (skipFailedDocs) {
					System.err.println("DOCUMENT FAILED! Skipping " + doc.baseName());
					e.printStackTrace();
				} else {
					throw e;
				}
			}
		}
		double avgLogProb = totalJointLogProb / numDocs;
		System.out.println("Iteration "+iter+", batch "+batchId+": eval avg joint log prob: " + avgLogProb);

		// Aggregate summary files are only written when the input was a directory.
		if (new File(inputDocPath).isDirectory()) {
			String preext = "eval";
			String outputFilenameBase = outputPath + "/all_transcriptions/" + new File(inputDocPath).getName() + "/" + preext;
			if (iter > 0) outputFilenameBase += "_iter-" + iter;
			if (batchId > 0) outputFilenameBase += "_batch-" + batchId;
			if (!allDiplomaticEvals.isEmpty())
				EvalPrinter.printEvaluation(allDiplomaticEvals, outputFilenameBase + "_diplomatic.txt");
			if (!allNormalizedEvals.isEmpty())
				EvalPrinter.printEvaluation(allNormalizedEvals, outputFilenameBase + "_normalized.txt");
		}
	}

}
-------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/eval/ErrorSampler.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.Collections; 6 | import java.util.List; 7 | import java.util.Random; 8 | 9 | import edu.berkeley.cs.nlp.ocular.eval.MarkovEditDistanceComputer.EditDistanceParams; 10 | import tberg.murphy.fileio.f; 11 | import tberg.murphy.tuple.Pair; 12 | 13 | /** 14 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 15 | */ 16 | public class ErrorSampler { 17 | 18 | public static class Error implements Comparable { 19 | public final int docIdx; 20 | public final int lineIdx; 21 | public final int guessTokenIdx; 22 | public final String guess; 23 | public final String gold; 24 | 25 | public static final String INSERTION = ""; 26 | public static final String DELETION = ""; 27 | 28 | public Error(int docIdx, int lineIdx, int guessColumn, String guess, String gold) { 29 | this.docIdx = docIdx; 30 | this.lineIdx = lineIdx; 31 | this.guessTokenIdx = guessColumn; 32 | this.guess = guess; 33 | this.gold = gold; 34 | } 35 | 36 | @Override 37 | public int compareTo(Error e1) { 38 | if (this.docIdx != e1.docIdx) { 39 | return this.docIdx - e1.docIdx; 40 | } else if (this.lineIdx != e1.lineIdx) { 41 | return this.lineIdx - e1.lineIdx; 42 | } 43 | return this.guessTokenIdx - e1.guessTokenIdx; 44 | } 45 | 46 | public String toString() { 47 | return "Doc " + docIdx + ", line " + lineIdx + ", guess idx " + guessTokenIdx + ": guess = " + guess + ", gold = " + gold; 48 | } 49 | 50 | } 51 | 52 | public static void main(String[] args) { 53 | List errors = aggregateWordErrors(args); 54 | final int NUM_ERRORS = 50; 55 | Collections.shuffle(errors, new Random(0)); 56 | List selectedErrors = errors.subList(0, Math.min(errors.size(), 
NUM_ERRORS)); 57 | Collections.sort(selectedErrors); 58 | for (int i = 0; i < selectedErrors.size(); i++) { 59 | System.out.println(selectedErrors.get(i).toString()); 60 | } 61 | } 62 | 63 | public static List aggregateWordErrors(String[] fileNames) { 64 | List allErrors = new ArrayList(); 65 | for (int fileIdx = 0; fileIdx < fileNames.length; fileIdx++) { 66 | String fileName = fileNames[fileIdx]; 67 | Pair,List> goldGuessLines = getGoldGuessLinesFromOutput(fileName); 68 | List goldLines = goldGuessLines.getFirst(); 69 | List guessLines = goldGuessLines.getSecond(); 70 | assert goldLines.size() == guessLines.size(); 71 | for (int i = 0; i < goldLines.size(); i++) { 72 | String goldStr = goldLines.get(i).replaceAll("\\|", "s"); 73 | String guessStr = guessLines.get(i).replaceAll("\\|", "s"); 74 | Form guessForm = Form.wordsAsGlyphs(Arrays.asList(guessStr.split("\\s+"))); 75 | Form goldForm = Form.wordsAsGlyphs(Arrays.asList(goldStr.split("\\s+"))); 76 | EditDistanceParams params = EditDistanceParams.getStandardParams(guessForm, goldForm, false); 77 | MarkovEditDistanceComputer medc = new MarkovEditDistanceComputer(params); 78 | AlignedFormPair alignedPair = medc.runEditDistance(); 79 | assert alignedPair.trg.length() == goldForm.length(); 80 | int srcGuessIdx = 0; 81 | int trgGoldIdx = 0; 82 | for (Operation op : alignedPair.ops) { 83 | switch (op) { 84 | case EQUAL: 85 | srcGuessIdx++; 86 | trgGoldIdx++; 87 | break; 88 | case SUBST: 89 | allErrors.add(new Error(fileIdx, i, srcGuessIdx, guessForm.charAt(srcGuessIdx).toString(), goldForm.charAt(trgGoldIdx).toString())); 90 | srcGuessIdx++; 91 | trgGoldIdx++; 92 | break; 93 | case INSERT: 94 | allErrors.add(new Error(fileIdx, i, srcGuessIdx, Error.INSERTION, goldForm.charAt(trgGoldIdx).toString())); 95 | trgGoldIdx++; 96 | break; 97 | case DELETE: 98 | allErrors.add(new Error(fileIdx, i, srcGuessIdx, guessForm.charAt(srcGuessIdx).toString(), Error.DELETION)); 99 | srcGuessIdx++; 100 | break; 101 | } 102 | } 103 | } 
104 | System.out.println("Processed file " + fileNames[fileIdx] + " with " + goldLines.size() + " lines, cumulative errors = " + allErrors.size()); 105 | } 106 | return allErrors; 107 | } 108 | 109 | public static Pair,List> getGoldGuessLinesFromOutput(String outFile) { 110 | List lines = f.readLines(outFile); 111 | List guessLines = new ArrayList(); 112 | List goldLines = new ArrayList(); 113 | for (int i = 0; i < lines.size(); i++) { 114 | String currLine = lines.get(i).trim(); 115 | if (i % 3 == 0 && currLine.equals("")) { 116 | break; 117 | } 118 | switch (i % 3) { 119 | case 0: guessLines.add(currLine); 120 | break; 121 | case 1: goldLines.add(currLine); 122 | break; 123 | case 2: assert currLine.equals(""); 124 | break; 125 | } 126 | } 127 | return Pair.makePair(goldLines, guessLines); 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/eval/EvalPrinter.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | import java.util.HashMap; 4 | import java.util.List; 5 | import java.util.Map; 6 | 7 | import edu.berkeley.cs.nlp.ocular.eval.Evaluator.EvalSuffStats; 8 | import edu.berkeley.cs.nlp.ocular.util.FileHelper; 9 | import edu.berkeley.cs.nlp.ocular.util.Tuple2; 10 | 11 | /** 12 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 13 | * @author Dan Garrette (dhgarrette@gmail.com) 14 | */ 15 | public class EvalPrinter { 16 | 17 | public static void printEvaluation(List>> allEvals, String outputPath) { 18 | Map totalSuffStats = new HashMap(); 19 | StringBuffer buf = new StringBuffer(); 20 | buf.append("All evals:\n"); 21 | for (Tuple2> docNameAndEvals : allEvals) { 22 | String docName = docNameAndEvals._1; 23 | Map evals = docNameAndEvals._2; 24 | buf.append("Document: " + docName + "\n"); 25 | buf.append(Evaluator.renderEval(evals) + "\n"); 26 | for (String evalType : 
evals.keySet()) { 27 | EvalSuffStats eval = evals.get(evalType); 28 | EvalSuffStats totalEval = totalSuffStats.get(evalType); 29 | if (totalEval == null) { 30 | totalEval = new EvalSuffStats(); 31 | totalSuffStats.put(evalType, totalEval); 32 | } 33 | totalEval.increment(eval); 34 | } 35 | } 36 | 37 | buf.append("\nMacro-avg total eval:\n"); 38 | buf.append(Evaluator.renderEval(totalSuffStats) + "\n"); 39 | 40 | FileHelper.writeString(outputPath, buf.toString()); 41 | System.out.println("\n" + outputPath); 42 | System.out.println(buf.toString()); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/eval/Form.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.Iterator; 6 | import java.util.List; 7 | 8 | import edu.berkeley.cs.nlp.ocular.data.textreader.Charset; 9 | import edu.berkeley.cs.nlp.ocular.util.Tuple2; 10 | 11 | /** 12 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 13 | */ 14 | public class Form implements Comparable
{ 15 | 16 | private final List glyphs; 17 | 18 | public Form(List glyphs) { 19 | this.glyphs = glyphs; 20 | } 21 | 22 | public static Form charsAsGlyphs(String str) { 23 | return charsAsGlyphs(str, true); 24 | } 25 | 26 | /** 27 | * 28 | * @param str 29 | * @param charIncludesDiacritic If false, the diacritic will be scored separately from the base character. 30 | * @return 31 | */ 32 | public static Form charsAsGlyphs(String str, boolean charIncludesDiacritic) { 33 | List glyphs = new ArrayList(); 34 | for (String c : Charset.readNormalizeCharacters(str)) { 35 | if (charIncludesDiacritic) { 36 | glyphs.add(new Glyph(c)); 37 | } 38 | else { 39 | Tuple2> letterAndNormalDiacritics = Charset.normalizeCharSeparateDiacritics(c); 40 | Collections.sort(letterAndNormalDiacritics._2); 41 | for (String diacritic : letterAndNormalDiacritics._2) { 42 | glyphs.add(new Glyph(diacritic)); 43 | } 44 | glyphs.add(new Glyph(letterAndNormalDiacritics._1)); 45 | } 46 | } 47 | return new Form(glyphs); 48 | } 49 | 50 | public static Form wordsAsGlyphs(List words) { 51 | List glyphs = new ArrayList(); 52 | for (int i = 0; i < words.size(); i++) { 53 | glyphs.add(new Glyph(words.get(i))); 54 | } 55 | return new Form(glyphs); 56 | } 57 | 58 | public Form substring(int start) { 59 | return substring(start, length()); 60 | } 61 | 62 | public Form substring(int start, int end) { 63 | return new Form(glyphs.subList(start, end)); 64 | } 65 | 66 | public int length() { 67 | return glyphs.size(); 68 | } 69 | 70 | public Glyph charAt(int index) { 71 | return glyphs.get(index); 72 | } 73 | 74 | public Form append(Form other) { 75 | List newGlyphs = new ArrayList(); 76 | newGlyphs.addAll(this.glyphs); 77 | newGlyphs.addAll(other.glyphs); 78 | return new Form(newGlyphs); 79 | } 80 | 81 | @Override 82 | public boolean equals(Object other) { 83 | if (other == null || !(other instanceof Form)) { 84 | return false; 85 | } 86 | return this.glyphs.equals(((Form)other).glyphs); 87 | } 88 | 89 | @Override 90 
| public int hashCode() { 91 | return this.glyphs.hashCode(); 92 | } 93 | 94 | @Override 95 | public String toString() { 96 | String ret = ""; 97 | for (Glyph glyph : glyphs) { 98 | ret += glyph.toString(); 99 | } 100 | return ret; 101 | } 102 | 103 | public String toStringWithSpaces() { 104 | String ret = ""; 105 | for (Glyph glyph : glyphs) { 106 | ret += glyph.toString() + " "; 107 | } 108 | return ret; 109 | } 110 | 111 | @Override 112 | public int compareTo(Form o) { 113 | return compareCollections(this.glyphs, o.glyphs); 114 | } 115 | 116 | public static > int compareCollections(Iterable col1, Iterable col2) { 117 | Iterator first = col1.iterator(); 118 | Iterator second = col2.iterator(); 119 | while (first.hasNext() && second.hasNext()) { 120 | int result = first.next().compareTo(second.next()); 121 | if (result != 0) { 122 | return result; 123 | } 124 | } 125 | if (!first.hasNext() && !second.hasNext()) { 126 | return 0; 127 | } 128 | // Longer one comes second 129 | return (first.hasNext() ? 
1 : -1); 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/eval/Glyph.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | /** 4 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 5 | */ 6 | public class Glyph implements Comparable { 7 | 8 | public final String glyph; 9 | 10 | public Glyph(String glyph) { 11 | this.glyph = glyph; 12 | } 13 | 14 | @Override 15 | public boolean equals(Object other) { 16 | if (other == null || !(other instanceof Glyph)) { 17 | return false; 18 | } 19 | return this.glyph.equals(((Glyph)other).glyph); 20 | } 21 | 22 | @Override 23 | public int hashCode() { 24 | return glyph.hashCode(); 25 | } 26 | 27 | @Override 28 | public String toString() { 29 | return glyph; 30 | } 31 | 32 | @Override 33 | public int compareTo(Glyph o) { 34 | return this.glyph.compareTo(o.glyph); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/eval/LmPerplexity.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | import java.util.List; 4 | 5 | import edu.berkeley.cs.nlp.ocular.data.textreader.Charset; 6 | import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel; 7 | import edu.berkeley.cs.nlp.ocular.util.CollectionHelper; 8 | 9 | /** 10 | * @author Hannah Alpert-Abrams (halperta@gmail.com) 11 | * @author Dan Garrette (dhgarrette@gmail.com) 12 | */ 13 | public class LmPerplexity { 14 | 15 | private CodeSwitchLanguageModel lm; 16 | 17 | private final int spaceIndex; 18 | 19 | public LmPerplexity(CodeSwitchLanguageModel lm) { 20 | this.lm = lm; 21 | this.spaceIndex = lm.getCharacterIndexer().getIndex(Charset.SPACE); 22 | } 23 | 24 | public double perplexity(List 
viterbiNormalizedTranscriptionCharIndices, List viterbiNormalizedTranscriptionLangIndices) { 25 | double logTotalProbability = 0.0; 26 | for (int i=0; i viterbiNormalizedTranscriptionCharIndices, List viterbiNormalizedTranscriptionLangIndices) { 43 | int startPoint = findStartPoint(i, curL, viterbiNormalizedTranscriptionLangIndices); 44 | int[] context = CollectionHelper.intListToArray(viterbiNormalizedTranscriptionCharIndices.subList(startPoint, i)); 45 | return lm.get(curL).getCharNgramProb(context, curC); 46 | } 47 | 48 | private int findStartPoint(int i, int curL, List viterbiNormalizedTranscriptionLangIndices) { 49 | int startPoint = i; 50 | while (startPoint > 0 && getLangIndex(viterbiNormalizedTranscriptionLangIndices, startPoint-1) == curL && i-startPoint < lm.get(curL).getMaxOrder()-1) { 51 | --startPoint; 52 | } 53 | return startPoint; 54 | } 55 | 56 | private double getLangTransitionProb(int i, int curL, List viterbiNormalizedTranscriptionCharIndices, List viterbiNormalizedTranscriptionLangIndices) { 57 | if (i > 0) { 58 | int prevC = viterbiNormalizedTranscriptionCharIndices.get(i-1); 59 | int prevL = getLangIndex(viterbiNormalizedTranscriptionLangIndices, i-1); 60 | if (prevC != spaceIndex) { 61 | if (prevL != curL) throw new RuntimeException("Characters cannot change languages mid-word."); 62 | return 1.0; 63 | } 64 | else { 65 | return lm.languageTransitionProb(prevL, curL); 66 | } 67 | } 68 | else { 69 | return lm.languagePrior(curL); 70 | } 71 | } 72 | 73 | private int getLangIndex(List viterbiNormalizedTranscriptionLangIndices, int i) { 74 | int curL = viterbiNormalizedTranscriptionLangIndices.get(i); 75 | if (curL < 0) { 76 | if (this.lm.getLanguageIndexer().size() == 1) 77 | curL = 0; 78 | else if (i > 0) 79 | throw new RuntimeException("curl="+curL+", i="+i); 80 | } 81 | return curL; 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- 
/src/main/java/edu/berkeley/cs/nlp/ocular/eval/MultiDocumentTranscriber.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | import edu.berkeley.cs.nlp.ocular.font.Font; 4 | import edu.berkeley.cs.nlp.ocular.gsm.GlyphSubstitutionModel; 5 | import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel; 6 | 7 | /** 8 | * @author Dan Garrette (dhgarrette@gmail.com) 9 | */ 10 | public interface MultiDocumentTranscriber { 11 | 12 | public void transcribe(Font font, CodeSwitchLanguageModel lm, GlyphSubstitutionModel gsm); 13 | public void transcribe(int iter, int batchId, Font font, CodeSwitchLanguageModel lm, GlyphSubstitutionModel gsm); 14 | 15 | /** 16 | * No-op evaluator implementation 17 | */ 18 | public static class NoOpMultiDocumentTranscriber implements MultiDocumentTranscriber { 19 | public void transcribe(Font font, CodeSwitchLanguageModel lm, GlyphSubstitutionModel gsm) {} 20 | public void transcribe(int iter, int batchId, Font font, CodeSwitchLanguageModel lm, GlyphSubstitutionModel gsm) {} 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/eval/Operation.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 8 | */ 9 | public enum Operation { 10 | 11 | EQUAL, SUBST, INSERT, DELETE; 12 | 13 | public static String opToString(Operation op) { 14 | switch (op) { 15 | case EQUAL: return "="; 16 | case SUBST: return "S"; 17 | case INSERT : return "I"; 18 | case DELETE : return "D"; 19 | default : throw new RuntimeException("Bad op: " + op); 20 | } 21 | } 22 | 23 | public static String opsToString(List ops) { 24 | String opsStr = ""; 25 | for (Operation op : ops) { 26 | opsStr += 
opToString(op); 27 | } 28 | return opsStr; 29 | } 30 | 31 | public static Operation charToOp(char opChar) { 32 | switch (opChar) { 33 | case '=': return EQUAL; 34 | case 'S': return SUBST; 35 | case 'I': return INSERT; 36 | case 'D': return DELETE; 37 | default : throw new RuntimeException("Bad op string: " + opChar); 38 | } 39 | } 40 | 41 | public static List stringToOps(String opsStr) { 42 | List ops = new ArrayList(); 43 | for (int i = 0; i < opsStr.length(); i++) { 44 | ops.add(charToOp(opsStr.charAt(i))); 45 | } 46 | return ops; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/eval/SingleDocumentEvaluatorAndOutputPrinter.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | import java.util.Map; 4 | import java.util.Set; 5 | 6 | import edu.berkeley.cs.nlp.ocular.data.Document; 7 | import edu.berkeley.cs.nlp.ocular.eval.Evaluator.EvalSuffStats; 8 | import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel; 9 | import edu.berkeley.cs.nlp.ocular.main.FonttrainTranscribeShared.OutputFormat; 10 | import edu.berkeley.cs.nlp.ocular.model.DecodeState; 11 | import edu.berkeley.cs.nlp.ocular.util.Tuple2; 12 | 13 | /** 14 | * @author Dan Garrette (dhgarrette@gmail.com) 15 | */ 16 | public interface SingleDocumentEvaluatorAndOutputPrinter { 17 | 18 | public Tuple2,Map> evaluateAndPrintTranscription(int iter, int batchId, 19 | Document doc, 20 | DecodeState[][] decodeStates, 21 | String inputDocPath, String outputPath, Set outputFormats, 22 | CodeSwitchLanguageModel lm); 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/font/Font.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.font; 2 | 3 | import java.io.Serializable; 4 | import 
java.util.Map; 5 | 6 | import edu.berkeley.cs.nlp.ocular.model.CharacterTemplate; 7 | 8 | /** 9 | * @author Dan Garrette (dhgarrette@gmail.com) 10 | */ 11 | public class Font implements Serializable { 12 | private static final long serialVersionUID = 1L; 13 | 14 | public final Map charTemplates; 15 | 16 | public Font(Map charTemplates) { 17 | this.charTemplates = charTemplates; 18 | } 19 | 20 | public CharacterTemplate get(String character) { 21 | return charTemplates.get(character); 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/gsm/GlyphChar.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.gsm; 2 | 3 | import java.io.Serializable; 4 | 5 | import tberg.murphy.indexer.Indexer; 6 | 7 | /** 8 | * @author Dan Garrette (dhgarrette@gmail.com) 9 | */ 10 | public class GlyphChar implements Serializable { 11 | private static final long serialVersionUID = 1L; 12 | 13 | public enum GlyphType { 14 | ELISION_TILDE, // this glyph is marked with a tilde indicating that some subsequent letter have been elided 15 | TILDE_ELIDED, // this (empty) glyph appears after an "elision tilde" 16 | FIRST_ELIDED, // this (empty) glyph results from the elision of the first letter of a word 17 | DOUBLED, // this glyph marks an empty LM character whose glyph is a duplicate of the next glyph, which is just a rendering of its LM character 18 | //RMRGN_HPHN_DROP, // this glyph marks a right-margin line-breaking hyphen is not printed 19 | ELIDED, // this (empty) glyph results from the elision a character 20 | NORMAL_CHAR }; // 21 | 22 | public final int templateCharIndex; 23 | public final GlyphType glyphType; 24 | 25 | public GlyphChar(int templateCharIndex, GlyphType glyphType) { 26 | this.templateCharIndex = templateCharIndex; 27 | this.glyphType = glyphType; 28 | } 29 | 30 | public boolean isElided() { 31 | switch 
(glyphType) { 32 | case TILDE_ELIDED: 33 | case FIRST_ELIDED: 34 | case ELIDED: 35 | return true; 36 | default: 37 | return false; 38 | } 39 | } 40 | 41 | public boolean equals(Object o) { 42 | if (this == o) return true; 43 | if (!(o instanceof GlyphChar)) return false; 44 | final GlyphChar gc = (GlyphChar) o; 45 | return templateCharIndex == gc.templateCharIndex && glyphType == gc.glyphType; 46 | } 47 | 48 | public int hashCode() { 49 | return 29 * templateCharIndex + 17 * (glyphType.ordinal()); 50 | } 51 | 52 | public String toString() { 53 | return "GlyphChar(templateCharIndex="+templateCharIndex+", glyphType="+glyphType+")"; 54 | } 55 | 56 | public String toString(Indexer charIndexer) { 57 | return "GlyphChar("+charIndexer.getObject(templateCharIndex)+", "+glyphType+")"; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/gsm/GlyphSubstitutionModel.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.gsm; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * @author Dan Garrette (dhgarrette@gmail.com) 7 | */ 8 | public interface GlyphSubstitutionModel extends Serializable { 9 | 10 | public double glyphProb(int language, int lmChar, GlyphChar glyphChar); 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/gsm/NoSubGlyphSubstitutionModel.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.gsm; 2 | 3 | import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar.GlyphType; 4 | 5 | /** 6 | * @author Dan Garrette (dhgarrette@gmail.com) 7 | */ 8 | public class NoSubGlyphSubstitutionModel implements GlyphSubstitutionModel { 9 | private static final long serialVersionUID = 1L; 10 | 11 | public NoSubGlyphSubstitutionModel() { 12 | } 13 | 14 | public double 
glyphProb(int language, int lmChar, GlyphChar glyphChar) { 15 | return (glyphChar.glyphType == GlyphType.NORMAL_CHAR && lmChar == glyphChar.templateCharIndex) ? 1.0 : 0.0; 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/CodeSwitchLanguageModel.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | import java.io.Serializable; 4 | 5 | import tberg.murphy.indexer.Indexer; 6 | 7 | /** 8 | * @author Dan Garrette (dhgarrette@gmail.com) 9 | */ 10 | public interface CodeSwitchLanguageModel extends LanguageModel, Serializable { 11 | 12 | public Indexer getLanguageIndexer(); 13 | 14 | public SingleLanguageModel get(int language); 15 | public double languagePrior(int language); 16 | public double languageTransitionProb(int fromLanguage, int destinationLanguage); 17 | public double getProbKeepSameLanguage(); 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/CountDb.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | /** 4 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 5 | */ 6 | public interface CountDb { 7 | 8 | public long getNumTokens(); 9 | 10 | public int getNumBigramTypes(); 11 | 12 | public int currSize(); 13 | 14 | public int totalSize(); 15 | 16 | public long[] getKeys(); 17 | 18 | public int getCount(long key, CountType countType); 19 | 20 | public int getCount(NgramWrapper ngram, CountType countType); 21 | 22 | public void incrementBigramTypes(); 23 | 24 | /** 25 | * @return The old count of the ngram (pre-update), but only if we do token counts 26 | */ 27 | public int incrementCount(NgramWrapper ngram, CountType countType); 28 | 29 | public void maybeResize(); 30 | 31 | public String 
getStringAnalysis(); 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/CountType.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | /** 4 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 5 | */ 6 | public enum CountType 7 | { 8 | TOKEN_INDEX(0), 9 | HISTORY_TYPE_INDEX(1), 10 | LOWER_ORDER_TYPE_INDEX(2), 11 | LOWER_ORDER_TYPE_NORMALIZER(3); 12 | 13 | private final int index; 14 | 15 | private CountType(int index) { 16 | this.index = index; 17 | } 18 | 19 | public int getIndex() { 20 | return index; 21 | } 22 | } -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/InterpolatingSingleLanguageModel.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | import java.util.List; 4 | import java.util.Set; 5 | 6 | import edu.berkeley.cs.nlp.ocular.util.ArrayHelper; 7 | import edu.berkeley.cs.nlp.ocular.util.Tuple2; 8 | import tberg.murphy.indexer.Indexer; 9 | 10 | /** 11 | * @author Dan Garrette (dhgarrette@gmail.com) 12 | */ 13 | public class InterpolatingSingleLanguageModel implements SingleLanguageModel { 14 | private static final long serialVersionUID = 1L; 15 | 16 | private SingleLanguageModel[] subModels; 17 | private double[] interpWeights; 18 | private int numModels; 19 | 20 | private Indexer charIndexer = null; 21 | private Set activeCharacters = null; 22 | private int maxOrder = -1; 23 | 24 | public InterpolatingSingleLanguageModel(List> subModelsAndinterpWeights) { 25 | numModels = subModelsAndinterpWeights.size(); 26 | 27 | subModels = new SingleLanguageModel[numModels]; 28 | interpWeights = new double[numModels]; 29 | 30 | double totalInterpWeight = 0.0; 31 | for (int i = 0; i < numModels; ++i) { 32 | Tuple2 modelAndWeight 
= subModelsAndinterpWeights.get(i); 33 | subModels[i] = modelAndWeight._1; 34 | interpWeights[i] = modelAndWeight._2; 35 | totalInterpWeight += interpWeights[i]; 36 | 37 | if (charIndexer == null) { 38 | charIndexer = subModels[i].getCharacterIndexer(); 39 | activeCharacters = subModels[i].getActiveCharacters(); 40 | int thisMaxOrder = subModels[i].getMaxOrder(); 41 | if (thisMaxOrder > maxOrder) 42 | maxOrder = thisMaxOrder; 43 | } else if (charIndexer != subModels[i].getCharacterIndexer()) { 44 | throw new RuntimeException("Sub-models don't all use the same character indexer"); 45 | } else if (activeCharacters != subModels[i].getActiveCharacters()) { 46 | throw new RuntimeException("Sub-models don't all use the same active-character set"); 47 | } 48 | } 49 | for (int i = 0; i < numModels; ++i) { 50 | interpWeights[i] /= totalInterpWeight; 51 | } 52 | } 53 | 54 | @Override 55 | public double getCharNgramProb(int[] context, int c) { 56 | double probSum = 0.0; 57 | for (int i = 0; i < numModels; ++i) { 58 | int[] shrunkenContext = subModels[i].shrinkContext(context); // context may be different for different submodels 59 | probSum += subModels[i].getCharNgramProb(shrunkenContext, c) * interpWeights[i]; 60 | } 61 | return probSum; 62 | } 63 | 64 | @Override 65 | public Indexer getCharacterIndexer() { 66 | return charIndexer; 67 | } 68 | 69 | @Override 70 | public Set getActiveCharacters() { 71 | return activeCharacters; 72 | } 73 | 74 | @Override 75 | public int getMaxOrder() { 76 | return maxOrder; 77 | } 78 | 79 | @Override 80 | public int[] shrinkContext(int[] originalContext) { 81 | int[] newContext = originalContext; 82 | while (!containsContext(newContext) && newContext.length > 0) { 83 | newContext = ArrayHelper.takeRight(newContext, newContext.length - 1); 84 | } 85 | return newContext; 86 | } 87 | 88 | @Override 89 | public boolean containsContext(int[] context) { 90 | for (SingleLanguageModel slm : subModels) { 91 | if (slm.containsContext(context)) { 92 | 
return true; 93 | } 94 | } 95 | return false; 96 | } 97 | 98 | public SingleLanguageModel getSubModel(int i) { 99 | return subModels[i]; 100 | } 101 | 102 | } 103 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/LanguageModel.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | import tberg.murphy.indexer.Indexer; 4 | 5 | /** 6 | * @author Dan Garrette (dhgarrette@gmail.com) 7 | */ 8 | public interface LanguageModel { 9 | 10 | public double getCharNgramProb(int[] context, int c); 11 | 12 | public Indexer getCharacterIndexer(); 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/LongArrWrapper.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | import java.io.Serializable; 4 | import java.util.Arrays; 5 | 6 | /** 7 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 8 | */ 9 | public class LongArrWrapper implements Serializable { 10 | private static final long serialVersionUID = 5942433644698840887L; 11 | public final long[] arr; 12 | 13 | public LongArrWrapper(long[] arr) { 14 | this.arr = arr; 15 | } 16 | 17 | @Override 18 | public boolean equals(Object other) { 19 | if (other == null || !(other instanceof LongArrWrapper)) { 20 | return false; 21 | } 22 | return Arrays.equals(this.arr, ((LongArrWrapper)other).arr); 23 | } 24 | 25 | @Override 26 | public int hashCode() { 27 | return Arrays.hashCode(this.arr); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/LongNgram.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | import 
tberg.murphy.indexer.Indexer; 4 | 5 | /** 6 | * Contains code for carrying out operations on trigrams encoded as longs. 7 | * Can be instantiated, but also has static methods so that the code can be 8 | * used without creating the object. 9 | * 10 | * Indices are packed into a long using BITS_PER_WORD bits per index, 11 | * up to MAX_ORDER indices. BITS_PER_WORD * MAX_ORDER must be <= 64 (use = at your own risk...) 12 | * When indices are in the long, 1 is added to each of them so that lower-order 13 | * n-grams (with 0s) can be differentiated from n-grams with the first character in the indexer 14 | * in them. 15 | * 16 | * @author Greg Durrett (gdurrett@cs.berkeley.edu) 17 | */ 18 | public class LongNgram { 19 | 20 | // 128 characters should be enough, this lets us do a 9-gram 21 | public static final int BITS_PER_WORD = 7; 22 | public static final int MAX_ORDER = 9; 23 | 24 | public static long[] convertToLong(int[] ngram) { 25 | return convertToLong(ngram, 0, ngram.length); 26 | } 27 | 28 | public static long[] convertToLong(int[] ngram, int start, int end) { 29 | // Add MAX_ORDER-1 to round up 30 | int numLongs = (end - start + MAX_ORDER-1)/MAX_ORDER; 31 | long[] longs = new long[numLongs]; 32 | int longIdx = numLongs - 1; 33 | for (int i = end; i > start; i -= MAX_ORDER) { 34 | longs[longIdx] = Ngram.convertToLong(ngram, Math.max(start, i - MAX_ORDER), i); 35 | longIdx--; 36 | } 37 | return longs; 38 | } 39 | 40 | public static int[] convertToIntArr(long[] ngram) { 41 | int[] arr = new int[LongNgram.getActualOrder(ngram)]; 42 | int ngramIdx = arr.length - 1; 43 | for (int longIdx = ngram.length - 1; longIdx >= 0; longIdx--) { 44 | int[] curr = Ngram.convertToIntArr(ngram[longIdx]); 45 | for (int i = curr.length - 1; i >= 0; i--) { 46 | arr[ngramIdx] = curr[i]; 47 | ngramIdx--; 48 | } 49 | } 50 | return arr; 51 | } 52 | 53 | // TODO: I think these methods work but they don't do clipping to arbitrary orders, 54 | // and I think it's easier to just 55 | // 
public static long[] getLowerOrder(long[] ngram) { 56 | // return LongNgram.getLowerOrder(ngram, LongNgram.getActualOrder(ngram)); 57 | // } 58 | // 59 | // public static long[] getLowerOrder(long[] ngram, int order) { 60 | // if (order % MAX_ORDER == 1) { 61 | // long[] newNgram = new long[ngram.length-1]; 62 | // System.arraycopy(ngram, 1, newNgram, 0, ngram.length-1); 63 | // return newNgram; 64 | // } else { 65 | // long[] newNgram = new long[ngram.length]; 66 | // System.arraycopy(ngram, 0, newNgram, 0, ngram.length); 67 | // newNgram[0] = Ngram.getLowerOrder(ngram[0]); 68 | // return newNgram; 69 | // } 70 | // } 71 | // 72 | // public static long[] getHistory(long[] ngram) { 73 | // return LongNgram.getHistory(ngram, LongNgram.getActualOrder(ngram)); 74 | // } 75 | // 76 | // public static long[] getHistory(long[] ngram, int order) { 77 | // long lowOrderMask = (1L << ((long)BITS_PER_WORD)) - 1L; 78 | // long[] newNgram; 79 | // int newNgramIdx; 80 | // long carryOver; 81 | // if (order % MAX_ORDER == 1) { 82 | // newNgram = new long[ngram.length-1]; 83 | // newNgramIdx = 0; 84 | // carryOver = ngram[0]; 85 | // } else { 86 | // newNgram = new long[ngram.length]; 87 | // newNgramIdx = 1; 88 | // carryOver = ngram[0] & lowOrderMask; 89 | // newNgram[0] = ngram[0] >>> BITS_PER_WORD; 90 | // } 91 | // for (int i = 1; i < ngram.length; i++) { 92 | // newNgram[newNgramIdx] = ngram[i] >>> BITS_PER_WORD + carryOver << (BITS_PER_WORD * (MAX_ORDER - 1)); 93 | // newNgramIdx++; 94 | // carryOver = ngram[i] & lowOrderMask; 95 | // } 96 | // return newNgram; 97 | // } 98 | // 99 | // public static long[] getLowerOrderHistory(long[] ngram) { 100 | // return LongNgram.getLowerOrderHistory(ngram, LongNgram.getActualOrder(ngram)); 101 | // } 102 | // 103 | // public static long[] getLowerOrderHistory(long[] ngram, int order) { 104 | // return LongNgram.getLowerOrder(LongNgram.getHistory(ngram, order), order - 1); 105 | // } 106 | 107 | public static int 
getActualOrder(long[] ngram) { 108 | if (ngram.length == 0) { 109 | return 0; 110 | } else { 111 | return (ngram.length - 1) * MAX_ORDER + Ngram.getActualOrder(ngram[0]); 112 | } 113 | } 114 | 115 | public static String toString(int[] ngram, Indexer indexer) { 116 | return LongNgram.toString(LongNgram.convertToLong(ngram), indexer); 117 | } 118 | 119 | public static String toString(long[] ngram, Indexer indexer) { 120 | int order = LongNgram.getActualOrder(ngram); 121 | String ngramStr = ""; 122 | for (int i = 0; i < ngram.length; i++) { 123 | ngramStr += Ngram.getNgramStr(ngram[i], indexer); 124 | } 125 | return "[" + order + ":" + ngramStr + "]"; 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/Ngram.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | import tberg.murphy.indexer.Indexer; 4 | 5 | /** 6 | * Contains code for carrying out operations on trigrams encoded as longs. 7 | * Can be instantiated, but also has static methods so that the code can be 8 | * used without creating the object. 9 | * 10 | * Indices are packed into a long using BITS_PER_WORD bits per index, 11 | * up to MAX_ORDER indices. BITS_PER_WORD * MAX_ORDER must be <= 64 (use = at your own risk...) 12 | * When indices are in the long, 1 is added to each of them so that lower-order 13 | * n-grams (with 0s) can be differentiated from n-grams with the first character in the indexer 14 | * in them. 
15 | * 16 | * @author Greg Durrett (gdurrett@cs.berkeley.edu) 17 | */ 18 | public class Ngram { 19 | 20 | // 128 characters should be enough, this lets us do a 9-gram 21 | public static final int BITS_PER_WORD = 7; 22 | public static final int MAX_ORDER = 9; 23 | public static final int[] CONVERTER = new int[MAX_ORDER]; 24 | 25 | private static int encodeWord(int rawWord) { 26 | return rawWord + 1; 27 | } 28 | 29 | private static int decodeWord(int encodedWord) { 30 | return encodedWord - 1; 31 | } 32 | 33 | public static long convertToLong(int[] ngram) { 34 | return convertToLong(ngram, 0, ngram.length); 35 | } 36 | 37 | public static long convertToLong(int[] ngram, int start, int end) { 38 | long l = 0; 39 | for (int i = start; i < end; i++) 40 | l = (l << BITS_PER_WORD) + encodeWord(ngram[i]); 41 | return l; 42 | } 43 | 44 | public static int[] convertToIntArr(long ngram) { 45 | // assert Ngram.getActualOrder(ngram) == MAX_ORDER : "Ngram of less than max order: " 46 | // + Ngram.toString(ngram) + ", order: " + Ngram.getActualOrder(ngram); 47 | int[] arr = new int[Ngram.getActualOrder(ngram)]; 48 | int i = 0; 49 | long wordMask = (1L << BITS_PER_WORD) - 1; 50 | while (ngram != 0) { 51 | arr[arr.length - 1 - i] = decodeWord((int) (ngram & wordMask)); 52 | i++; 53 | ngram = Ngram.getHistory(ngram); 54 | } 55 | return arr; 56 | } 57 | 58 | public static long getLowerOrder(long ngram) { 59 | return Ngram.getLowerOrder(ngram, Ngram.getActualOrder(ngram)); 60 | } 61 | 62 | public static long getLowerOrder(long ngram, int order) { 63 | long mask = (1L << ((order - 1) * BITS_PER_WORD)) - 1L; 64 | return mask & ngram; 65 | } 66 | 67 | public static long getHistory(long ngram) { 68 | return Ngram.getHistory(ngram, Ngram.getActualOrder(ngram)); 69 | } 70 | 71 | public static long getHistory(long ngram, int order) { 72 | long mask = ((1L << (((long) order - 1) * BITS_PER_WORD)) - 1L) << BITS_PER_WORD; 73 | return (mask & ngram) >> BITS_PER_WORD; 74 | } 75 | 76 | public 
static long getLowerOrderHistory(long ngram) { 77 | return Ngram.getLowerOrderHistory(ngram, Ngram.getActualOrder(ngram)); 78 | } 79 | 80 | public static long getLowerOrderHistory(long ngram, int order) { 81 | return Ngram.getLowerOrder(Ngram.getHistory(ngram, order), order - 1); 82 | } 83 | 84 | // public static long addWordAndShift(long ngram, int word) { 85 | // long mask = (1L << (((long) MAX_ORDER - 1) * BITS_PER_WORD)) - 1L << BITS_PER_WORD; 86 | // return ((ngram << BITS_PER_WORD) & mask) + encodeWord(word); 87 | // } 88 | 89 | public static int getActualOrder(long ngram) { 90 | for (int i = MAX_ORDER - 1; i >= 0; i--) { 91 | long mask = (1L << (((long) i) * BITS_PER_WORD)) - 1L; 92 | if ((ngram & mask) != ngram) 93 | return i + 1; 94 | } 95 | return 0; 96 | } 97 | 98 | public static String toString(int[] ngram, Indexer indexer) { 99 | return Ngram.toString(Ngram.convertToLong(ngram), indexer); 100 | } 101 | 102 | public static String toString(long ngram, Indexer indexer) { 103 | return "[" + Ngram.getActualOrder(ngram) + ":" + getNgramStr(ngram, indexer) + "]"; 104 | } 105 | 106 | public static String getNgramStr(long ngram, Indexer indexer) { 107 | String string = ""; 108 | int order = Ngram.getActualOrder(ngram); 109 | for (int i = 0; i < order; i++) { 110 | long mask = (1L << BITS_PER_WORD) - 1L; 111 | string = indexer.getObject(decodeWord((int) (ngram & mask))) + string; 112 | ngram = ngram >> BITS_PER_WORD; 113 | } 114 | return string; 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/NgramWrapper.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | /** 4 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 5 | */ 6 | public class NgramWrapper { 7 | 8 | public int[] ngram; 9 | public int start; 10 | public int end; 11 | 12 | private NgramWrapper() { 13 | 
this.ngram = null; 14 | this.start = -1; 15 | this.end = -1; 16 | } 17 | 18 | public static NgramWrapper getNew(int[] ngram, int start, int end) { 19 | NgramWrapper next = new NgramWrapper(); 20 | next.changeNgramWrapper(ngram, start, end); 21 | return next; 22 | } 23 | 24 | private void changeNgramWrapper(int[] ngram, int start, int end) { 25 | this.ngram = ngram; 26 | this.start = start; 27 | this.end = end; 28 | } 29 | 30 | public int getOrder() { 31 | return end - start; 32 | } 33 | 34 | public NgramWrapper getLowerOrder() { 35 | return getNew(ngram, start + 1, end); 36 | } 37 | 38 | public NgramWrapper getLowerOrder(int order) { 39 | return getNew(ngram, end - order, end); 40 | } 41 | 42 | public NgramWrapper getHistory() { 43 | return getNew(ngram, start, end - 1); 44 | } 45 | 46 | public long getLongRep() { 47 | return Ngram.convertToLong(ngram, start, end); 48 | } 49 | 50 | public long[] getLongerRep() { 51 | return LongNgram.convertToLong(ngram, start, end); 52 | } 53 | 54 | public String toString() { 55 | String str = "["; 56 | for (int i = start; i < end; i++) { 57 | str += ngram[i] + ", "; 58 | } 59 | if (str.length() == 1) { 60 | return str + "]"; 61 | } else { 62 | return str.substring(0, str.length() - 2) + "]"; 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/SingleLanguageModel.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | import java.io.Serializable; 4 | import java.util.Set; 5 | 6 | import edu.berkeley.cs.nlp.ocular.lm.LanguageModel; 7 | 8 | /** 9 | * @author Dan Garrette (dhgarrette@gmail.com) 10 | */ 11 | public interface SingleLanguageModel extends LanguageModel, Serializable { 12 | 13 | public Set getActiveCharacters(); 14 | public int getMaxOrder(); 15 | public int[] shrinkContext(int[] originalContext); 16 | public boolean containsContext(int[] 
package edu.berkeley.cs.nlp.ocular.lm;

import tberg.murphy.indexer.Indexer;

import java.util.Set;

import edu.berkeley.cs.nlp.ocular.lm.SingleLanguageModel;

/**
 * A language model that assigns equal probability to every "active" character
 * and zero probability to all others, ignoring the context entirely.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class UniformLanguageModel implements SingleLanguageModel {
	private static final long serialVersionUID = 398523984923L;

	final private Set<Integer> activeCharacters; // indices (into charIndexer) of characters with non-zero probability
	final private Indexer<String> charIndexer;
	final private int maxOrder;
	final private boolean[] isActive; // fast membership test, indexed by character index
	final private double prob; // uniform probability mass given to each active character

	public UniformLanguageModel(Set<Integer> activeCharacters, Indexer<String> charIndexer, int maxOrder) {
		this.activeCharacters = activeCharacters;
		this.charIndexer = charIndexer;
		this.maxOrder = maxOrder;

		isActive = new boolean[charIndexer.size()];
		for (int c : activeCharacters) {
			isActive[c] = true;
		}
		// NOTE(review): if activeCharacters is empty this is a division by zero
		// (prob becomes Infinity); presumably callers always pass a non-empty
		// set — confirm.
		this.prob = 1.0 / activeCharacters.size();
	}

	public Set<Integer> getActiveCharacters() {
		return activeCharacters;
	}

	// Context is irrelevant to a uniform model, so it is returned unchanged.
	public int[] shrinkContext(int[] context) {
		return context;
	}

	// Every context is "known", since the distribution ignores it.
	public boolean containsContext(int[] context) {
		return true;
	}

	// Uniform over active characters; zero for inactive ones.
	public double getCharNgramProb(int[] context, int c) {
		if (isActive[c])
			return prob;
		else
			return 0.0;
	}

	public Indexer<String> getCharacterIndexer() {
		return charIndexer;
	}

	public int getMaxOrder() {
		return maxOrder;
	}

}
package edu.berkeley.cs.nlp.ocular.main;

import java.util.List;

import edu.berkeley.cs.nlp.ocular.data.Document;
import edu.berkeley.cs.nlp.ocular.data.LazyRawImageLoader;

/**
 * Entry point that performs line extraction only: loads the input page images
 * and extracts their line images (written to -extractedLinesPath), without
 * doing any transcription or training.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class ExtractLinesOnly extends LineExtractionOptions {

	public static void main(String[] args) {
		System.out.println("ExtractLinesOnly");
		ExtractLinesOnly main = new ExtractLinesOnly();
		main.doMain(main, args);
	}

	protected void validateOptions() {
		super.validateOptions();
		// Unlike other runnables, the extraction output location is mandatory
		// here — extracted lines are this program's only product.
		if (extractedLinesPath == null) throw new IllegalArgumentException("-extractedLinesPath is required.");
	}

	public void run(List<String> commandLineArgs) {
		List<String> inputDocPathList = getInputDocPathList();
		List<Document> inputDocuments = LazyRawImageLoader.loadDocuments(inputDocPathList, extractedLinesPath, numDocs, numDocsToSkip, uniformLineHeight, binarizeThreshold, crop);
		if (inputDocuments.isEmpty()) throw new NoDocumentsFoundException();
		// Forcing each lazy document to materialize its line images is what
		// actually triggers the extraction (and writing) work.
		for (Document doc : inputDocuments) {
			doc.loadLineImages();
		}
	}

}
package edu.berkeley.cs.nlp.ocular.main;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.List;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import edu.berkeley.cs.nlp.ocular.gsm.BasicGlyphSubstitutionModel.BasicGlyphSubstitutionModelFactory;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphSubstitutionModel;
import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel;
import tberg.murphy.fig.Option;
import tberg.murphy.indexer.Indexer;

/**
 * Entry point that initializes a uniform glyph substitution model (GSM) from a
 * trained language model and serializes it (gzipped) to disk.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class InitializeGlyphSubstitutionModel extends OcularRunnable {

	@Option(gloss = "Path to the language model file (so that it knows which characters to create images for).")
	public static String inputLmPath = null; // Required.

	// Fixed gloss: this option is the GSM output path, not a font path.
	@Option(gloss = "Output glyph substitution model file path.")
	public static String outputGsmPath = null; // Required.

	@Option(gloss = "The default number of counts that every glyph gets in order to smooth the glyph substitution model estimation.")
	public static double gsmSmoothingCount = 1.0;

	@Option(gloss = "gsmElisionSmoothingCountMultiplier.")
	public static double gsmElisionSmoothingCountMultiplier = 100.0;

	@Option(gloss = "Exponent on GSM scores.")
	public static double gsmPower = 4.0;

	public static void main(String[] args) {
		System.out.println("InitializeGlyphSubstitutionModel");
		InitializeGlyphSubstitutionModel main = new InitializeGlyphSubstitutionModel();
		main.doMain(main, args);
	}

	protected void validateOptions() {
		if (inputLmPath == null) throw new IllegalArgumentException("-inputLmPath not set");
		if (outputGsmPath == null) throw new IllegalArgumentException("-outputGsmPath not set");
	}

	public void run(List<String> commandLineArgs) {
		final CodeSwitchLanguageModel lm = InitializeLanguageModel.readCodeSwitchLM(inputLmPath);
		final Indexer<String> charIndexer = lm.getCharacterIndexer();
		final Indexer<String> langIndexer = lm.getLanguageIndexer();
		Set<String>[] activeCharacterSets = FonttrainTranscribeShared.makeActiveCharacterSets(lm);

		// These factory arguments are unused when merely building a uniform model.
		int minCountsForEvalGsm = 0;
		String outputPath = null;

		BasicGlyphSubstitutionModelFactory factory = new BasicGlyphSubstitutionModelFactory(
				gsmSmoothingCount, gsmElisionSmoothingCountMultiplier,
				langIndexer, charIndexer,
				activeCharacterSets, gsmPower, minCountsForEvalGsm, outputPath);

		System.out.println("Initializing a uniform GSM.");
		GlyphSubstitutionModel gsm = factory.uniform();

		// Typo fix: "intialized" -> "initialized" in the status message.
		System.out.println("Writing initialized gsm to " + outputGsmPath);
		writeGSM(gsm, outputGsmPath);
	}

	/**
	 * Deserializes a gzipped GlyphSubstitutionModel from gsmPath.
	 * @throws RuntimeException wrapping any I/O or deserialization failure,
	 *         or if the file does not exist.
	 */
	public static GlyphSubstitutionModel readGSM(String gsmPath) {
		ObjectInputStream in = null;
		try {
			File file = new File(gsmPath);
			if (!file.exists()) {
				throw new RuntimeException("Serialized GlyphSubstitutionModel file " + gsmPath + " not found");
			}
			in = new ObjectInputStream(new GZIPInputStream(new FileInputStream(file)));
			return (GlyphSubstitutionModel) in.readObject();
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			if (in != null)
				try { in.close(); } catch (IOException e) { throw new RuntimeException(e); }
		}
	}

	/**
	 * Serializes the model, gzipped, to gsmPath, creating parent directories
	 * as needed.
	 */
	public static void writeGSM(GlyphSubstitutionModel gsm, String gsmPath) {
		ObjectOutputStream out = null;
		try {
			new File(gsmPath).getAbsoluteFile().getParentFile().mkdirs();
			out = new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(gsmPath)));
			out.writeObject(gsm);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			if (out != null)
				try { out.close(); } catch (IOException e) { throw new RuntimeException(e); }
		}
	}

}
package edu.berkeley.cs.nlp.ocular.main;

import java.io.File;
import java.util.Arrays;
import java.util.List;

import tberg.murphy.fig.Option;
import tberg.murphy.fileio.f;

/**
 * Shared command-line options for all runnables that read document images and
 * extract text lines from them.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public abstract class LineExtractionOptions extends OcularRunnable {

	// Main Options

	@Option(gloss = "Path to the directory that contains the input document images. The entire directory will be searched recursively for any files that do not end in `.txt` (and that do not start with `.`). Files will be processed in lexicographical order.")
	public static String inputDocPath = null; // Either inputDocPath or inputDocListPath is required.

	@Option(gloss = "Path to a file that contains a list of paths to images files that should be used. The file should contain one path per line. These paths will be searched in order. Each path may point to either a file or a directory, which will be searched recursively for any files that do not end in `.txt` (and that do not start with `.`). Paths will be processed in the order given in the file, and each path will be searched in lexicographical order.")
	public static String inputDocListPath = null; // Either inputDocPath or inputDocListPath is required.

	@Option(gloss = "Number of documents (pages) to use, counting alphabetically. Ignore or use 0 to use all documents. Default: Use all documents.")
	public static int numDocs = Integer.MAX_VALUE;

	@Option(gloss = "Number of training documents (pages) to skip over, counting alphabetically. Useful, in combination with -numDocs, if you want to break a directory of documents into several chunks.")
	public static int numDocsToSkip = 0;

	@Option(gloss = "Path of the directory where the line-extraction images should be read/written. If the line files exist here, they will be used; if not, they will be extracted and then written here. Useful if: 1) you plan to run Ocular on the same documents multiple times and you want to save some time by not re-extracting the lines, or 2) you use an alternate line extractor (such as Tesseract) to pre-process the document. If ignored, the document will simply be read from the original document image file, and no line images will be written.")
	public static String extractedLinesPath = null; // Don't read or write line image files.

	// Line Extraction Options

	@Option(gloss = "Quantile to use for pixel value thresholding. (High values mean more black pixels.)")
	public static double binarizeThreshold = 0.12;

	@Option(gloss = "Crop pages?")
	public static boolean crop = true;

	@Option(gloss = "Scale all lines to have the same height?")
	public static boolean uniformLineHeight = true;



	protected void validateOptions() {
		// Exactly one of the two input-path options must be given.
		if ((inputDocPath == null) == (inputDocListPath == null)) throw new IllegalArgumentException("Either -inputDocPath or -inputDocListPath is required.");
		if (inputDocPath != null)
			for (String path : inputDocPath.split("[\\s,;:]+"))
				if (!new File(path).exists()) throw new IllegalArgumentException("inputDocPath "+path+" does not exist [looking in "+(new File(".").getAbsolutePath())+"]");
		if (inputDocListPath != null && !new File(inputDocListPath).exists()) throw new IllegalArgumentException("-inputDocListPath "+inputDocListPath+" does not exist [looking in "+(new File(".").getAbsolutePath())+"]");
		if (numDocsToSkip < 0) throw new IllegalArgumentException("-numDocsToSkip must be >= 0. Was "+numDocsToSkip+".");
	}

	/**
	 * The document paths to process: either -inputDocPath split on delimiters,
	 * or the lines of the -inputDocListPath file.
	 *
	 * BUG FIX: previously split on "[\\s+,;:]" — that puts the `+` *inside* the
	 * character class (matching a literal '+') and does not collapse delimiter
	 * runs, so e.g. "a, b" produced an empty path entry. Now uses the same
	 * regex that validateOptions() validates against: "[\\s,;:]+".
	 */
	protected static List<String> getInputDocPathList() {
		return inputDocPath != null ? Arrays.asList(inputDocPath.split("[\\s,;:]+")) : f.readLines(inputDocListPath);
	}


}
package edu.berkeley.cs.nlp.ocular.main;

import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.List;

import tberg.murphy.fig.OptionsParser;

/**
 * Base class for Ocular's command-line entry points: parses and validates
 * options, runs the subclass's job, and reports wall-clock timing.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public abstract class OcularRunnable {

	// NOTE(review): SimpleDateFormat is not thread-safe; appears fine here since
	// formatting only happens from doMain's thread — confirm if threading is added.
	private SimpleDateFormat sdf = new SimpleDateFormat("MM/dd/yyyy HH:mm:ss");

	/** Parse args into static @Option fields, validate, run the job, and print start/end times. */
	final protected void doMain(OcularRunnable main, String[] args) {
		System.out.println(toArgListString(args));
		long startTime = System.currentTimeMillis();
		printStartTime(startTime);
		OptionsParser parser = new OptionsParser();
		parser.doRegisterAll(new Object[] { main });
		if (!parser.doParse(args)) System.exit(1);
		main.validateOptions();
		main.run(Arrays.asList(args));
		long endTime = System.currentTimeMillis();
		printEndTime(startTime, endTime);
	}

	abstract protected void run(List<String> commandLineArgs);

	/** Subclasses throw IllegalArgumentException here for bad or missing options. */
	abstract protected void validateOptions();

	private static String toArgListString(String[] args) {
		// StringBuilder instead of StringBuffer: no synchronization is needed
		// for this single-threaded, method-local buffer.
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < args.length; ++i) {
			sb.append(" " + args[i]);
			if (i % 2 != 0) // args come in "-flag value" pairs; newline after each pair
				sb.append("\n");
		}
		return sb.toString();
	}

	private void printEndTime(long startTime, long endTime) {
		System.out.println("\n"+ formatElapsedTime(endTime - startTime) + " elapsed. Completed at "+sdf.format(new Date(endTime)));
	}

	private void printStartTime(long startTime) {
		System.out.println("Started job at "+sdf.format(new Date(startTime))+"\n");
	}

	/**
	 * Format a millisecond duration as HH:MM:SS. (Renamed from the misleading
	 * convertSecondsToAmountOfTimeString — the argument is milliseconds.)
	 */
	private String formatElapsedTime(long millis) {
		long seconds = millis / 1000;
		long s = seconds % 60;
		long m = (seconds / 60) % 60;
		long h = (seconds / (60 * 60));
		return String.format("%02d:%02d:%02d", h,m,s);
	}

}
package edu.berkeley.cs.nlp.ocular.main;

import java.util.List;
import java.util.Set;

import edu.berkeley.cs.nlp.ocular.data.Document;
import edu.berkeley.cs.nlp.ocular.data.LazyRawImageLoader;
import edu.berkeley.cs.nlp.ocular.eval.BasicSingleDocumentEvaluatorAndOutputPrinter;
import edu.berkeley.cs.nlp.ocular.eval.MultiDocumentTranscriber;
import edu.berkeley.cs.nlp.ocular.eval.SingleDocumentEvaluatorAndOutputPrinter;
import edu.berkeley.cs.nlp.ocular.font.Font;
import edu.berkeley.cs.nlp.ocular.gsm.BasicGlyphSubstitutionModel.BasicGlyphSubstitutionModelFactory;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphSubstitutionModel;
import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel;
import edu.berkeley.cs.nlp.ocular.model.DecoderEM;
import edu.berkeley.cs.nlp.ocular.train.FontTrainer;
import edu.berkeley.cs.nlp.ocular.train.TrainingRestarter;
import edu.berkeley.cs.nlp.ocular.util.FileUtil;
import tberg.murphy.fig.Option;
import tberg.murphy.indexer.Indexer;

/**
 * Entry point for font training: runs EM over the input documents to learn a
 * font (and optionally updated LM/GSM), writing models to the output paths.
 *
 * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu)
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class TrainFont extends FonttrainTranscribeShared {

	@Option(gloss = "Number of iterations of EM to use for font learning.")
	public static int numEMIters = 3;

	@Option(gloss = "If true, the font trainer will find the latest completed iteration in the outputPath and load it in order to pick up training from that point. Convenient if a training run crashes when only partially completed.")
	public static boolean continueFromLastCompleteIteration = false;

	@Option(gloss = "When using -evalInputDocPath, the font trainer will perform an evaluation every `evalFreq` iterations. Default: Evaluate only after all iterations have completed.")
	public static int evalFreq = Integer.MAX_VALUE;


	public static void main(String[] args) {
		System.out.println("TrainFont");
		TrainFont main = new TrainFont();
		main.doMain(main, args);
	}

	protected void validateOptions() {
		super.validateOptions();

		// BUG FIX: the exception was previously constructed but never thrown
		// ("new IllegalArgumentException(...)" with no "throw"), so a
		// non-positive -numEMIters was silently accepted.
		if (numEMIters <= 0) throw new IllegalArgumentException("-numEMIters must be a positive number.");

		if (outputFontPath == null) throw new IllegalArgumentException("-outputFontPath is required for font training.");
	}

	public void run(List<String> commandLineArgs) {
		Set outputFormats = parseOutputFormats();

		CodeSwitchLanguageModel initialLM = loadInputLM();
		Font initialFont = loadInputFont();
		BasicGlyphSubstitutionModelFactory gsmFactory = makeGsmFactory(initialLM);
		GlyphSubstitutionModel initialGSM = loadInitialGSM(gsmFactory);

		Indexer<String> charIndexer = initialLM.getCharacterIndexer();
		Indexer<String> langIndexer = initialLM.getLanguageIndexer();

		DecoderEM decoderEM = makeDecoder(charIndexer);

		boolean evalCharIncludesDiacritic = true;
		SingleDocumentEvaluatorAndOutputPrinter documentOutputPrinterAndEvaluator = new BasicSingleDocumentEvaluatorAndOutputPrinter(charIndexer, langIndexer, allowGlyphSubstitution, evalCharIncludesDiacritic, commandLineArgs);

		List<String> inputDocPathList = getInputDocPathList();
		List<Document> inputDocuments = LazyRawImageLoader.loadDocuments(inputDocPathList, extractedLinesPath, numDocs, numDocsToSkip, uniformLineHeight, binarizeThreshold, crop);
		if (inputDocuments.isEmpty()) throw new NoDocumentsFoundException();
		if (updateDocBatchSize > 0 && inputDocuments.size() < updateDocBatchSize) throw new RuntimeException("The number of available documents is less than -updateDocBatchSize!");

		String newInputDocPath = FileUtil.lowestCommonPath(inputDocPathList);

		MultiDocumentTranscriber evalSetEvaluator = makeEvalSetEvaluator(charIndexer, decoderEM, documentOutputPrinterAndEvaluator);
		new FontTrainer().trainFont(
				inputDocuments,
				initialFont, initialLM, initialGSM,
				continueFromLastCompleteIteration ? new TrainingRestarter() : null,
				outputFontPath, outputLmPath, outputGsmPath,
				decoderEM,
				gsmFactory, documentOutputPrinterAndEvaluator,
				// Batch size 0 (or negative) means "use all documents per update".
				numEMIters, updateDocBatchSize > 0 ? updateDocBatchSize : inputDocuments.size(), false, true,
				numMstepThreads,
				newInputDocPath, outputPath, outputFormats,
				evalSetEvaluator, evalFreq, evalBatches,
				skipFailedDocs);
	}

}
/*
 * FROM:
 * http://core0.staticworld.net/downloads/idge/imported/article/jvw/2006/10/gridlayout2.java
 *
 */

package edu.berkeley.cs.nlp.ocular.main.gui;

import java.awt.*;

/**
 * Grid Layout which allows components of different sizes: each column is as
 * wide as its widest component and each row as tall as its tallest, and the
 * whole grid is scaled to fill the parent.
 *
 * @author
 */
public class GridLayout2 extends GridLayout {
	private static final long serialVersionUID = 1L;

	public GridLayout2() {
		this(1, 0, 0, 0);
	}

	public GridLayout2(int rows, int cols) {
		this(rows, cols, 0, 0);
	}

	public GridLayout2(int rows, int cols, int hgap, int vgap) {
		super(rows, cols, hgap, vgap);
	}

	public Dimension preferredLayoutSize(Container parent) {
		return layoutSize(parent, true);
	}

	public Dimension minimumLayoutSize(Container parent) {
		// BUG FIX: this method used to unconditionally System.err.println its
		// name (a stray debug print; the analogous prints in the other two
		// methods were commented out). Removed.
		return layoutSize(parent, false);
	}

	/**
	 * Shared implementation of preferred/minimumLayoutSize (the two methods
	 * were previously duplicated line-for-line except for the size getter).
	 * Column widths are the max (preferred|minimum) width in each column; row
	 * heights likewise.
	 */
	private Dimension layoutSize(Container parent, boolean preferred) {
		synchronized (parent.getTreeLock()) {
			Insets insets = parent.getInsets();
			int ncomponents = parent.getComponentCount();
			int nrows = getRows();
			int ncols = getColumns();
			// One of rows/cols is fixed by the constructor; derive the other
			// from the component count (rounding up).
			if (nrows > 0) {
				ncols = (ncomponents + nrows - 1) / nrows;
			}
			else {
				nrows = (ncomponents + ncols - 1) / ncols;
			}
			int[] w = new int[ncols];
			int[] h = new int[nrows];
			for (int i = 0; i < ncomponents; i++) {
				int r = i / ncols;
				int c = i % ncols;
				Component comp = parent.getComponent(i);
				Dimension d = preferred ? comp.getPreferredSize() : comp.getMinimumSize();
				if (w[c] < d.width) {
					w[c] = d.width;
				}
				if (h[r] < d.height) {
					h[r] = d.height;
				}
			}
			int nw = 0;
			for (int j = 0; j < ncols; j++) {
				nw += w[j];
			}
			int nh = 0;
			for (int i = 0; i < nrows; i++) {
				nh += h[i];
			}
			return new Dimension(insets.left + insets.right + nw + (ncols - 1) * getHgap(),
					insets.top + insets.bottom + nh + (nrows - 1) * getVgap());
		}
	}

	public void layoutContainer(Container parent) {
		//System.err.println("layoutContainer");
		synchronized (parent.getTreeLock()) {
			Insets insets = parent.getInsets();
			int ncomponents = parent.getComponentCount();
			int nrows = getRows();
			int ncols = getColumns();
			if (ncomponents == 0) {
				return;
			}
			if (nrows > 0) {
				ncols = (ncomponents + nrows - 1) / nrows;
			}
			else {
				nrows = (ncomponents + ncols - 1) / ncols;
			}
			int hgap = getHgap();
			int vgap = getVgap();
			// scaling factors: stretch/shrink preferred sizes to fill the parent
			Dimension pd = preferredLayoutSize(parent);
			double sw = (1.0 * parent.getWidth()) / pd.width;
			double sh = (1.0 * parent.getHeight()) / pd.height;
			// scale each component's preferred size, then take per-column/row maxes
			int[] w = new int[ncols];
			int[] h = new int[nrows];
			for (int i = 0; i < ncomponents; i++) {
				int r = i / ncols;
				int c = i % ncols;
				Component comp = parent.getComponent(i);
				Dimension d = comp.getPreferredSize();
				d.width = (int) (sw * d.width);
				d.height = (int) (sh * d.height);
				if (w[c] < d.width) {
					w[c] = d.width;
				}
				if (h[r] < d.height) {
					h[r] = d.height;
				}
			}
			// place components column-major within their computed cells
			for (int c = 0, x = insets.left; c < ncols; c++) {
				for (int r = 0, y = insets.top; r < nrows; r++) {
					int i = r * ncols + c;
					if (i < ncomponents) {
						parent.getComponent(i).setBounds(x, y, w[c], h[r]);
					}
					y += h[r] + vgap;
				}
				x += w[c] + hgap;
			}
		}
	}
}
-------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/model/DecodeState.java: --------------------------------------------------------------------------------
package edu.berkeley.cs.nlp.ocular.model;

import edu.berkeley.cs.nlp.ocular.model.transition.SparseTransitionModel.TransitionState;

/**
 * Immutable record pairing one transition state with the rendering
 * measurements chosen for it during decoding.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class DecodeState {
	public final TransitionState ts;
	public final int charAndPadWidth;
	public final int charWidth;
	public final int padWidth;
	public final int exposure;
	public final int verticalOffset;

	public DecodeState(TransitionState ts, int charAndPadWidth, int padWidth, int exposure, int verticalOffset) {
		this.ts = ts;
		this.verticalOffset = verticalOffset;
		this.exposure = exposure;
		this.padWidth = padWidth;
		this.charAndPadWidth = charAndPadWidth;
		// Derived field: the character's own width excludes the padding
		// portion of the combined width.
		this.charWidth = charAndPadWidth - padWidth;
	}

}
-------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/model/TransitionStateType.java: --------------------------------------------------------------------------------
package edu.berkeley.cs.nlp.ocular.model;

/**
 * The kinds of states the transition model distinguishes.
 *
 * Naming suggests TMPL is a character-template state and the others are
 * left/right margin states with hyphenation (HPHN) variants — the constants'
 * semantics are not defined in this file, so that reading should be
 * confirmed against the transition-model code.
 *
 * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu)
 */
public enum TransitionStateType {
	TMPL, LMRGN, LMRGN_HPHN, RMRGN, RMRGN_HPHN_INIT, RMRGN_HPHN
}
-------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/model/em/DefaultInnerLoop.java: --------------------------------------------------------------------------------
package edu.berkeley.cs.nlp.ocular.model.em;

import edu.berkeley.cs.nlp.ocular.model.CharacterTemplate;
import tberg.murphy.gpu.CudaUtil;

/**
 * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu)
 */
// NOTE(review): extraction-garbled span, kept byte-identical. The text of
// DefaultInnerLoop.compute(...) is truncated mid-expression ("for (int i=0; i>")
// and jumps directly into the SparseTransitionModel interface; generic type
// parameters were also stripped throughout this dump. Recover these files from
// the original repository before editing this region.
public class DefaultInnerLoop implements EmissionCacheInnerLoop { 10 | 11 | int numThreads; 12 | float[][] whiteTemplates; 13 | float[][] blackTemplates; 14 | int[] templateNumIndices; 15 | int[] templateIndicesOffsets; 16 | int maxTemplateWidth; 17 | int minTemplateWidth; 18 | 19 | public DefaultInnerLoop(int numThreads) { 20 | this.numThreads = numThreads; 21 | } 22 | 23 | public void startup(float[][] whiteTemplates, float[][] blackTemplates, int[] templateNumIndices, int[] templateIndicesOffsets, int minTemplateWidth, int maxTemplateWidth, int maxSequenceLength, int totalTemplateNumIndices) { 24 | this.whiteTemplates = whiteTemplates; 25 | this.blackTemplates = blackTemplates; 26 | this.templateNumIndices = templateNumIndices; 27 | this.templateIndicesOffsets = templateIndicesOffsets; 28 | this.maxTemplateWidth = maxTemplateWidth; 29 | this.minTemplateWidth = minTemplateWidth; 30 | } 31 | 32 | public void shutdown() { 33 | } 34 | 35 | public void compute(final float[] scores, final float[] whiteObservations, final float[] blackObservations, final int sequenceLength) { 36 | for (int tw=minTemplateWidth; tw<=maxTemplateWidth; ++tw) { 37 | float[] whiteTemplatesForWidth = whiteTemplates[tw-minTemplateWidth]; 38 | float[] blackTemplateForWidth = blackTemplates[tw-minTemplateWidth]; 39 | for (int t=0; t<(sequenceLength-tw)+1; ++t) { 40 | for (int i=0; i> forwardTransitions(); 33 | public Collection> nextLineStartStates(); 34 | public double endLogProb(); 35 | } 36 | 37 | public Collection> startStates(); 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/output/HtmlOutputWriter.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.output; 2 | 3 | import java.io.File; 4 | import java.util.List; 5 | 6 | import edu.berkeley.cs.nlp.ocular.data.textreader.Charset; 7 | import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar; 8 
// NOTE(review): extraction-garbled span, kept byte-identical. The HTML tag
// text inside HtmlOutputWriter's string literals (e.g. the arguments to
// outputBuffer.append("...")) was stripped by the extraction, and generic
// parameters (Indexer<String>, List<DecodeState>[]) are missing. These string
// literals are runtime output and cannot be reconstructed from this dump —
// recover HtmlOutputWriter.java from the repository before editing.
| import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar.GlyphType; 9 | import edu.berkeley.cs.nlp.ocular.model.DecodeState; 10 | import edu.berkeley.cs.nlp.ocular.model.transition.SparseTransitionModel.TransitionState; 11 | import edu.berkeley.cs.nlp.ocular.util.FileUtil; 12 | import tberg.murphy.fileio.f; 13 | import tberg.murphy.indexer.Indexer; 14 | 15 | /** 16 | * @author Dan Garrette (dhgarrette@gmail.com) 17 | */ 18 | public class HtmlOutputWriter { 19 | 20 | private Indexer charIndexer; 21 | private Indexer langIndexer; 22 | 23 | public HtmlOutputWriter(Indexer charIndexer, Indexer langIndexer) { 24 | this.charIndexer = charIndexer; 25 | this.langIndexer = langIndexer; 26 | } 27 | 28 | public void write(int numLines, List[] viterbiTransStates, String imgFilename, String outputFilenameBase) { 29 | String htmlOutputFilename = outputFilenameBase + ".html"; 30 | 31 | StringBuffer outputBuffer = new StringBuffer(); 32 | outputBuffer.append("\n"); 33 | outputBuffer.append("\n"); 34 | outputBuffer.append("\n"); 35 | outputBuffer.append("
 \n"); 36 | 37 | String[] colors = new String[] { "Black", "Red", "Blue", "Olive", "Orange", "Magenta", "Lime", "Cyan", "Purple", "Green", "Brown" }; 38 | 39 | int prevLanguage = -1; 40 | for (int line = 0; line < numLines; ++line) { 41 | for (DecodeState ds : viterbiTransStates[line]) { 42 | TransitionState ts = ds.ts; 43 | int lmChar = ts.getLmCharIndex(); 44 | GlyphChar glyph = ts.getGlyphChar(); 45 | int glyphChar = glyph.templateCharIndex; 46 | String sglyphChar = Charset.unescapeChar(charIndexer.getObject(glyphChar)); 47 | 48 | int currLanguage = ts.getLanguageIndex(); 49 | if (currLanguage != prevLanguage) { 50 | outputBuffer.append(""); 51 | } 52 | 53 | if (lmChar != glyphChar || glyph.glyphType != GlyphType.NORMAL_CHAR) { 54 | String norm = Charset.unescapeChar(charIndexer.getObject(lmChar)); 55 | String dipl = (glyph.glyphType == GlyphType.DOUBLED ? "2x"+sglyphChar : glyph.isElided() ? "" : sglyphChar); 56 | outputBuffer.append("[" + norm + "/" + dipl + "]"); 57 | } 58 | else { 59 | outputBuffer.append(sglyphChar); 60 | } 61 | 62 | prevLanguage = currLanguage; 63 | } 64 | outputBuffer.append("
 \n"); 65 | } 66 | outputBuffer.append("
 
 
 
 \n"); 67 | for (int i = -1; i < langIndexer.size(); ++i) { 68 | outputBuffer.append("" + (i < 0 ? "none" : langIndexer.getObject(i)) + "
 \n"); 69 | } 70 | 71 | outputBuffer.append("
 \n"); 72 | outputBuffer.append("\n"); 73 | outputBuffer.append("
 \n"); 74 | outputBuffer.append("\n"); 75 | outputBuffer.append("\n\n\n"); 76 | outputBuffer.append("\n\n\n\n\n"); 77 | String outputString = outputBuffer.toString(); 78 | 79 | System.out.println("Writing html output to " + htmlOutputFilename); 80 | f.writeString(htmlOutputFilename, outputString); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/preprocessing/Binarizer.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.preprocessing; 2 | 3 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils; 4 | 5 | /** 6 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 7 | */ 8 | public class Binarizer { 9 | 10 | public static boolean isBinary(double[][] levels) { 11 | int[] histogram = new int[(int) ImageUtils.MAX_LEVEL+1]; 12 | for (int i=0; i 0) nonZeroEntries++; 20 | } 21 | return nonZeroEntries <= 2; 22 | } 23 | 24 | public static void binarizeAlreadyBinary(double[][] levels) { 25 | double min = Double.POSITIVE_INFINITY; 26 | double max = Double.NEGATIVE_INFINITY; 27 | for (double[] vals : levels) { 28 | for (double val : vals) { 29 | min = Math.min(val, min); 30 | max = Math.max(val, max); 31 | } 32 | } 33 | double threshold = (max + min) / 2.0; 34 | for (int i = 0; i < levels.length; i++) { 35 | for (int j = 0; j < levels[i].length; j++) { 36 | if (levels[i][j] <= threshold) { 37 | levels[i][j] = 0; 38 | } else { 39 | levels[i][j] = ImageUtils.MAX_LEVEL; 40 | } 41 | } 42 | } 43 | } 44 | 45 | public static void binarizeGlobal(double blackPercential, double[][] levels) { 46 | if (isBinary(levels)) { 47 | binarizeAlreadyBinary(levels); 48 | return; 49 | } 50 | 51 | int[] histogram = new int[(int) ImageUtils.MAX_LEVEL+1]; 52 | int total = 0; 53 | for (int i=0; i= rank) { 66 | threshold = v; 67 | break; 68 | } 69 | } 70 | for (int i = 0; i < levels.length; i++) { 71 | for (int j = 0; j < 
// NOTE(review): kept byte-identical. This span (tail of Binarizer,
// LineExtractor, ManualStackCropperPrep, Straightener, head of Test) has had
// generic type parameters stripped by extraction (e.g. "List result" was
// presumably List<double[][]>, "List> segments" List<Pair<Integer,Integer>>),
// and Straightener.verticalTotalVariation is truncated mid-loop ("for (int
// i=0; i 1) {" jumps into main). Also note the suspicious typo
// "blackPercential" (presumably "blackPercentile") in the public API — renaming
// would break callers, so it is only flagged here. Recover originals from the
// repository before editing.
levels[i].length; j++) { 72 | if (levels[i][j] <= threshold) { 73 | levels[i][j] = 0; 74 | } else { 75 | levels[i][j] = ImageUtils.MAX_LEVEL; 76 | } 77 | } 78 | } 79 | } 80 | 81 | public static void binarizeLocal(double blackPercential, double radiusFactor, double[][] levels) { 82 | if (isBinary(levels)) { 83 | binarizeAlreadyBinary(levels); 84 | return; 85 | } 86 | 87 | int radius = (int) (levels.length * radiusFactor); 88 | 89 | int dWidth = (int) Math.ceil((double) levels.length / radius); 90 | int dHeight = (int) Math.ceil((double) levels[0].length / radius); 91 | double[][] thresholds = new double[dWidth][dHeight]; 92 | for (int di=0; di= rank) { 111 | threshold = v; 112 | break; 113 | } 114 | } 115 | thresholds[di][dj] = threshold; 116 | } 117 | } 118 | } 119 | 120 | 121 | for (int i = 0; i < levels.length; i++) { 122 | for (int j = 0; j < levels[i].length; j++) { 123 | if (levels[i][j] <= thresholds[i/radius][j/radius]) { 124 | levels[i][j] = 0; 125 | } else { 126 | levels[i][j] = ImageUtils.MAX_LEVEL; 127 | } 128 | } 129 | } 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/preprocessing/LineExtractor.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.preprocessing; 2 | 3 | import java.io.File; 4 | import java.io.FilenameFilter; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils; 9 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils.ConnectedComponentProcessor; 10 | import edu.berkeley.cs.nlp.ocular.preprocessing.VerticalProfile.VerticalSegmentation; 11 | import tberg.murphy.fileio.f; 12 | import tberg.murphy.tuple.Pair; 13 | 14 | /** 15 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 16 | */ 17 | public class LineExtractor { 18 | 19 | public static List extractLines(double[][] levels) { 20 | 
VerticalProfile verticalProfile = new VerticalProfile(levels); 21 | VerticalModel trainedModel = verticalProfile.runEM(5, 100); 22 | // trainedModel.freezeSizeParams(1); 23 | VerticalSegmentation viterbiSegments = verticalProfile.decode(trainedModel); 24 | // ImageUtils.display(Visualizer.renderLineExtraction(levels, viterbiSegments)); 25 | 26 | List result = new ArrayList(); 27 | 28 | int topDist = 29; 29 | int botDist = 11; 30 | List> segments = viterbiSegments.retrieveLineBoundaries(); 31 | List baselines = viterbiSegments.retrieveBaselines(); 32 | for (int s=0; s= levels[0].length){ 41 | // if (pos < 0 || pos >= levels[0].length || pos < upper-5 || pos >= lower+5){ 42 | line[x][t] = ImageUtils.MAX_LEVEL; 43 | } else { 44 | line[x][t] = levels[x][pos]; 45 | } 46 | } 47 | } 48 | for (int b=0; b= levels[0].length){ 52 | // if (pos < 0 || pos >= levels[0].length || pos < upper-5 || pos >= lower+5){ 53 | line[x][topDist+b] = ImageUtils.MAX_LEVEL; 54 | } else { 55 | line[x][topDist+b] = levels[x][pos]; 56 | } 57 | } 58 | } 59 | result.add(line); 60 | } 61 | 62 | // List> lineBoundaries = viterbiSegments.retrieveLineBoundaries(); 63 | // for (Pair boundary : lineBoundaries) { 64 | // double[][] line = new double[levels.length][boundary.getSecond().intValue() - boundary.getFirst().intValue()]; 65 | // for (int y = boundary.getFirst().intValue(); y < boundary.getSecond().intValue(); y++) { 66 | // for (int x = 0; x < levels.length; x++) { 67 | // line[x][y-boundary.getFirst()] = levels[x][y]; 68 | // } 69 | // } 70 | // result.add(line); 71 | // } 72 | 73 | System.out.println("Extractor returned " + result.size() + " line images"); 74 | return result; 75 | } 76 | 77 | public static void main(String[] args) { 78 | String path = "/Users/tberg/Desktop/F-tem/seg_extraction/"; 79 | File dir = new File(path); 80 | for (String name : dir.list(new FilenameFilter() { 81 | public boolean accept(File dir, String name) { 82 | return name.endsWith(".png") || name.endsWith(".jpg"); 
83 | } 84 | })) { 85 | double[][] levels = ImageUtils.getLevels(f.readImage(path+"/"+name)); 86 | ConnectedComponentProcessor ccprocBig = new ConnectedComponentProcessor() { 87 | public void process(double[][] levels, List connectedComponent) { 88 | if (connectedComponent.size() > 1000) { 89 | for (int[] pixel : connectedComponent) { 90 | levels[pixel[0]][pixel[1]] = 255.0; 91 | } 92 | } 93 | } 94 | }; 95 | ImageUtils.processConnectedComponents(levels, 50.0, ccprocBig); 96 | Binarizer.binarizeGlobal(0.13, levels); 97 | ConnectedComponentProcessor ccprocSmall = new ConnectedComponentProcessor() { 98 | public void process(double[][] levels, List connectedComponent) { 99 | if (connectedComponent.size() < 20 || connectedComponent.size() > 1000) { 100 | for (int[] pixel : connectedComponent) { 101 | levels[pixel[0]][pixel[1]] = 255.0; 102 | } 103 | } 104 | } 105 | }; 106 | ImageUtils.processConnectedComponents(levels, 127.0, ccprocSmall); 107 | List lines = extractLines(levels); 108 | for (double[][] line : lines) { 109 | ImageUtils.display(ImageUtils.makeImage(line)); 110 | } 111 | } 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/preprocessing/ManualStackCropperPrep.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.preprocessing; 2 | 3 | import java.io.File; 4 | import java.io.FilenameFilter; 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils; 9 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils.ConnectedComponentProcessor; 10 | import tberg.murphy.fileio.f; 11 | 12 | public class ManualStackCropperPrep { 13 | 14 | public static void main(String[] args) { 15 | String path = args[0]; 16 | double binarizeThresh = 0.1; 17 | if (args.length > 1) { 18 | binarizeThresh = Double.parseDouble(args[1]); 19 | } 20 | File dir = new 
File(path); 21 | String[] names = dir.list(new FilenameFilter() { 22 | public boolean accept(File dir, String name) { 23 | return name.endsWith(".png") || name.endsWith(".jpg"); 24 | } 25 | }); 26 | Arrays.sort(names); 27 | File oddDirCol1 = new File(path + "/odd_col1"); 28 | File oddDirCol2 = new File(path + "/odd_col2"); 29 | oddDirCol1.mkdirs(); 30 | oddDirCol2.mkdirs(); 31 | File evenDirCol1 = new File(path + "/even_col1"); 32 | File evenDirCol2 = new File(path + "/even_col2"); 33 | evenDirCol1.mkdirs(); 34 | evenDirCol2.mkdirs(); 35 | File dirExtr = new File(path + "/col_extraction"); 36 | dirExtr.mkdirs(); 37 | boolean odd = false; 38 | for (String name : names) { 39 | double[][] levels = ImageUtils.getLevels(f.readImage(path+"/"+name)); 40 | ConnectedComponentProcessor ccprocBig = new ConnectedComponentProcessor() { 41 | public void process(double[][] levels, List connectedComponent) { 42 | if (connectedComponent.size() > 1000) { 43 | for (int[] pixel : connectedComponent) { 44 | levels[pixel[0]][pixel[1]] = 255.0; 45 | } 46 | } 47 | } 48 | }; 49 | ImageUtils.processConnectedComponents(levels, 50.0, ccprocBig); 50 | Binarizer.binarizeGlobal(binarizeThresh, levels); 51 | ConnectedComponentProcessor ccprocSmall = new ConnectedComponentProcessor() { 52 | public void process(double[][] levels, List connectedComponent) { 53 | if (connectedComponent.size() < 20 || connectedComponent.size() > 500) { 54 | for (int[] pixel : connectedComponent) { 55 | levels[pixel[0]][pixel[1]] = 255.0; 56 | } 57 | } 58 | } 59 | }; 60 | ImageUtils.processConnectedComponents(levels, 127.0, ccprocSmall); 61 | double[][] rotLevels = Straightener.straighten(levels); 62 | String baseName = (name.lastIndexOf('.') == -1) ? name : name.substring(0, name.lastIndexOf('.')); 63 | f.writeImage((odd ? oddDirCol1.getAbsolutePath() : evenDirCol1.getAbsolutePath()) +"/"+ baseName + "_col1.png", ImageUtils.makeImage(rotLevels)); 64 | f.writeImage((odd ? 
oddDirCol2.getAbsolutePath() : evenDirCol2.getAbsolutePath()) +"/"+ baseName + "_col2.png", ImageUtils.makeImage(rotLevels)); 65 | odd = !odd; 66 | } 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/preprocessing/Straightener.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.preprocessing; 2 | 3 | import java.awt.image.BufferedImage; 4 | import java.io.File; 5 | import java.io.FilenameFilter; 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils; 10 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils.ConnectedComponentProcessor; 11 | import tberg.murphy.fileio.f; 12 | 13 | /** 14 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 15 | */ 16 | public class Straightener { 17 | 18 | private static final double MIN_ANGLE_RADIANS = -0.05; 19 | private static final double MAX_ANGLE_RADIANS = 0.05; 20 | private static final int ANGLE_SAMPLE_POINTS = 20; 21 | 22 | public static double[][] straighten(double[][] levels) { 23 | BufferedImage image = ImageUtils.makeImage(levels); 24 | double maxTotalVar = Double.NEGATIVE_INFINITY; 25 | double bestAngle = Double.NEGATIVE_INFINITY; 26 | for (int i=0; i maxTotalVar) { 32 | maxTotalVar = totalVar; 33 | bestAngle = angle; 34 | } 35 | } 36 | return ImageUtils.getLevels(ImageUtils.rotateImage(ImageUtils.makeImage(levels), bestAngle)); 37 | } 38 | 39 | private static double verticalTotalVariation(double[][] levels) { 40 | double[] horizontalAvg = new double[levels[0].length]; 41 | for (int i=0; i 1) { 57 | binarizeThresh = Double.parseDouble(args[1]); 58 | } 59 | File dir = new File(path); 60 | String[] names = dir.list(new FilenameFilter() { 61 | public boolean accept(File dir, String name) { 62 | return name.endsWith(".png") || name.endsWith(".jpg"); 63 | } 64 | }); 65 | Arrays.sort(names); 
66 | File straightDir = new File(path + "/straight"); 67 | straightDir.mkdirs(); 68 | for (String name : names) { 69 | double[][] levels = ImageUtils.getLevels(f.readImage(path+"/"+name)); 70 | ConnectedComponentProcessor ccprocBig = new ConnectedComponentProcessor() { 71 | public void process(double[][] levels, List connectedComponent) { 72 | if (connectedComponent.size() > 1000) { 73 | for (int[] pixel : connectedComponent) { 74 | levels[pixel[0]][pixel[1]] = 255.0; 75 | } 76 | } 77 | } 78 | }; 79 | ImageUtils.processConnectedComponents(levels, 50.0, ccprocBig); 80 | Binarizer.binarizeGlobal(binarizeThresh, levels); 81 | ConnectedComponentProcessor ccprocSmall = new ConnectedComponentProcessor() { 82 | public void process(double[][] levels, List connectedComponent) { 83 | if (connectedComponent.size() < 20 || connectedComponent.size() > 500) { 84 | for (int[] pixel : connectedComponent) { 85 | levels[pixel[0]][pixel[1]] = 255.0; 86 | } 87 | } 88 | } 89 | }; 90 | ImageUtils.processConnectedComponents(levels, 127.0, ccprocSmall); 91 | double[][] rotLevels = Straightener.straighten(levels); 92 | String baseName = (name.lastIndexOf('.') == -1) ? 
name : name.substring(0, name.lastIndexOf('.')); 93 | f.writeImage(straightDir.getAbsolutePath() +"/"+ baseName + ".png", ImageUtils.makeImage(rotLevels)); 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/preprocessing/Test.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.preprocessing; 2 | 3 | import java.awt.image.BufferedImage; 4 | import java.io.File; 5 | 6 | import edu.berkeley.cs.nlp.ocular.data.PdfImageReader; 7 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils; 8 | import tberg.murphy.fileio.f; 9 | 10 | /** 11 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 12 | */ 13 | public class Test { 14 | 15 | public static void main(String[] args) { 16 | // String path = "sample_images/multilingual/"; 17 | // String path = "/Users/dhg/Desktop/pl/"; 18 | // File dir = new File(path); 19 | // for (String name : dir.list()) { 20 | // double[][] levels = ImageUtils.getLevels(f.readImage(path+"/"+name)); 21 | // double[][] rotLevels = Straightener.straighten(levels); 22 | // Binarizer.binarizeGlobal(0.08, rotLevels); 23 | // ImageUtils.display(ImageUtils.makeImage(rotLevels)); 24 | // 25 | // 26 | // // double[][] cropLevels = Cropper.crop(rotLevels); 27 | // // ImageUtils.display(ImageUtils.makeImage(cropLevels)); 28 | 29 | 30 | { 31 | File file = new File("sample_images/multilingual/pl_blac_047_00039-800.jpg"); 32 | BufferedImage image = f.readImage(file.getPath()); 33 | double[][] levels = ImageUtils.getLevels(image); 34 | double[][] rotLevels = Straightener.straighten(levels); 35 | Binarizer.binarizeGlobal(0.08, rotLevels); 36 | ImageUtils.display(ImageUtils.makeImage(rotLevels)); 37 | double[][] cropLevels = Cropper.crop(rotLevels, 0.12); 38 | ImageUtils.display(ImageUtils.makeImage(cropLevels)); 39 | } 40 | 41 | { 42 | File file = new 
File("sample_images/multilingual/adv.pdf"); 43 | BufferedImage image = PdfImageReader.readPdfPageAsImage(file, 1); 44 | double[][] levels = ImageUtils.getLevels(image); 45 | double[][] rotLevels = Straightener.straighten(levels); 46 | Binarizer.binarizeGlobal(0.08, rotLevels); 47 | ImageUtils.display(ImageUtils.makeImage(rotLevels)); 48 | double[][] cropLevels = Cropper.crop(rotLevels, 0.12); 49 | ImageUtils.display(ImageUtils.makeImage(cropLevels)); 50 | } 51 | 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/train/ModelPathMaker.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.train; 2 | 3 | /** 4 | * @author Dan Garrette (dhgarrette@gmail.com) 5 | */ 6 | public class ModelPathMaker { 7 | 8 | public static String makeFontDir(String outputPath) { 9 | return outputPath + "/font/"; 10 | } 11 | public static String makeFontPath(String outputPath, int iter, int batch) { 12 | return makeFontDir(outputPath) + makeOutputFilePrefix(iter, batch) + ".fontser"; 13 | } 14 | public static String makeFontFilenameRegex() { 15 | return makeOutputFilePrefixRegex() + ".fontser"; 16 | } 17 | 18 | public static String makeLmDir(String outputPath) { 19 | return outputPath + "/lm/"; 20 | } 21 | public static String makeLmPath(String outputPath, int iter, int batch) { 22 | return makeLmDir(outputPath) + makeOutputFilePrefix(iter, batch) + ".lmser"; 23 | } 24 | public static String makeLmFilenameRegex() { 25 | return makeOutputFilePrefixRegex() + ".lmser"; 26 | } 27 | 28 | public static String makeGsmDir(String outputPath) { 29 | return outputPath + "/gsm/"; 30 | } 31 | public static String makeGsmPath(String outputPath, int iter, int batch) { 32 | return makeGsmDir(outputPath) + makeOutputFilePrefix(iter, batch) + ".gsmser"; 33 | } 34 | public static String makeGsmFilenameRegex() { 35 | return 
makeOutputFilePrefixRegex() + ".gsmser"; 36 | } 37 | 38 | private static String makeOutputFilePrefix(int iter, int batch) { 39 | return "retrained_iter-"+iter+"_batch-"+batch; 40 | } 41 | public static String makeOutputFilePrefixRegex() { 42 | return "retrained_iter-(\\d+)_batch-(\\d+)"; 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/train/TrainingRestarter.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.train; 2 | 3 | import static edu.berkeley.cs.nlp.ocular.train.ModelPathMaker.makeFontPath; 4 | import static edu.berkeley.cs.nlp.ocular.train.ModelPathMaker.makeGsmPath; 5 | import static edu.berkeley.cs.nlp.ocular.train.ModelPathMaker.makeLmPath; 6 | import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2; 7 | import static edu.berkeley.cs.nlp.ocular.util.Tuple3.Tuple3; 8 | 9 | import java.io.File; 10 | 11 | import edu.berkeley.cs.nlp.ocular.font.Font; 12 | import edu.berkeley.cs.nlp.ocular.gsm.GlyphSubstitutionModel; 13 | import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel; 14 | import edu.berkeley.cs.nlp.ocular.main.InitializeFont; 15 | import edu.berkeley.cs.nlp.ocular.main.InitializeGlyphSubstitutionModel; 16 | import edu.berkeley.cs.nlp.ocular.main.InitializeLanguageModel; 17 | import edu.berkeley.cs.nlp.ocular.util.Tuple2; 18 | import edu.berkeley.cs.nlp.ocular.util.Tuple3; 19 | 20 | /** 21 | * @author Dan Garrette (dhgarrette@gmail.com) 22 | */ 23 | public class TrainingRestarter { 24 | 25 | /** 26 | * If requested, try and pick up where we left off 27 | */ 28 | public Tuple2> getRestartModels( 29 | Font inputFont, CodeSwitchLanguageModel inputLm, GlyphSubstitutionModel inputGsm, 30 | boolean updateLM, boolean updateGsm, String outputPath, 31 | int numEMIters, int numUsableDocs, int updateDocBatchSize, boolean noUpdateIfBatchTooSmall) { 32 | 33 | int 
// NOTE(review): middle of TrainingRestarter.getRestartModels, kept
// byte-identical (the method begins on the previous dump line and its helper
// continues on the next). Logic visible here: scan iterations 1..numEMIters
// for an existing font file of the iteration's last batch; then reload font,
// and optionally lm/gsm, from the last completed iteration.
lastCompletedIteration = 0; 34 | String fontPath = null; 35 | int lastBatchNumOfIteration = getLastBatchNumOfIteration(numUsableDocs, updateDocBatchSize, noUpdateIfBatchTooSmall); 36 | for (int iter = 1; iter <= numEMIters; ++iter) { 37 | fontPath = makeFontPath(outputPath, iter, lastBatchNumOfIteration); 38 | if (new File(fontPath).exists()) { 39 | lastCompletedIteration = iter; 40 | } 41 | } 42 | 43 | Font newFont = inputFont; 44 | CodeSwitchLanguageModel newLm = inputLm; 45 | GlyphSubstitutionModel newGsm = inputGsm; 46 | 47 | if (lastCompletedIteration == numEMIters) { 48 | System.out.println("All iterations are already complete!"); 49 | } 50 | else if (lastCompletedIteration > 0) { 51 | System.out.println("Last completed iteration: "+lastCompletedIteration); 52 | if (fontPath != null) { 53 | String lastFontPath = makeFontPath(outputPath, lastCompletedIteration, lastBatchNumOfIteration); 54 | System.out.println("  Loading font of last completed iteration: "+lastFontPath); 55 | newFont = InitializeFont.readFont(lastFontPath); 56 | } 57 | if (updateLM) { 58 | String lastLmPath = makeLmPath(outputPath, lastCompletedIteration, lastBatchNumOfIteration); 59 | System.out.println("  Loading lm of last completed iteration: "+lastLmPath); 60 | newLm = InitializeLanguageModel.readCodeSwitchLM(lastLmPath); 61 | } 62 | if (updateGsm) { 63 | String lastGsmPath = makeGsmPath(outputPath, lastCompletedIteration, lastBatchNumOfIteration); 64 | System.out.println("  Loading gsm of last completed iteration: "+lastGsmPath); 65 | newGsm = InitializeGlyphSubstitutionModel.readGSM(lastGsmPath); 66 | } 67 | } 68 | else { 69 | System.out.println("No completed iterations found"); 70 | } 71 | 72 | return Tuple2(lastCompletedIteration, Tuple3(newFont,newLm,newGsm)); 73 | } 74 | 75 | private int getLastBatchNumOfIteration(int numUsableDocs, int updateDocBatchSize, boolean noUpdateIfBatchTooSmall) { 76 | int completedBatchesInIteration = 0; 77 | int currentBatchSize = 0; 78 | for (int docNum = 
0; docNum < numUsableDocs; ++docNum) { 79 | ++currentBatchSize; 80 | if (FontTrainer.isBatchComplete(numUsableDocs, docNum, currentBatchSize, updateDocBatchSize, noUpdateIfBatchTooSmall)) { 81 | ++completedBatchesInIteration; 82 | } 83 | } 84 | return completedBatchesInIteration; 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/util/ArrayHelper.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.util; 2 | 3 | import java.util.Arrays; 4 | 5 | /** 6 | * @author Dan Garrette (dhgarrette@gmail.com) 7 | */ 8 | public class ArrayHelper { 9 | 10 | public static int sum(int[] xs) { 11 | int result = 0; 12 | for (int x : xs) result += x; 13 | return result; 14 | } 15 | 16 | public static double sum(double[] xs) { 17 | double result = 0.0; 18 | for (double x : xs) result += x; 19 | return result; 20 | } 21 | 22 | public static double avg(int[] xs) { 23 | if (xs.length == 0) return 0.0; 24 | else return ((double)sum(xs)) / xs.length; 25 | } 26 | 27 | public static double avg(double[] xs) { 28 | if (xs.length == 0) return 0.0; 29 | else return sum(xs) / xs.length; 30 | } 31 | 32 | public static int max(int... xs) { 33 | if (xs.length == 0) throw new RuntimeException("ArrayHelper.max cannot be used on an empty array."); 34 | int v = Integer.MIN_VALUE; 35 | for (int x : xs) { 36 | if (x > v) v = x; 37 | } 38 | return v; 39 | } 40 | 41 | public static double max(double... xs) { 42 | if (xs.length == 0) throw new RuntimeException("ArrayHelper.max cannot be used on an empty array."); 43 | double v = Double.MIN_VALUE; 44 | for (double x : xs) { 45 | if (x > v) v = x; 46 | } 47 | return v; 48 | } 49 | 50 | public static int min(int... 
xs) { 51 | if (xs.length == 0) throw new RuntimeException("ArrayHelper.min cannot be used on an empty array."); 52 | int v = Integer.MAX_VALUE; 53 | for (int x : xs) 54 | if (x < v) v = x; 55 | return v; 56 | } 57 | 58 | public static double min(double... xs) { 59 | if (xs.length == 0) throw new RuntimeException("ArrayHelper.min cannot be used on an empty array."); 60 | double v = Double.MAX_VALUE; 61 | for (double x : xs) 62 | if (x < v) v = x; 63 | return v; 64 | } 65 | 66 | public static int[] prepend(int c, int[] vec1) { 67 | int[] result = new int[vec1.length + 1]; 68 | if (vec1.length > 0) System.arraycopy(vec1, 0, result, 1, vec1.length); 69 | result[0] = c; 70 | return result; 71 | } 72 | 73 | public static A[] append(A[] vec1, A c) { 74 | A[] result = Arrays.copyOf(vec1, vec1.length + 1); 75 | result[result.length - 1] = c; 76 | return result; 77 | } 78 | 79 | public static int[] take(int[] vec1, int n) { 80 | int n2 = Math.min(vec1.length, n); 81 | int[] result = new int[n2]; 82 | if (vec1.length > 0) System.arraycopy(vec1, 0, result, 0, n2); 83 | return result; 84 | } 85 | 86 | public static int[] takeRight(int[] vec1, int n) { 87 | int n2 = Math.min(vec1.length, n); 88 | int[] result = new int[n2]; 89 | if (vec1.length > 0) System.arraycopy(vec1, vec1.length - n2, result, 0, n2); 90 | return result; 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/util/FileHelper.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.util; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.File; 5 | import java.io.FileOutputStream; 6 | import java.io.IOException; 7 | import java.io.OutputStreamWriter; 8 | 9 | /** 10 | * @author Dan Garrette (dhgarrette@gmail.com) 11 | */ 12 | public class FileHelper { 13 | 14 | public static void writeString(String path, String str) { 15 | BufferedWriter out = 
package edu.berkeley.cs.nlp.ocular.util;

import java.util.List;

/**
 * Static helper methods for strings.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class StringHelper {

	/** Render every character of {@code s} as a {@code \\uXXXX} escape. */
	public static String toUnicode(String s) {
		//if (s.length() != 1) throw new RuntimeException("toUnicode input must be a single character");
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < s.length(); ++i)
			sb.append(toUnicode(s.charAt(i)));
		return sb.toString();
	}

	/** Render {@code c} as a {@code \\uXXXX} escape (the OR with 0x10000 forces 4 hex digits). */
	public static String toUnicode(char c) {
		return "\\u" + Integer.toHexString(c | 0x10000).substring(1);
	}

	/** First {@code n} characters of {@code s} (the whole string if shorter; "" if n <= 0). */
	public static String take(String s, int n) {
		if (n <= 0)
			return "";
		else if (n < s.length())
			return s.substring(0, n);
		else
			return s;
	}

	/** {@code s} without its first {@code n} characters ("" if n >= length; unchanged if n <= 0). */
	public static String drop(String s, int n) {
		if (n <= 0)
			return s;
		else if (n < s.length())
			return s.substring(n);
		else
			return "";
	}

	/**
	 * Last character of {@code s}, as a one-character string.
	 * @throws IllegalArgumentException if {@code s} is empty
	 */
	public static String last(String s) {
		if (s.isEmpty()) throw new IllegalArgumentException("cannot get `last` of empty string");
		return s.substring(s.length() - 1);
	}

	/** Concatenate the given strings. */
	public static String join(String... xs) {
		StringBuilder sb = new StringBuilder();
		for (String x : xs)
			sb.append(x);
		return sb.toString();
	}

	/** Concatenate the given strings.  (Generic parameter restored; it was lost in extraction.) */
	public static String join(List<String> xs) {
		StringBuilder sb = new StringBuilder();
		for (String x : xs)
			sb.append(x);
		return sb.toString();
	}

	/** Concatenate the given strings, separated by {@code sep}; "" for an empty list. */
	public static String join(List<String> xs, String sep) {
		int sepLen = sep.length();
		StringBuilder sb = new StringBuilder();
		for (String x : xs)
			sb.append(x).append(sep);
		// Trim the trailing separator appended by the loop above.
		return sb.length() > 0 ? sb.delete(sb.length() - sepLen, sb.length()).toString() : "";
	}

	/** Null-safe equality: true iff both are null or both are equal. */
	public static boolean equals(String a, String b) {
		if (a == null)
			return b == null;
		else
			return a.equals(b);
	}

	/** Length of the longest common prefix of {@code a} and {@code b}. */
	public static int longestCommonPrefix(String a, String b) {
		int i = 0;
		char[] as = a.toCharArray();
		char[] bs = b.toCharArray();
		int aLen = as.length;
		int bLen = bs.length;
		while (i < aLen && i < bLen && as[i] == bs[i])
			++i;
		return i;
	}

}
package edu.berkeley.cs.nlp.ocular.util;

import java.io.Serializable;
import java.util.Comparator;

/**
 * An immutable pair.  (Generic parameters restored; they were lost in extraction.)
 *
 * @author Dan Klein
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class Tuple2<A1, A2> implements Serializable {
	static final long serialVersionUID = 52;

	public final A1 _1;
	public final A2 _2;

	public boolean equals(Object o) {
		if (this == o)
			return true;
		if (!(o instanceof Tuple2))
			return false;

		@SuppressWarnings("rawtypes")
		final Tuple2 tuple = (Tuple2) o;

		if (_1 != null ? !_1.equals(tuple._1) : tuple._1 != null)
			return false;
		if (_2 != null ? !_2.equals(tuple._2) : tuple._2 != null)
			return false;

		return true;
	}

	public int hashCode() {
		int result;
		result = (_1 != null ? _1.hashCode() : 0);
		result = 29 * result + (_2 != null ? _2.hashCode() : 0);
		return result;
	}

	public String toString() {
		return "(" + _1 + ", " + _2 + ")";
	}

	public Tuple2(A1 _1, A2 _2) {
		this._1 = _1;
		this._2 = _2;
	}

	/** Static factory, so callers can write {@code Tuple2(a, b)} via static import. */
	public static <A1, A2> Tuple2<A1, A2> Tuple2(A1 _1, A2 _2) {
		return new Tuple2<A1, A2>(_1, _2);
	}

	/** Orders pairs by the first element, breaking ties with the second. */
	public static class LexicographicTuple2Comparator<A1, A2> implements Comparator<Tuple2<A1, A2>> {
		Comparator<A1> _1Comparator;
		Comparator<A2> _2Comparator;

		public int compare(Tuple2<A1, A2> tuple1, Tuple2<A1, A2> tuple2) {
			int _1Compare = _1Comparator.compare(tuple1._1, tuple2._1);
			if (_1Compare != 0)
				return _1Compare;
			return _2Comparator.compare(tuple1._2, tuple2._2);
		}

		public LexicographicTuple2Comparator(Comparator<A1> _1Comparator, Comparator<A2> _2Comparator) {
			this._1Comparator = _1Comparator;
			this._2Comparator = _2Comparator;
		}
	}

	/** Lexicographic order via the elements' natural ordering. */
	public static class DefaultLexicographicTuple2Comparator<A1 extends Comparable<A1>, A2 extends Comparable<A2>>
			implements Comparator<Tuple2<A1, A2>> {

		public int compare(Tuple2<A1, A2> x, Tuple2<A1, A2> y) {
			int _1Compare = x._1.compareTo(y._1);
			if (_1Compare != 0) {
				return _1Compare;
			}
			// BUG FIX: was `y._2.compareTo(y._2)`, which compares y to itself and
			// always returns 0, so second elements never broke ties.
			return x._2.compareTo(y._2);
		}

	}

}
package edu.berkeley.cs.nlp.ocular.util;

import java.io.Serializable;
import java.util.Comparator;

/**
 * An immutable triple.  (Generic parameters restored; they were lost in extraction.)
 *
 * @author Dan Klein
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class Tuple3<A1, A2, A3> implements Serializable {
	static final long serialVersionUID = 53;

	public final A1 _1;
	public final A2 _2;
	public final A3 _3;

	public boolean equals(Object o) {
		if (this == o)
			return true;
		if (!(o instanceof Tuple3))
			return false;

		@SuppressWarnings("rawtypes")
		final Tuple3 tuple = (Tuple3) o;

		if (_1 != null ? !_1.equals(tuple._1) : tuple._1 != null)
			return false;
		if (_2 != null ? !_2.equals(tuple._2) : tuple._2 != null)
			return false;
		if (_3 != null ? !_3.equals(tuple._3) : tuple._3 != null)
			return false;

		return true;
	}

	public int hashCode() {
		int result;
		result = (_1 != null ? _1.hashCode() : 0);
		result = 29 * result + (_2 != null ? _2.hashCode() : 0);
		result = 31 * result + (_3 != null ? _3.hashCode() : 0);
		return result;
	}

	public String toString() {
		return "(" + _1 + ", " + _2 + ", " + _3 + ")";
	}

	public Tuple3(A1 _1, A2 _2, A3 _3) {
		this._1 = _1;
		this._2 = _2;
		this._3 = _3;
	}

	/** Static factory, so callers can write {@code Tuple3(a, b, c)} via static import. */
	public static <A1, A2, A3> Tuple3<A1, A2, A3> Tuple3(A1 _1, A2 _2, A3 _3) {
		return new Tuple3<A1, A2, A3>(_1, _2, _3);
	}

	/** Orders triples by the first element, then the second, then the third. */
	public static class LexicographicTuple3Comparator<A1, A2, A3> implements Comparator<Tuple3<A1, A2, A3>> {
		Comparator<A1> _1Comparator;
		Comparator<A2> _2Comparator;
		Comparator<A3> _3Comparator;

		public int compare(Tuple3<A1, A2, A3> tuple1, Tuple3<A1, A2, A3> tuple2) {
			int _1Compare = _1Comparator.compare(tuple1._1, tuple2._1);
			if (_1Compare != 0)
				return _1Compare;
			int _2Compare = _2Comparator.compare(tuple1._2, tuple2._2);
			if (_2Compare != 0)
				return _2Compare;
			return _3Comparator.compare(tuple1._3, tuple2._3);
		}

		public LexicographicTuple3Comparator(Comparator<A1> _1Comparator, Comparator<A2> _2Comparator, Comparator<A3> _3Comparator) {
			this._1Comparator = _1Comparator;
			this._2Comparator = _2Comparator;
			this._3Comparator = _3Comparator;
		}
	}

	/** Lexicographic order via the elements' natural ordering. */
	public static class DefaultLexicographicTuple3Comparator<A1 extends Comparable<A1>, A2 extends Comparable<A2>, A3 extends Comparable<A3>>
			implements Comparator<Tuple3<A1, A2, A3>> {

		public int compare(Tuple3<A1, A2, A3> x, Tuple3<A1, A2, A3> y) {
			int _1Compare = x._1.compareTo(y._1);
			if (_1Compare != 0) {
				return _1Compare;
			}
			int _2Compare = x._2.compareTo(y._2);
			if (_2Compare != 0) {
				return _2Compare;
			}
			return x._3.compareTo(y._3);
		}

	}

}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.ACUTE_COMBINING;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.DIAERESIS_COMBINING;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.GRAVE_COMBINING;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.MACRON_COMBINING;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.TILDE_COMBINING;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;

import java.util.Arrays;
import java.util.List;

import org.junit.Test;

/**
 * Tests for BasicTextReader.  (Stripped {@code List<String>} generics restored.)
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class BasicTextReaderTests {

	private String s1 = "ing th\\~q || | follies of thõsè, who éither ``sæek'' out th\\\"os\\`e wæys \"and\" means, which either are sq̃uccess lessons";

	@Test
	public void test_readCharacters_qtilde() {
		TextReader tr = new BasicTextReader();
		assertEquals(Arrays.asList("q" + TILDE_COMBINING), tr.readCharacters("q̃"));
		assertEquals(Arrays.asList("t", "h", "q" + TILDE_COMBINING, "r"), tr.readCharacters("thq̃r"));
		assertEquals(Arrays.asList("t", "h", "q" + TILDE_COMBINING, "r"), tr.readCharacters("th\\~qr"));
	}

	@Test
	public void test_readCharacters_stackedDiacritics() {
		TextReader tr = new BasicTextReader();
		assertEquals(Arrays.asList("n" + TILDE_COMBINING + MACRON_COMBINING + DIAERESIS_COMBINING + ACUTE_COMBINING + GRAVE_COMBINING), tr.readCharacters("\\`\\'ñ" + MACRON_COMBINING + DIAERESIS_COMBINING));
	}

	@Test
	public void test_readCharacters_dia() {
		TextReader tr = new BasicTextReader();
		List<String> r = Arrays.asList("i", "n", "g", " ", "t", "h", "q" + TILDE_COMBINING, " ", "|", "|", " ", "|", " ", "f", "o", "l", "l", "i", "e", "s", " ", "o", "f", " ", "t", "h", "o" + TILDE_COMBINING, "s", "e" + GRAVE_COMBINING, ",", " ", "w", "h", "o", " ", "e" + ACUTE_COMBINING, "i", "t", "h", "e", "r", " ", "\"", "s", "æ", "e", "k", "\"", " ", "o", "u", "t", " ", "t", "h", "o" + DIAERESIS_COMBINING, "s", "e" + GRAVE_COMBINING, " ", "w", "æ", "y", "s", " ", "\"", "a", "n", "d", "\"", " ", "m", "e", "a", "n", "s", ",", " ", "w", "h", "i", "c", "h", " ", "e", "i", "t", "h", "e", "r", " ", "a", "r", "e", " ", "s", "q" + TILDE_COMBINING, "u", "c", "c", "e", "s", "s", " ", "l", "e", "s", "s", "o", "n", "s");
		assertEquals(r, tr.readCharacters(s1));
	}

	@Test
	public void test_readCharacters_backslash() {
		TextReader tr = new BasicTextReader();
		List<String> r = Arrays.asList("t", "h", "i", "s", "\\\\", "t", "h", "a", "t", "\\\\", "t", "h", "e", "\\\\");
		assertEquals(r, tr.readCharacters("this\\\\that\\\\the\\\\"));
		try {
			List<String> r2 = tr.readCharacters("this\\that\\the\\");
			fail("Exception expected, found: [" + r2 + "]");
		} catch (RuntimeException e) {
			assertEquals("Unrecognized escape sequence: [\\t]", e.getMessage());
		}
	}

	@Test
	public void test_readCharacters_noEscapeChar() {
		BasicTextReader tr = new BasicTextReader(false);
		assertEquals(Arrays.asList("t", "h", "\\\\", "~", "q", "r", "\\\\", "\\\\", "x"), tr.readCharacters("th\\~qr\\\\x"));
	}

}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import static org.junit.Assert.assertEquals;

import java.util.Arrays;

import org.junit.Test;

import edu.berkeley.cs.nlp.ocular.util.CollectionHelper;

/**
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class BlacklistCharacterSetTextReaderTests {

	@Test
	public void test_readCharacters() {
		String s = "thi&s tha$t t$he";

		TextReader tr = new BlacklistCharacterSetTextReader(CollectionHelper.makeSet("&", "$"), new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "i", "s", " ", "t", "h", "a", "t", " ", "t", "h", "e"), tr.readCharacters(s));
	}

}

// ---- originally a separate file: CharIndexerTests.java (same package) ----

/**
 * Tests for CharIndexer.  (Stripped {@code Indexer<String>} generics restored.)
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
class CharIndexerTests {

	@org.junit.Test
	public void test() {
		tberg.murphy.indexer.Indexer<String> i = new CharIndexer();

		String ae = Charset.TILDE_ESCAPE + "a";
		String ac = "a" + Charset.TILDE_COMBINING;

		String ee = Charset.TILDE_ESCAPE + "e";
		String ec = "e" + Charset.TILDE_COMBINING;

		String ne = Charset.TILDE_ESCAPE + "n";
		String nc = "n" + Charset.TILDE_COMBINING;
		String np = "ñ";

		i.index(new String[] { "a", "b", ec });

		org.junit.Assert.assertTrue(i.contains("a"));
		org.junit.Assert.assertTrue(i.contains("b"));
		org.junit.Assert.assertTrue(i.contains(ec));
		org.junit.Assert.assertTrue(i.contains(ee));
		assertEquals(0, i.getIndex("a"));
		assertEquals("a", i.getObject(0));
		assertEquals(1, i.getIndex("b"));
		assertEquals("b", i.getObject(1));
		assertEquals(2, i.getIndex(ec));
		assertEquals(ec, i.getObject(2));
		assertEquals(2, i.getIndex(ec));
		assertEquals(3, i.size());

		org.junit.Assert.assertFalse(i.contains(ae));
		org.junit.Assert.assertFalse(i.contains(ac));
		assertEquals(3, i.getIndex(ae));
		org.junit.Assert.assertTrue(i.contains(ae));
		org.junit.Assert.assertTrue(i.contains(ac));
		assertEquals(3, i.getIndex(ac));
		org.junit.Assert.assertTrue(i.contains(ae));
		org.junit.Assert.assertTrue(i.contains(ac));
		assertEquals(4, i.size());

		org.junit.Assert.assertFalse(i.contains(ne));
		org.junit.Assert.assertFalse(i.contains(nc));
		org.junit.Assert.assertFalse(i.contains(np));
		assertEquals(4, i.getIndex(np));
		assertEquals(nc, i.getObject(4));
		org.junit.Assert.assertTrue(i.contains(ne));
		org.junit.Assert.assertTrue(i.contains(nc));
		org.junit.Assert.assertTrue(i.contains(np));
		assertEquals(4, i.getIndex(ne));
		assertEquals(4, i.getIndex(nc));
		assertEquals(nc, i.getObject(4));
		assertEquals(5, i.size());

		org.junit.Assert.assertFalse(i.locked());
		i.lock();
		org.junit.Assert.assertTrue(i.locked());
	}

}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import static org.junit.Assert.assertEquals;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.*;

import java.util.Arrays;
import java.util.List;

import org.junit.Test;

/**
 * Tests for ConvertLongSTextReader.  (Stripped {@code List<String>} generics restored.)
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class ConvertLongSTextReaderTests {

	private String s1 = "ing th\\~q || | follies of thõsè, who éither ``sæek'' out th\\\"os\\`e wæys \"and\" means, which either are sq̃uccess confession asi \\\\lessons";

	@Test
	public void test_readCharacters() {
		TextReader tr = new ConvertLongSTextReader(new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "o" + TILDE_COMBINING, "ſ", "e" + GRAVE_COMBINING), tr.readCharacters("thõsè"));
		assertEquals(Arrays.asList("ſ", "i"), tr.readCharacters("si"));
		assertEquals(Arrays.asList("ſ", "i", "n"), tr.readCharacters("sin"));
		assertEquals(Arrays.asList("a", "ſ", "i"), tr.readCharacters("asi"));
		assertEquals(Arrays.asList("ſ", "s", "i"), tr.readCharacters("ssi"));
		assertEquals(Arrays.asList("a", "ſ", "s", "i"), tr.readCharacters("assi"));
		assertEquals(Arrays.asList("ſ", "s", "i", "n"), tr.readCharacters("ssin"));
		assertEquals(Arrays.asList("a", "ſ", "s", "i", "n"), tr.readCharacters("assin"));
		List<String> r = Arrays.asList("i", "n", "g", " ", "t", "h", "q" + TILDE_COMBINING, " ", "|", "|", " ", "|", " ", "f", "o", "l", "l", "i", "e", "s", " ", "o", "f", " ", "t", "h", "o" + TILDE_COMBINING, "ſ", "e" + GRAVE_COMBINING, ",", " ", "w", "h", "o", " ", "e" + ACUTE_COMBINING, "i", "t", "h", "e", "r", " ", "\"", "ſ", "æ", "e", "k", "\"", " ", "o", "u", "t", " ", "t", "h", "o" + DIAERESIS_COMBINING, "ſ", "e" + GRAVE_COMBINING, " ", "w", "æ", "y", "s", " ", "\"", "a", "n", "d", "\"", " ", "m", "e", "a", "n", "s", ",", " ", "w", "h", "i", "c", "h", " ", "e", "i", "t", "h", "e", "r", " ", "a", "r", "e", " ", "ſ", "q" + TILDE_COMBINING, "u", "c", "c", "e", "ſ", "s", " ", "c", "o", "n", "f", "e", "ſ", "s", "i", "o", "n", " ", "a", "ſ", "i", " ", "\\\\", "l", "e", "ſ", "ſ", "o", "n", "s");
		assertEquals(r, tr.readCharacters(s1));
	}

	@Test
	public void test_readCharacters_removeDia() {
		TextReader tr = new ConvertLongSTextReader(new RemoveAllDiacriticsTextReader(new BasicTextReader()));
		List<String> r = Arrays.asList("i", "n", "g", " ", "t", "h", "q", " ", "|", "|", " ", "|", " ", "f", "o", "l", "l", "i", "e", "s", " ", "o", "f", " ", "t", "h", "o", "ſ", "e", ",", " ", "w", "h", "o", " ", "e", "i", "t", "h", "e", "r", " ", "\"", "ſ", "æ", "e", "k", "\"", " ", "o", "u", "t", " ", "t", "h", "o", "ſ", "e", " ", "w", "æ", "y", "s", " ", "\"", "a", "n", "d", "\"", " ", "m", "e", "a", "n", "s", ",", " ", "w", "h", "i", "c", "h", " ", "e", "i", "t", "h", "e", "r", " ", "a", "r", "e", " ", "ſ", "q", "u", "c", "c", "e", "ſ", "s", " ", "c", "o", "n", "f", "e", "ſ", "s", "i", "o", "n", " ", "a", "ſ", "i", " ", "\\\\", "l", "e", "ſ", "ſ", "o", "n", "s");
		assertEquals(r, tr.readCharacters(s1));
	}

}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.DIAERESIS_COMBINING;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.MACRON_COMBINING;
import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.List;

import org.junit.Test;

/**
 * Tests for RemoveAllDiacriticsTextReader.  (Stripped {@code List<String>} generics restored.)
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class RemoveAllDiacriticsTextReaderTests {

	private String s1 = "ing th\\~q || | follies of thõsè, who éither ``sæek'' out th\\\"os\\`e wæys \"and\" means, which either are sq̃uccess lessons";

	@Test
	public void test_readCharacters_qtilde_nodia() {
		TextReader tr = new RemoveAllDiacriticsTextReader(new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "q", "r"), tr.readCharacters("thq̃r"));
		assertEquals(Arrays.asList("t", "h", "q", "r"), tr.readCharacters("th\\~qr"));
	}

	@Test
	public void test_readCharacters_stackedDiacritics_nodia() {
		TextReader tr = new RemoveAllDiacriticsTextReader(new BasicTextReader());
		assertEquals(Arrays.asList("n"), tr.readCharacters("\\`\\'ñ" + MACRON_COMBINING + DIAERESIS_COMBINING));
	}

	@Test
	public void test_readCharacters_plain() {
		TextReader tr = new RemoveAllDiacriticsTextReader(new BasicTextReader());
		//assertEquals(Arrays.asList(), tr.readCharacters("tiquinhu\\-almoqu\\-ixtililia"));

		List<String> r = Arrays.asList("i", "n", "g", " ", "t", "h", "q", " ", "|", "|", " ", "|", " ", "f", "o", "l", "l", "i", "e", "s", " ", "o", "f", " ", "t", "h", "o", "s", "e", ",", " ", "w", "h", "o", " ", "e", "i", "t", "h", "e", "r", " ", "\"", "s", "æ", "e", "k", "\"", " ", "o", "u", "t", " ", "t", "h", "o", "s", "e", " ", "w", "æ", "y", "s", " ", "\"", "a", "n", "d", "\"", " ", "m", "e", "a", "n", "s", ",", " ", "w", "h", "i", "c", "h", " ", "e", "i", "t", "h", "e", "r", " ", "a", "r", "e", " ", "s", "q", "u", "c", "c", "e", "s", "s", " ", "l", "e", "s", "s", "o", "n", "s");
		assertEquals(r, tr.readCharacters(s1));

	}

}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.makeList;
import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2;
import static org.junit.Assert.assertEquals;

import java.util.List;

import org.junit.Test;

import edu.berkeley.cs.nlp.ocular.util.StringHelper;

/**
 * Tests for ReplaceSomeTextReader.  (Stripped {@code List<String>} casts restored.)
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class ReplaceSomeTextReaderTests {

	@Test
	public void test_readCharacters_1() {
		TextReader tr = new ReplaceSomeTextReader(makeList(Tuple2(Tuple2((List<String>) makeList("a", "b"), (List<String>) makeList("x", "y", "z")), 3)), new BasicTextReader());
		assertEquals("ab1ab2xyz3ab4ab5xyz6ab7ab8", StringHelper.join(tr.readCharacters("ab1ab2ab3ab4ab5ab6ab7ab8")));
	}

	@Test
	public void test_readCharacters_2() {
		TextReader tr = new ReplaceSomeTextReader(makeList(Tuple2(Tuple2((List<String>) makeList("a", "b"), (List<String>) makeList("x", "y", "z")), 4)), new BasicTextReader());
		assertEquals("ab1ab2ab3xyz4ab5ab6ab7xyz8", StringHelper.join(tr.readCharacters("ab1ab2ab3ab4ab5ab6ab7ab8")));
	}

	@Test
	public void test_readCharacters_3() {
		TextReader tr = new ReplaceSomeTextReader(makeList(Tuple2(Tuple2((List<String>) makeList("a", "b"), (List<String>) makeList("x", "y", "z")), 1)), new BasicTextReader());
		assertEquals("xyz", StringHelper.join(tr.readCharacters("ab")));
	}

	@Test
	public void test_readCharacters_4() {
		TextReader tr = new ReplaceSomeTextReader(makeList(Tuple2(Tuple2((List<String>) makeList("a", "b"), (List<String>) makeList("x", "y", "z")), 4)), new BasicTextReader());
		assertEquals("ab1ab2ab3xyz4ab5ab6ab7xyz", StringHelper.join(tr.readCharacters("ab1ab2ab3ab4ab5ab6ab7ab")));
	}

	@Test
	public void test_readCharacters_5() {
		TextReader tr = new ReplaceSomeTextReader(makeList( //
				Tuple2(Tuple2((List<String>) makeList("a", "b"), (List<String>) makeList("x", "y", "z")), 3), //
				Tuple2(Tuple2((List<String>) makeList("y", "z"), (List<String>) makeList("e")), 2)), //
				new BasicTextReader());
		assertEquals("ab1ab2xyz3ab4ab5xe6ab7ab8", StringHelper.join(tr.readCharacters("ab1ab2ab3ab4ab5ab6ab7ab8")));
	}

	@Test
	public void test_readCharacters_6() {
		TextReader tr = new ReplaceSomeTextReader(makeList(Tuple2(Tuple2((List<String>) makeList("x", "x"), (List<String>) makeList("a")), 1)), new BasicTextReader());
		assertEquals("aa", StringHelper.join(tr.readCharacters("xxxx")));
	}

	@Test
	public void test_readCharacters_7() {
		TextReader tr = new ReplaceSomeTextReader(makeList(Tuple2(Tuple2((List<String>) makeList("x", "x"), (List<String>) makeList("a", "x")), 1)), new BasicTextReader());
		assertEquals("axax", StringHelper.join(tr.readCharacters("xxxx")));
	}

}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.ACUTE_COMBINING;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.GRAVE_COMBINING;
import static org.junit.Assert.assertEquals;

import java.util.Arrays;

import org.junit.Test;

import edu.berkeley.cs.nlp.ocular.util.CollectionHelper;

/**
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class WhitelistCharacterSetTextReaderTests {

	@Test
	public void test_readCharacters_default() {
		String s = "thi&s thá$t t$hè";
		WhitelistCharacterSetTextReader tr1 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "i", "s", "t"), new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "i", "s", " ", "t", "h", "t", " ", "t", "h"), tr1.readCharacters(s));
		WhitelistCharacterSetTextReader tr2 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "i", "s", "t", "\\'a"), new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "i", "s", " ", "t", "h", "a" + ACUTE_COMBINING, "t", " ", "t", "h"), tr2.readCharacters(s));
		WhitelistCharacterSetTextReader tr3 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "í", "s", "t"), new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "s", " ", "t", "h", "t", " ", "t", "h"), tr3.readCharacters(s));
	}

	@Test
	public void test_readCharacters_considerDiacritics() {
		String s = "thi&s thá$t t$hè";
		WhitelistCharacterSetTextReader tr1 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "i", "s", "t"), false, new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "i", "s", " ", "t", "h", "t", " ", "t", "h"), tr1.readCharacters(s));
		WhitelistCharacterSetTextReader tr2 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "i", "s", "t", "\\'a"), false, new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "i", "s", " ", "t", "h", "a" + ACUTE_COMBINING, "t", " ", "t", "h"), tr2.readCharacters(s));
		WhitelistCharacterSetTextReader tr3 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "í", "s", "t"), false, new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "s", " ", "t", "h", "t", " ", "t", "h"), tr3.readCharacters(s));
	}

	@Test
	public void test_readCharacters_disregardDiacritics() {
		String s = "thi&s thá$t t$hè";
		WhitelistCharacterSetTextReader tr1 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "i", "s", "t"), true, new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "i", "s", " ", "t", "h", "a" + ACUTE_COMBINING, "t", " ", "t", "h", "e" + GRAVE_COMBINING), tr1.readCharacters(s));
		WhitelistCharacterSetTextReader tr2 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "i", "s", "t", "\\'a"), true, new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "i", "s", " ", "t", "h", "a" + ACUTE_COMBINING, "t", " ", "t", "h", "e" + GRAVE_COMBINING), tr2.readCharacters(s));
		WhitelistCharacterSetTextReader tr3 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "í", "s", "t"), true, new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "s", " ", "t", "h", "a" + ACUTE_COMBINING, "t", " ", "t", "h", "e" + GRAVE_COMBINING), tr3.readCharacters(s));
	}

}
package edu.berkeley.cs.nlp.ocular.gsm;

import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.TILDE_COMBINING;
import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.makeSet;
import static org.junit.Assert.assertEquals;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import org.junit.Test;

import edu.berkeley.cs.nlp.ocular.data.textreader.Charset;
import edu.berkeley.cs.nlp.ocular.gsm.BasicGlyphSubstitutionModel.BasicGlyphSubstitutionModelFactory;
import tberg.murphy.indexer.HashMapIndexer;
import tberg.murphy.indexer.Indexer;

/**
 * Tests for BasicGlyphSubstitutionModel smoothing.
 * (Stripped {@code Indexer<String>}/{@code List<Integer>} generics restored.)
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class BasicGlyphSubstitutionModelTests {

	@Test
	public void test_getSmoothingValue() {

		double gsmSmoothingCount = 0.1;
		double gsmElisionSmoothingCountMultiplier = 500.0;
		Indexer<String> langIndexer = new HashMapIndexer<String>(); langIndexer.index(new String[] {"spanish", "latin"}); langIndexer.lock();
		String[] chars = new String[] {" ","-","a","b","c","d","e","f","k","n","o","s","\\'o"};
		Indexer<String> charIndexer = new HashMapIndexer<String>(); charIndexer.index(chars);

		List<Integer> charIndices = new ArrayList<Integer>();
		for (String c : chars) charIndices.add(charIndexer.getIndex(c));
		Set<Integer> fullCharSet = makeSet(charIndices);
		@SuppressWarnings("unchecked")
		Set<Integer>[] activeCharacterSets = new Set[] {fullCharSet, fullCharSet};
		charIndexer.getIndex("z");
		charIndexer.getIndex(Charset.LONG_S);
		for (String c : new String[] {"a","b","c","d","e","f","k","n","o","s","z"}) charIndices.add(charIndexer.getIndex(c+TILDE_COMBINING));
		charIndexer.lock();
		double gsmPower = 2.0;
		int minCountsForEvalGsm = 2;
		String outputPath = "";

		BasicGlyphSubstitutionModelFactory gsmf = new BasicGlyphSubstitutionModelFactory(
				gsmSmoothingCount,
				gsmElisionSmoothingCountMultiplier,
				langIndexer,
				charIndexer,
				activeCharacterSets,
				gsmPower,
				minCountsForEvalGsm,
				outputPath);

		assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("\\'o"), gsmf.GLYPH_ELISION_TILDE), 1e-9);
		assertEquals(gsmSmoothingCount, gsmf.getSmoothingValue(0, charIndexer.getIndex("k"), charIndexer.getIndex("k")), 1e-9);
		assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("k"), gsmf.GLYPH_FIRST_ELIDED), 1e-9);
		assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("k"), gsmf.GLYPH_FIRST_ELIDED), 1e-9);
		assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("k"), gsmf.GLYPH_TILDE_ELIDED), 1e-9);
		assertEquals(gsmSmoothingCount, gsmf.getSmoothingValue(0, charIndexer.getIndex("a"), charIndexer.getIndex("a")), 1e-9);
		assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("n"), gsmf.GLYPH_TILDE_ELIDED), 1e-9);
		assertEquals(gsmSmoothingCount, gsmf.getSmoothingValue(0, charIndexer.getIndex("a"), charIndexer.getIndex("a")), 1e-9);
		assertEquals(0.0, gsmf.getSmoothingValue(0, charIndexer.getIndex("a"), charIndexer.getIndex("z")), 1e-9);
		assertEquals(0.0, gsmf.getSmoothingValue(0, charIndexer.getIndex("a"), charIndexer.getIndex(Charset.LONG_S)), 1e-9);
		assertEquals(gsmSmoothingCount, gsmf.getSmoothingValue(0, charIndexer.getIndex("s"), charIndexer.getIndex(Charset.LONG_S)), 1e-9);

	}

}
package edu.berkeley.cs.nlp.ocular.model;

import static org.junit.Assert.assertEquals;

import java.util.Collection;
import java.util.List;

import org.junit.Test;

import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar.GlyphType;
import edu.berkeley.cs.nlp.ocular.model.transition.SparseTransitionModel.TransitionState;
import edu.berkeley.cs.nlp.ocular.train.FontTrainer;
import edu.berkeley.cs.nlp.ocular.util.Tuple2;
import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.*;
import tberg.murphy.indexer.HashMapIndexer;
import tberg.murphy.indexer.Indexer;
import edu.berkeley.cs.nlp.ocular.model.DecodeState;
import static edu.berkeley.cs.nlp.ocular.model.TransitionStateType.*;

/**
 * Tests for {@link FontTrainer#makeFullViterbiStateSeq}, which flattens a
 * per-line array of Viterbi decode states into a single state sequence.
 *
 * NOTE(review): the dump this file was recovered from had its generic type
 * parameters stripped (e.g. {@code Collection>}), which does not compile;
 * they are restored here ({@code Collection<Tuple2<TransitionState, Double>>},
 * {@code Indexer<String>}, {@code List<DecodeState>}).
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class FontTrainEMTests {

	/**
	 * Minimal stub TransitionState carrying only the fields that
	 * makeFullViterbiStateSeq inspects, plus an {@code id} so the test can
	 * identify which states survive the flattening.
	 */
	class TS implements TransitionState {
		public final int id; // test-only identifier, not part of the TransitionState contract
		private int languageIndex;
		private int lmCharIndex;
		private TransitionStateType type;
		private GlyphChar glyphChar;

		public TS(int id, int languageIndex, int lmCharIndex, TransitionStateType type, GlyphChar glyphChar) {
			this.id = id;
			this.languageIndex = languageIndex;
			this.lmCharIndex = lmCharIndex;
			this.type = type;
			this.glyphChar = glyphChar;
		}
		@Override public int getLanguageIndex() { return languageIndex; }
		@Override public int getLmCharIndex() { return lmCharIndex; }
		@Override public TransitionStateType getType() { return type; }
		@Override public GlyphChar getGlyphChar() { return glyphChar; }

		// The remaining interface methods are never exercised by this test,
		// so they return inert placeholder values.
		@Override public int getOffset() { return -1; }
		@Override public int getExposure() { return -1; }
		@Override public Collection<Tuple2<TransitionState, Double>> forwardTransitions() { return null; }
		@Override public Collection<Tuple2<TransitionState, Double>> nextLineStartStates() { return null; }
		@Override public double endLogProb() { return -1; }

		@Override public String toString() {
			return "TS(" + id + ", " + languageIndex + ", " + lmCharIndex + ", " + type + ", " + glyphChar + ")";
		}
	}

	/** Wrap a stub state in a DecodeState with zeroed widths/exposures/padding. */
	private DecodeState DS(TS ts) {
		return new DecodeState(ts, 0, 0, 0, 0);
	}

	@Test
	public void test_makeFullViterbiStateSeq() {

		Indexer<String> charIndexer = new HashMapIndexer<String>();
		charIndexer.index(new String[] { " ", "-", "a", "b", "c" });
		// Two decoded lines: the first ends in a hyphenation (RMRGN_HPHN_INIT
		// followed by RMRGN_HPHN), the second starts with LMRGN_HPHN states.
		DecodeState[][] decodeStates = new DecodeState[][] {
				new DecodeState[] {
						DS(new TS(1, -1, 0, LMRGN, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(2, -1, 0, LMRGN, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(3, -1, 0, TMPL, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(4, 1, 2, TMPL, new GlyphChar(2, GlyphType.NORMAL_CHAR))),
						DS(new TS(5, 1, 3, TMPL, new GlyphChar(3, GlyphType.NORMAL_CHAR))),
						DS(new TS(6, 1, 4, TMPL, new GlyphChar(4, GlyphType.NORMAL_CHAR))),
						DS(new TS(7, 1, 1, RMRGN_HPHN_INIT, new GlyphChar(1, GlyphType.NORMAL_CHAR))),
						DS(new TS(8, 1, 0, RMRGN_HPHN, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(9, 1, 0, RMRGN_HPHN, new GlyphChar(0, GlyphType.NORMAL_CHAR))) },
				new DecodeState[] {
						DS(new TS(10, 1, 0, LMRGN_HPHN, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(11, 1, 0, LMRGN_HPHN, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(12, 1, 0, TMPL, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(13, 1, 2, TMPL, new GlyphChar(2, GlyphType.NORMAL_CHAR))),
						DS(new TS(14, 1, 3, TMPL, new GlyphChar(3, GlyphType.NORMAL_CHAR))),
						DS(new TS(15, 1, 4, TMPL, new GlyphChar(4, GlyphType.NORMAL_CHAR))),
						DS(new TS(16, 1, 0, RMRGN, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(17, 1, 0, RMRGN, new GlyphChar(0, GlyphType.NORMAL_CHAR))) }
		};
		List<DecodeState> tsSeq = FontTrainer.makeFullViterbiStateSeq(decodeStates, charIndexer);
		// Only the leading prefix of the flattened sequence is pinned here.
		List<Integer> expectedIds = makeList(2, 3, 4, 1);
		for (int i = 0; i < expectedIds.size(); ++i) {
			assertEquals(expectedIds.get(i).intValue(), ((TS) tsSeq.get(i).ts).id);
		}
	}
}
new GlyphChar(3, GlyphType.NORMAL_CHAR))), 80 | DS(new TS(15, 1, 4, TMPL, new GlyphChar(4, GlyphType.NORMAL_CHAR))), 81 | DS(new TS(16, 1, 0, RMRGN, new GlyphChar(0, GlyphType.NORMAL_CHAR))), 82 | DS(new TS(17, 1, 0, RMRGN, new GlyphChar(0, GlyphType.NORMAL_CHAR))) } 83 | }; 84 | List tsSeq = FontTrainer.makeFullViterbiStateSeq(decodeStates, charIndexer); 85 | List expectedIds = makeList(2, 3, 4, 1); 86 | for (int i = 0; i < expectedIds.size(); ++i) { 87 | assertEquals(expectedIds.get(i).intValue(), ((TS)tsSeq.get(i).ts).id); 88 | } 89 | 90 | 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/test/java/edu/berkeley/cs/nlp/ocular/util/ArrayHelperTests.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.util; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import org.junit.Test; 6 | 7 | /** 8 | * @author Dan Garrette (dhgarrette@gmail.com) 9 | */ 10 | public class ArrayHelperTests { 11 | 12 | @Test 13 | public void test_sum_int() { 14 | assertEquals(225, ArrayHelper.sum(new int[] { 50, 0, 150, 25 })); 15 | assertEquals(25, ArrayHelper.sum(new int[] { 25 })); 16 | assertEquals(0, ArrayHelper.sum(new int[] { 0 })); 17 | assertEquals(0, ArrayHelper.sum(new int[] { 0, 0 })); 18 | assertEquals(0, ArrayHelper.sum(new int[0])); 19 | } 20 | 21 | @Test 22 | public void test_avg_int() { 23 | assertEquals(54.8, ArrayHelper.avg(new int[] { 50, 0, 150, 74, 0 }), 1e-9); 24 | assertEquals(67.5, ArrayHelper.avg(new int[] { 50, 150, 70, 0 }), 1e-9); 25 | assertEquals(90, ArrayHelper.avg(new int[] { 50, 150, 70 }), 1e-9); 26 | assertEquals(25.0, ArrayHelper.avg(new int[] { 25 }), 1e-9); 27 | assertEquals(0, ArrayHelper.avg(new int[] { 0 }), 1e-9); 28 | assertEquals(0, ArrayHelper.avg(new int[] { 0, 0 }), 1e-9); 29 | assertEquals(0, ArrayHelper.avg(new int[0]), 1e-9); 30 | } 31 | 32 | @Test 33 | public void test_sum_double() { 34 | assertEquals(2.25, 
ArrayHelper.sum(new double[] { 0.5, 0.0, 1.5, 0.25 }), 1e-9); 35 | assertEquals(0.25, ArrayHelper.sum(new double[] { 0.25 }), 1e-9); 36 | assertEquals(0.0, ArrayHelper.sum(new double[] { 0.0 }), 1e-9); 37 | assertEquals(0.0, ArrayHelper.sum(new double[] { 0.0, 0.0 }), 1e-9); 38 | assertEquals(0.0, ArrayHelper.sum(new double[0]), 1e-9); 39 | } 40 | 41 | @Test 42 | public void test_avg_double() { 43 | assertEquals(0.54, ArrayHelper.avg(new double[] { 0.5, 0.0, 1.5, 0.7, 0.0 }), 1e-9); 44 | assertEquals(0.675, ArrayHelper.avg(new double[] { 0.5, 1.5, 0.7, 0.0 }), 1e-9); 45 | assertEquals(0.9, ArrayHelper.avg(new double[] { 0.5, 1.5, 0.7 }), 1e-9); 46 | assertEquals(0.25, ArrayHelper.avg(new double[] { 0.25 }), 1e-9); 47 | assertEquals(0.0, ArrayHelper.avg(new double[] { 0.0 }), 1e-9); 48 | assertEquals(0.0, ArrayHelper.avg(new double[] { 0.0, 0.0 }), 1e-9); 49 | assertEquals(0.0, ArrayHelper.avg(new double[0]), 1e-9); 50 | } 51 | 52 | @Test 53 | public void test_min_int() { 54 | assertEquals(10, ArrayHelper.min(new int[] { 50, 10, 25, 150, 10, 25 })); 55 | assertEquals(25, ArrayHelper.min(new int[] { 25 })); 56 | assertEquals(20, ArrayHelper.min(new int[] { 20 })); 57 | assertEquals(20, ArrayHelper.min(new int[] { 20, 20 })); 58 | try { 59 | ArrayHelper.min(new int[0]); 60 | fail("exception expected"); 61 | } 62 | catch(RuntimeException e) { 63 | // good 64 | } 65 | } 66 | 67 | @Test 68 | public void test_prepend() { 69 | { 70 | int[] b = ArrayHelper.prepend(0, new int[] { 1, 2, 3 }); 71 | assertEquals(4, b.length); 72 | assertEquals(0, b[0]); 73 | assertEquals(1, b[1]); 74 | assertEquals(2, b[2]); 75 | assertEquals(3, b[3]); 76 | } 77 | { 78 | int[] b = ArrayHelper.prepend(0, new int[] {}); 79 | assertEquals(1, b.length); 80 | assertEquals(0, b[0]); 81 | } 82 | } 83 | 84 | @Test 85 | public void test_append() { 86 | { 87 | Integer[] b = ArrayHelper.append(new Integer[] { 0, 1, 2 }, 3); 88 | assertEquals(4, b.length); 89 | assertEquals((int) 0, (int) b[0]); 90 | 
assertEquals((int) 1, (int) b[1]); 91 | assertEquals((int) 2, (int) b[2]); 92 | assertEquals((int) 3, (int) b[3]); 93 | } 94 | { 95 | Integer[] b = ArrayHelper.append(new Integer[] {}, 0); 96 | assertEquals(1, b.length); 97 | assertEquals((int) 0, (int) b[0]); 98 | } 99 | } 100 | 101 | @Test 102 | public void test_take() { 103 | { 104 | int[] b = ArrayHelper.take(new int[] { 1, 2, 3 }, 2); 105 | assertEquals(2, b.length); 106 | assertEquals(1, b[0]); 107 | assertEquals(2, b[1]); 108 | } 109 | { 110 | int[] b = ArrayHelper.take(new int[] { 1, 2, 3 }, 3); 111 | assertEquals(3, b.length); 112 | assertEquals(1, b[0]); 113 | assertEquals(2, b[1]); 114 | assertEquals(3, b[2]); 115 | } 116 | { 117 | int[] b = ArrayHelper.take(new int[] { 1, 2, 3 }, 0); 118 | assertEquals(0, b.length); 119 | } 120 | { 121 | int[] b = ArrayHelper.take(new int[] { 1, 2, 3 }, 8); 122 | assertEquals(3, b.length); 123 | assertEquals(1, b[0]); 124 | assertEquals(2, b[1]); 125 | assertEquals(3, b[2]); 126 | } 127 | { 128 | int[] b = ArrayHelper.take(new int[] {}, 0); 129 | assertEquals(0, b.length); 130 | } 131 | { 132 | int[] b = ArrayHelper.take(new int[] {}, 2); 133 | assertEquals(0, b.length); 134 | } 135 | } 136 | 137 | @Test 138 | public void test_takeRight() { 139 | { 140 | int[] b = ArrayHelper.takeRight(new int[] { 1, 2, 3 }, 2); 141 | assertEquals(2, b.length); 142 | assertEquals(2, b[0]); 143 | assertEquals(3, b[1]); 144 | } 145 | { 146 | int[] b = ArrayHelper.takeRight(new int[] { 1, 2, 3 }, 3); 147 | assertEquals(3, b.length); 148 | assertEquals(1, b[0]); 149 | assertEquals(2, b[1]); 150 | assertEquals(3, b[2]); 151 | } 152 | { 153 | int[] b = ArrayHelper.takeRight(new int[] { 1, 2, 3 }, 0); 154 | assertEquals(0, b.length); 155 | } 156 | { 157 | int[] b = ArrayHelper.takeRight(new int[] { 1, 2, 3 }, 8); 158 | assertEquals(3, b.length); 159 | assertEquals(1, b[0]); 160 | assertEquals(2, b[1]); 161 | assertEquals(3, b[2]); 162 | } 163 | { 164 | int[] b = ArrayHelper.takeRight(new int[] 
package edu.berkeley.cs.nlp.ocular.util;

import static org.junit.Assert.assertEquals;

import java.util.ArrayList;
import java.util.List;

import org.junit.Test;

/**
 * Unit tests for the path utilities in {@link FileUtil}.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class FileUtilTests {

	@Test
	public void test_lowestCommonPath() {
		{
			// Three paths diverging below /well/this/and.
			List<String> paths = new ArrayList<String>();
			paths.add("/well/this/and/that/");
			paths.add("/well/this/and/the/other.txt");
			paths.add("/well/this/and/thus.txt");
			assertEquals("/well/this/and", FileUtil.lowestCommonPath(paths));
		}
		{
			// A single file path is its own lowest common path.
			List<String> paths = new ArrayList<String>();
			paths.add("/well/this/and/thus.txt");
			assertEquals("/well/this/and/thus.txt", FileUtil.lowestCommonPath(paths));
		}
		{
			// Identical directory paths: the trailing slash is dropped.
			List<String> paths = new ArrayList<String>();
			paths.add("/well/this/and/");
			paths.add("/well/this/and/");
			assertEquals("/well/this/and", FileUtil.lowestCommonPath(paths));
		}
		{
			// A single directory path, trailing slash dropped.
			List<String> paths = new ArrayList<String>();
			paths.add("/well/this/and/");
			assertEquals("/well/this/and", FileUtil.lowestCommonPath(paths));
		}
	}

	@Test
	public void test_pathRelativeTo() {
		String base = "/well/this/and/";
		String plainFile = "/well/this/and/that.txt";
		String extensionless = "/well/this/and/that";
		String nestedFile = "/well/this/and/that/or.txt";
		String trailingSlashDir = "/well/this/and/that/else/";

		assertEquals("that.txt", FileUtil.pathRelativeTo(plainFile, base));
		assertEquals("that", FileUtil.pathRelativeTo(extensionless, base));
		assertEquals("that/or.txt", FileUtil.pathRelativeTo(nestedFile, base));
		// Trailing slash on the relativized directory is dropped.
		assertEquals("that/else", FileUtil.pathRelativeTo(trailingSlashDir, base));
	}

}
package edu.berkeley.cs.nlp.ocular.util;

import static edu.berkeley.cs.nlp.ocular.util.StringHelper.drop;
import static edu.berkeley.cs.nlp.ocular.util.StringHelper.join;
import static edu.berkeley.cs.nlp.ocular.util.StringHelper.last;
import static edu.berkeley.cs.nlp.ocular.util.StringHelper.longestCommonPrefix;
import static edu.berkeley.cs.nlp.ocular.util.StringHelper.take;
import static edu.berkeley.cs.nlp.ocular.util.StringHelper.toUnicode;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.util.Arrays;

import org.junit.Test;

/**
 * Unit tests for the string utilities in {@link StringHelper}.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class StringHelperTests {

	@Test
	public void testToUnicode_string() {
		// A one-character string is rendered as its escaped code point.
		assertEquals("\\u0061", toUnicode("a"));
	}

	@Test
	public void testToUnicode_char() {
		assertEquals("\\u0061", toUnicode('a'));
	}

	@Test
	public void testTake() {
		// Empty source: always empty, whatever the count.
		assertEquals("", take("", 0));
		assertEquals("", take("", -2));
		assertEquals("", take("", 2));
		// Non-positive counts yield the empty string.
		assertEquals("", take("abc", 0));
		assertEquals("", take("abc", -2));
		// Counts at or beyond the length are clamped.
		assertEquals("a", take("a", 1));
		assertEquals("a", take("a", 2));
		assertEquals("ab", take("abc", 2));
	}

	@Test
	public void testDrop() {
		// Empty source: always empty, whatever the count.
		assertEquals("", drop("", 0));
		assertEquals("", drop("", -2));
		assertEquals("", drop("", 2));
		// Non-positive counts leave the string unchanged.
		assertEquals("abc", drop("abc", 0));
		assertEquals("abc", drop("abc", -2));
		// Counts at or beyond the length drop everything.
		assertEquals("", drop("a", 1));
		assertEquals("", drop("a", 2));
		assertEquals("c", drop("abc", 2));
		assertEquals("bc", drop("abc", 1));
	}

	@Test
	public void testLast() {
		assertEquals("a", last("a"));
		assertEquals("c", last("abc"));
		// An empty string has no last character.
		try {
			assertEquals("a", last(""));
			fail();
		} catch (IllegalArgumentException e) {
			// expected
		}
	}

	@Test
	public void testJoin_varargs() {
		// Empty elements vanish in the joined result.
		assertEquals("abc", join("a", "", "b", "c"));
	}

	@Test
	public void testJoin_list() {
		assertEquals("abc", join(Arrays.asList("a", "", "b", "c")));
	}

	@Test
	public void testJoin_list_sep() {
		// The separator still appears around the empty element.
		assertEquals("a;;b;c", join(Arrays.asList("a", "", "b", "c"), ";"));
	}

	@Test
	public void testEquals() {
		// Null-safe equality: null equals only null.
		assertTrue(StringHelper.equals("", ""));
		assertFalse(StringHelper.equals("a", ""));
		assertFalse(StringHelper.equals("", "a"));
		assertFalse(StringHelper.equals(null, ""));
		assertFalse(StringHelper.equals("", null));
		assertFalse(StringHelper.equals(null, "a"));
		assertFalse(StringHelper.equals("a", null));
		assertTrue(StringHelper.equals(null, null));
	}

	@Test
	public void testLongestCommonPrefix() {
		// Returns the length of the shared prefix, expressed via .length()
		// on the expected prefix for readability.
		assertEquals("".length(), longestCommonPrefix("", ""));
		assertEquals("".length(), longestCommonPrefix("abc", ""));
		assertEquals("".length(), longestCommonPrefix("", "abc"));
		assertEquals("ab".length(), longestCommonPrefix("abc", "ab"));
		assertEquals("ab".length(), longestCommonPrefix("ab", "abc"));
		assertEquals("abc".length(), longestCommonPrefix("abc", "abc"));
	}

}
-------------------------------------------------------------------------------- 1 | *.lmser 2 | *.fontser 3 | *.gsmser 4 | *train_output/ 5 | *test_output/ 6 | extracted_lines/ 7 | 8 | -------------------------------------------------------------------------------- /src/test/resources/doc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/src/test/resources/doc.jpg -------------------------------------------------------------------------------- /src/test/resources/doc.txt: -------------------------------------------------------------------------------- 1 | aabc abc bc, bc abc aa bc aa aa bc aa aabc 2 | aabc abc bc bc abc, aa bc aa aa bc aa aabc 3 | aabc abc, bc bc abc aa bc aa, aa bc aa aabc 4 | -------------------------------------------------------------------------------- /src/test/resources/doc_normalized.txt: -------------------------------------------------------------------------------- 1 | aabc abc bc bc abc aa bc aa aa bc aa aabc 2 | aabc abc bc bc abc aa bc aa aa bc aa aabc 3 | aabc abc bc bc abc aa bc aa aa bc aa aabc 4 | -------------------------------------------------------------------------------- /src/test/resources/extracted_lines/doc-line_extract_jpg/line00.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/src/test/resources/extracted_lines/doc-line_extract_jpg/line00.jpg -------------------------------------------------------------------------------- /src/test/resources/extracted_lines/doc-line_extract_jpg/line01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/src/test/resources/extracted_lines/doc-line_extract_jpg/line01.jpg 
-------------------------------------------------------------------------------- /src/test/resources/extracted_lines/doc-line_extract_jpg/line02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/src/test/resources/extracted_lines/doc-line_extract_jpg/line02.jpg -------------------------------------------------------------------------------- /src/test/resources/initialize_font.sh: -------------------------------------------------------------------------------- 1 | target/start edu.berkeley.cs.nlp.ocular.main.InitializeFont \ 2 | -inputLmPath src/test/resources/doc.lmser \ 3 | -outputFontPath src/test/resources/doc-init.fontser 4 | 5 | target/start edu.berkeley.cs.nlp.ocular.main.InitializeFont \ 6 | -inputLmPath src/test/resources/multiling.lmser \ 7 | -outputFontPath src/test/resources/multiling-init.fontser 8 | -------------------------------------------------------------------------------- /src/test/resources/initialize_lm.sh: -------------------------------------------------------------------------------- 1 | target/start edu.berkeley.cs.nlp.ocular.main.InitializeLanguageModel \ 2 | -inputTextPath src/test/resources/doc.txt \ 3 | -outputLmPath src/test/resources/doc.lmser \ 4 | -minCharCount 0 5 | 6 | target/start edu.berkeley.cs.nlp.ocular.main.InitializeLanguageModel \ 7 | -inputTextPath "Lang1->src/test/resources/doc.txt,Lang2->src/test/resources/doc.txt" \ 8 | -outputLmPath src/test/resources/multiling.lmser \ 9 | -charNgramLength "Lang1->6,Lang2->4" \ 10 | -minCharCount 0 11 | 12 | -------------------------------------------------------------------------------- /src/test/resources/train_font.sh: -------------------------------------------------------------------------------- 1 | target/start edu.berkeley.cs.nlp.ocular.main.TrainFont \ 2 | -inputFontPath src/test/resources/doc-init.fontser \ 3 | -inputLmPath 
src/test/resources/doc.lmser \ 4 | -inputDocPath src/test/resources/doc.jpg \ 5 | -extractedLinesPath src/test/resources/extracted_lines \ 6 | -outputFontPath src/test/resources/doc-trained.fontser \ 7 | -outputPath src/test/resources/train_output \ 8 | -numEmIters 1 9 | # -allowGlyphSubstitution true \ 10 | # -updateGsm true \ 11 | # -outputGsmPath src/test/resources/doc.gsmser \ 12 | 13 | target/start edu.berkeley.cs.nlp.ocular.main.TrainFont \ 14 | -inputFontPath src/test/resources/multiling-init.fontser \ 15 | -inputLmPath src/test/resources/multiling.lmser \ 16 | -inputDocPath src/test/resources/doc.jpg \ 17 | -extractedLinesPath src/test/resources/extracted_lines \ 18 | -outputFontPath src/test/resources/multiling-trained.fontser \ 19 | -outputPath src/test/resources/multiling_train_output \ 20 | -numEmIters 1 \ 21 | -allowGlyphSubstitution true \ 22 | -updateGsm true \ 23 | -outputGsmPath src/test/resources/multiling.gsmser \ 24 | --------------------------------------------------------------------------------