├── .gitignore ├── LICENSE.txt ├── README.md ├── build.sbt ├── check_for_author_tags.py ├── conf ├── base.conf ├── fast.conf ├── first_folio_init_font.conf ├── first_folio_lm_train.conf └── first_folio_main.conf ├── lib ├── PDFRenderer-0.9.1.jar ├── junit-4.12.jar └── murphy.jar ├── make-readme-options.py ├── make_jar.sh ├── make_run_script.sh ├── options_lists.txt ├── project └── plugins.sbt ├── publish_jar_with_dependencies.sh ├── replace ├── latin.txt ├── nahuatl.txt └── spanish.txt ├── sample_images ├── advertencias │ ├── pl_blac_047_00039-800.jpg │ ├── pl_blac_047_00039-800.txt │ ├── pl_blac_047_00040-800.jpg │ ├── pl_blac_047_00040-800.txt │ ├── pl_blac_047_00041-800.jpg │ └── pl_blac_047_00041-800.txt └── english │ ├── 184101040058.jpg │ ├── 184101040060.jpg │ └── 184101040062.jpg ├── sbt-launch-0.13.8.jar └── src ├── main └── java │ └── edu │ └── berkeley │ └── cs │ └── nlp │ └── ocular │ ├── data │ ├── Document.java │ ├── FirstFolioRawImageLoader.java │ ├── LazyRawImageDocument.java │ ├── LazyRawImageLoader.java │ ├── LazyRawPdfImageDocument.java │ ├── LazyRawSingleImageDocument.java │ ├── PdfImageReader.java │ ├── RawImageLoader.java │ ├── TextAndLineImagesLoader.java │ └── textreader │ │ ├── BasicTextReader.java │ │ ├── BlacklistCharacterSetTextReader.java │ │ ├── CharIndexer.java │ │ ├── Charset.java │ │ ├── ConvertLongSTextReader.java │ │ ├── FlipUVTextReader.java │ │ ├── RemoveAllDiacriticsTextReader.java │ │ ├── ReplaceSomeTextReader.java │ │ ├── TextReader.java │ │ └── WhitelistCharacterSetTextReader.java │ ├── eval │ ├── AlignedFormPair.java │ ├── BasicMultiDocumentTranscriber.java │ ├── BasicSingleDocumentEvaluatorAndOutputPrinter.java │ ├── ErrorSampler.java │ ├── EvalPrinter.java │ ├── Evaluator.java │ ├── Form.java │ ├── Glyph.java │ ├── LmPerplexity.java │ ├── MarkovEditDistanceComputer.java │ ├── ModelTranscriptions.java │ ├── MultiDocumentTranscriber.java │ ├── Operation.java │ └── SingleDocumentEvaluatorAndOutputPrinter.java │ ├── font │ 
└── Font.java │ ├── gsm │ ├── BasicGlyphSubstitutionModel.java │ ├── GlyphChar.java │ ├── GlyphSubstitutionModel.java │ └── NoSubGlyphSubstitutionModel.java │ ├── image │ ├── FontRenderer.java │ ├── ImageUtils.java │ └── Visualizer.java │ ├── lm │ ├── BasicCodeSwitchLanguageModel.java │ ├── CodeSwitchLanguageModel.java │ ├── CorpusCounter.java │ ├── CountDb.java │ ├── CountDbBig.java │ ├── CountDbSimple.java │ ├── CountType.java │ ├── InterpolatingSingleLanguageModel.java │ ├── LanguageModel.java │ ├── LongArrWrapper.java │ ├── LongNgram.java │ ├── Ngram.java │ ├── NgramCounts.java │ ├── NgramLanguageModel.java │ ├── NgramWrapper.java │ ├── SingleLanguageModel.java │ └── UniformLanguageModel.java │ ├── main │ ├── ExtractLinesOnly.java │ ├── FonttrainTranscribeShared.java │ ├── InitializeFont.java │ ├── InitializeGlyphSubstitutionModel.java │ ├── InitializeLanguageModel.java │ ├── LineExtractionOptions.java │ ├── NoDocumentsFoundException.java │ ├── NoDocumentsToProcessException.java │ ├── OcularRunnable.java │ ├── TrainFont.java │ ├── Transcribe.java │ └── gui │ │ ├── GridLayout2.java │ │ ├── InitializeFontGUI.java │ │ ├── TrainLanguageModelGUI.java │ │ └── TranscribeOrTrainFontGUI.java │ ├── model │ ├── CharacterTemplate.java │ ├── DecodeState.java │ ├── DecoderEM.java │ ├── TransitionStateType.java │ ├── em │ │ ├── BeamingSemiMarkovDP.java │ │ ├── CUDAInnerLoop.java │ │ ├── DefaultInnerLoop.java │ │ ├── DenseBigramTransitionModel.java │ │ ├── EmissionCacheInnerLoop.java │ │ ├── EmptyBeamException.java │ │ └── JOCLInnerLoop.java │ ├── emission │ │ ├── CachingEmissionModel.java │ │ ├── CachingEmissionModelExplicitOffset.java │ │ └── EmissionModel.java │ └── transition │ │ ├── CharacterNgramTransitionModel.java │ │ ├── CharacterNgramTransitionModelMarkovOffset.java │ │ ├── CodeSwitchTransitionModel.java │ │ └── SparseTransitionModel.java │ ├── output │ ├── AltoOutputWriter.java │ └── HtmlOutputWriter.java │ ├── preprocessing │ ├── Binarizer.java │ ├── Cropper.java │ 
├── LineExtractor.java │ ├── ManualCropper.java │ ├── ManualStackCropperPrep.java │ ├── Straightener.java │ ├── Test.java │ ├── VerticalModel.java │ └── VerticalProfile.java │ ├── train │ ├── FontTrainer.java │ ├── ModelPathMaker.java │ └── TrainingRestarter.java │ └── util │ ├── ArrayHelper.java │ ├── CollectionHelper.java │ ├── FileHelper.java │ ├── FileUtil.java │ ├── StringHelper.java │ ├── Tuple2.java │ └── Tuple3.java └── test ├── java └── edu │ └── berkeley │ └── cs │ └── nlp │ └── ocular │ ├── data │ └── textreader │ │ ├── BasicTextReaderTests.java │ │ ├── BlacklistCharacterSetTextReaderTests.java │ │ ├── CharIndexerTests.java │ │ ├── CharsetTests.java │ │ ├── ConvertLongSTextReaderTests.java │ │ ├── RemoveAllDiacriticsTextReaderTests.java │ │ ├── ReplaceSomeTextReaderTests.java │ │ └── WhitelistCharacterSetTextReaderTests.java │ ├── eval │ └── LmPerplexityTests.java │ ├── gsm │ └── BasicGlyphSubstitutionModelTests.java │ ├── lm │ └── LanguageTransitionPriorsTests.java │ ├── model │ ├── FontTrainEMTests.java │ └── PostViterbiTests.java │ └── util │ ├── ArrayHelperTests.java │ ├── CollectionHelperTests.java │ ├── FileUtilTests.java │ └── StringHelperTests.java └── resources ├── .gitignore ├── doc.jpg ├── doc.txt ├── doc_normalized.txt ├── extracted_lines └── doc-line_extract_jpg │ ├── line00.jpg │ ├── line01.jpg │ └── line02.jpg ├── initialize_font.sh ├── initialize_lm.sh └── train_font.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | /font 3 | /lm 4 | /gsm 5 | /data 6 | /extracted_lines 7 | /train_output 8 | /transcribe_output 9 | /cs_train_lineex/ 10 | /cs_train_output/ 11 | /cs_train_output.txt 12 | /cs_transcribe_lineex/ 13 | /cs_transcribe_output/ 14 | /cs_transcribe_output.txt 15 | /sample_images/ 16 | /texts 17 | /replace 18 | 19 | *.class 20 | *.log 21 | 22 | /ocular-*.jar 23 | /lib 24 | 25 | # sbt specific 26 | dist/* 27 | target/ 28 | lib_managed/ 29 | src_managed/ 30 | project/boot/ 
31 | project/plugins/project/ 32 | 33 | # Scala-IDE specific 34 | .scala_dependencies 35 | .project 36 | .classpath 37 | .cache 38 | .cache-main 39 | .cache-tests 40 | .settings/ 41 | .worksheet 42 | .pydevproject 43 | 44 | /bin/ 45 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import com.typesafe.sbt.SbtStartScript 2 | 3 | import com.github.retronym.SbtOneJar._ 4 | 5 | name := "ocular" 6 | 7 | organization := "edu.berkeley.cs.nlp" 8 | 9 | version := "0.3-SNAPSHOT" 10 | 11 | scalaVersion := "2.12.1" 12 | 13 | javacOptions ++= Seq("-source", "1.6", "-target", "1.6") 14 | 15 | Seq(SbtStartScript.startScriptForClassesSettings: _*) 16 | 17 | SbtStartScript.stage in Compile := Unit 18 | 19 | oneJarSettings 20 | 21 | mainClass in oneJar := None 22 | 23 | 24 | libraryDependencies ++= Seq( 25 | // "org.apache.commons" % "commons-lang3" % "3.4", //to escape HTML special characters 26 | "org.swinglabs" % "pdf-renderer" % "1.0.5", 27 | "junit" % "junit" % "4.12" % "test", 28 | "com.novocode" % "junit-interface" % "0.10" % "test") 29 | -------------------------------------------------------------------------------- /check_for_author_tags.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def has_author(f): 4 | for line in f: 5 | line = line.split() 6 | if '@author' in line: 7 | return True 8 | if 'class' in line or 'interface' in line: 9 | return False 10 | assert False, 'No class found...' 
11 | 12 | for (folder,dirs,files) in os.walk("."): 13 | for fn in files: 14 | fn = '%s/%s' % (folder, fn) 15 | if fn.endswith('.java'): 16 | with open(fn) as f: 17 | if not has_author(f): 18 | print fn 19 | 20 | -------------------------------------------------------------------------------- /conf/base.conf: -------------------------------------------------------------------------------- 1 | inputPath ./test 2 | outputPath ./output 3 | outputFontPath ./font/learned.fontser 4 | lmPath ./lm/nyt.lmser 5 | initFontPath ./font/init.fontser 6 | 7 | binarizeThreshold 0.12 8 | 9 | paddingMinWidth 1 10 | paddingMaxWidth 5 11 | 12 | markovVerticalOffset true 13 | beamSize 20 14 | learnFont true 15 | numEMIters 4 16 | 17 | emissionEngine DEFAULT 18 | cudaDeviceID 0 19 | numMstepThreads 8 20 | numEmissionCacheThreads 8 21 | numDecodeThreads 4 22 | decodeBatchSize 16 23 | -------------------------------------------------------------------------------- /conf/fast.conf: -------------------------------------------------------------------------------- 1 | inputPath ./test 2 | outputPath ./output 3 | outputFontPath ./font/learned.fontser 4 | lmPath ./lm/nyt.lmser 5 | initFontPath ./font/init.fontser 6 | 7 | binarizeThreshold 0.12 8 | 9 | paddingMinWidth 1 10 | paddingMaxWidth 5 11 | 12 | markovVerticalOffset false 13 | beamSize 10 14 | learnFont true 15 | numEMIters 4 16 | 17 | emissionEngine DEFAULT 18 | cudaDeviceID 0 19 | numMstepThreads 8 20 | numEmissionCacheThreads 8 21 | numDecodeThreads 4 22 | decodeBatchSize 16 23 | -------------------------------------------------------------------------------- /conf/first_folio_init_font.conf: -------------------------------------------------------------------------------- 1 | inputLmPath /Users/tberg/Desktop/ob-longs-uv-4gm-4pow.lmser 2 | outputFontPath /Users/tberg/Desktop/init.fontser 3 | spaceMinWidthFraction 0.0 -------------------------------------------------------------------------------- /conf/first_folio_lm_train.conf: 
-------------------------------------------------------------------------------- 1 | lmPath /Users/tberg/Desktop/ob-longs-uv-4gm-4pow.lmser 2 | textPath /Users/tberg/Desktop/big-lm.txt 3 | insertLongS true 4 | allowUVFlip true 5 | charN 4 6 | power 4.0 -------------------------------------------------------------------------------- /conf/first_folio_main.conf: -------------------------------------------------------------------------------- 1 | inputPath /Users/tberg/Desktop/F-tem/seg_extraction 2 | usePrebuiltLM true 3 | lmPath /Users/tberg/Desktop/ob-longs-uv-4gm-4pow.lmser 4 | lmTextPath /Users/tberg/git/first_folio_attr/data/txt/F-tem 5 | lmOrder 4 6 | lmPower 4.0 7 | initFontPath /Users/tberg/Desktop/init.fontser 8 | learnFont true 9 | outputPath /Users/tberg/Desktop/F-tem-output-ob-uv-4pow-nospace-git 10 | outputFontPath /Users/tberg/Desktop/F-tem-output-ob-uv-4pow-nospace-git/learned.fontser 11 | emissionEngine CUDA 12 | cudaDeviceID 1 13 | -------------------------------------------------------------------------------- /lib/PDFRenderer-0.9.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/lib/PDFRenderer-0.9.1.jar -------------------------------------------------------------------------------- /lib/junit-4.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/lib/junit-4.12.jar -------------------------------------------------------------------------------- /lib/murphy.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/lib/murphy.jar -------------------------------------------------------------------------------- /make_jar.sh: 
-------------------------------------------------------------------------------- 1 | cp lib/JCuda-All-0.6.0-bin-linux-x86_64/* lib/ 2 | cp lib/JCuda-All-0.6.0-bin-apple-x86_64/* lib/ 3 | 4 | 5 | java -Dfile.encoding=UTF8 -Xmx1536M -Xss1M -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=256m -jar sbt-launch-*.jar "one-jar" 6 | JARPATH=`expr target/scala-*/ocular_*-*-one-jar.jar` 7 | FILENAME=$(basename $JARPATH) 8 | VERSION=${FILENAME:12} 9 | VERSION=${VERSION::${#VERSION}-12} 10 | JARNAME="ocular-${VERSION}-with_dependencies.jar" 11 | TEMPDIR=${FILENAME::${#FILENAME}-4} 12 | mkdir $TEMPDIR 13 | mv $JARPATH $TEMPDIR 14 | cd $TEMPDIR 15 | jar -xf $FILENAME 16 | rm $FILENAME 17 | cp ../lib/*.jar lib/ 18 | cp ../lib/JCuda-*/* lib/ 19 | jar cmf META-INF/MANIFEST.MF ../$JARNAME * 20 | cd .. 21 | rm -r $TEMPDIR 22 | 23 | -------------------------------------------------------------------------------- /make_run_script.sh: -------------------------------------------------------------------------------- 1 | cp lib/JCuda-All-0.6.0-bin-linux-x86_64/* lib/ 2 | cp lib/JCuda-All-0.6.0-bin-apple-x86_64/* lib/ 3 | 4 | 5 | java -Dfile.encoding=UTF8 -Xmx1536M -Xss1M -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=256m -jar sbt-launch-*.jar "start-script" 6 | -------------------------------------------------------------------------------- /options_lists.txt: -------------------------------------------------------------------------------- 1 | ### InitializeLanguageModel 2 | ##### Required 3 | inputTextPath 4 | outputLmPath 5 | ##### Additional Options 6 | minCharCount 7 | insertLongS 8 | charNgramLength 9 | alternateSpellingReplacementPaths 10 | ##### Rarely Used Options 11 | removeDiacritics 12 | pKeepSameLanguage 13 | languagePriors 14 | lmPower 15 | explicitCharacterSet 16 | lmCharCount 17 | 18 | 19 | 20 | ### InitializeFont 21 | ##### Required 22 | inputLmPath 23 | outputFontPath 24 | ##### Additional Options 25 | allowedFontsPath 26 | ##### Rarely Used Options 27 | numFontInitThreads 
28 | spaceMaxWidthFraction 29 | spaceMinWidthFraction 30 | templateMaxWidthFraction 31 | templateMinWidthFraction 32 | 33 | 34 | 35 | ### TrainFont 36 | ##### Main Options 37 | inputDocPath 38 | inputDocListPath 39 | inputFontPath 40 | inputLmPath 41 | numDocs 42 | numDocsToSkip 43 | numEMIters 44 | continueFromLastCompleteIteration 45 | outputPath 46 | outputFormats 47 | outputFontPath 48 | ##### Additional Options 49 | extractedLinesPath 50 | updateDocBatchSize 51 | These options affect the speed of font training 52 | emissionEngine 53 | beamSize 54 | markovVerticalOffset 55 | ##### Glyph Substitution Model Options 56 | Glyph substitution is the feature that allows Ocular to use a probabilistic mapping from modern orthography (as used in the language model training text) to the orthography seen in the documents. If the glyph substitution feature is used, Ocular will jointly produce dual transcriptions: one that is an exact transcription of the document, and one that is a normalized version of the text. 
57 | allowGlyphSubstitution 58 | inputGsmPath 59 | updateGsm 60 | outputGsmPath 61 | ##### Language Model Training Options 62 | updateLM 63 | outputLmPath 64 | ##### Line Extraction Options 65 | binarizeThreshold 66 | crop 67 | ##### Evaluate During Training 68 | evalInputDocPath 69 | evalNumDocs 70 | evalExtractedLinesPath 71 | evalFreq 72 | evalBatches 73 | ##### Rarely Used Options 74 | allowLanguageSwitchOnPunct 75 | cudaDeviceID 76 | decodeBatchSize 77 | gsmElideAnything 78 | gsmElisionSmoothingCountMultiplier 79 | gsmNoCharSubPrior 80 | gsmPower 81 | gsmSmoothingCount 82 | paddingMaxWidth 83 | paddingMinWidth 84 | uniformLineHeight 85 | numDecodeThreads 86 | numEmissionCacheThreads 87 | numMstepThreads 88 | 89 | 90 | 91 | 92 | 93 | ### Transcribe 94 | ##### Main Options 95 | inputDocPath 96 | inputDocListPath 97 | inputFontPath 98 | inputLmPath 99 | numDocs 100 | numDocsToSkip 101 | skipAlreadyTranscribedDocs 102 | outputPath 103 | outputFormats 104 | ##### Additional Options 105 | extractedLinesPath 106 | failIfAllDocsAlreadyTranscribed 107 | These options affect the speed of transcription 108 | emissionEngine 109 | beamSize 110 | markovVerticalOffset 111 | ##### Glyph Substitution Model Options 112 | Glyph substitution is the feature that allows Ocular to use a probabilistic mapping from modern orthography (as used in the language model training text) to the orthography seen in the documents. If the glyph substitution feature is used, Ocular will jointly produce dual transcriptions: one that is an exact transcription of the document, and one that is a normalized version of the text. 
113 | allowGlyphSubstitution 114 | inputGsmPath 115 | ##### Model Updating Options 116 | updateDocBatchSize 117 | For updating the font model 118 | updateFont 119 | outputFontPath 120 | For updating the glyph substitution model 121 | updateGsm 122 | outputGsmPath 123 | For updating the language model 124 | updateLM 125 | outputLmPath 126 | ##### Line Extraction Options 127 | binarizeThreshold 128 | crop 129 | ##### Evaluate During Training 130 | evalInputDocPath 131 | evalNumDocs 132 | evalBatches 133 | evalExtractedLinesPath 134 | ##### Rarely Used Options 135 | allowLanguageSwitchOnPunct 136 | cudaDeviceID 137 | decodeBatchSize 138 | gsmElideAnything 139 | gsmElisionSmoothingCountMultiplier 140 | gsmNoCharSubPrior 141 | gsmPower 142 | gsmSmoothingCount 143 | paddingMaxWidth 144 | paddingMinWidth 145 | uniformLineHeight 146 | numDecodeThreads 147 | numEmissionCacheThreads 148 | numMstepThreads 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.typesafe.sbt" % "sbt-start-script" % "0.10.0") 2 | 3 | addSbtPlugin("org.scala-sbt.plugins" % "sbt-onejar" % "0.8") 4 | 5 | -------------------------------------------------------------------------------- /publish_jar_with_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | ./make_jar.sh 6 | scp ocular-0.3-SNAPSHOT-with_dependencies.jar k:public_html/maven-repository/snapshots/edu/berkeley/cs/nlp/ocular/0.3-SNAPSHOT/ 7 | 8 | -------------------------------------------------------------------------------- /replace/latin.txt: -------------------------------------------------------------------------------- 1 | an \~a 4 2 | en \~e 4 3 | in \~i 4 4 | on \~o 4 5 | un \~u 4 6 | u v 4 7 | v u 5 8 | ae æ 2 9 | 
-------------------------------------------------------------------------------- /replace/nahuatl.txt: -------------------------------------------------------------------------------- 1 | \'q \~q 1 2 | u v 4 3 | v u 5 4 | -------------------------------------------------------------------------------- /replace/spanish.txt: -------------------------------------------------------------------------------- 1 | \`a a 1 2 | \`e e 1 3 | \`i i 1 4 | \`o o 1 5 | \`u u 1 6 | que \~q 4 7 | per \~p 4 8 | ci zi 4 9 | ce ze 4 10 | x j 4 11 | j x 5 12 | an \~a 4 13 | en \~e 4 14 | in \~i 4 15 | on \~o 4 16 | un \~u 4 17 | h 5 18 | be ve 5 19 | u v 4 20 | v u 5 21 | \'a a 4 22 | \'e e 4 23 | \'i i 4 24 | \'o o 4 25 | \'u u 4 26 | ae æ 2 27 | oracion o\~ron 5 28 | -------------------------------------------------------------------------------- /sample_images/advertencias/pl_blac_047_00039-800.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/sample_images/advertencias/pl_blac_047_00039-800.jpg -------------------------------------------------------------------------------- /sample_images/advertencias/pl_blac_047_00039-800.txt: -------------------------------------------------------------------------------- 1 | los confeſſores 9 2 | 3 | quod quando quis fcit in confeſsione pecca- 4 | tum alicuius, cum quo pænitens dicit ſe pec 5 | caſſe, quando ille venerit ad confitendum po 6 | teſt eum de tali peccato interrogare, & inge 7 | nere, & inſpecie: ſi tale eſt peccatum de quo 8 | ſolent confeſſores interrogare pęnitentes. Et 9 | hoc dum modo talis non poſſet habere ſuſpì 10 | tionem, quod is, cum quo peccanit, fuerit de 11 | illo peccato confeſſus. 12 | ¶Algunas viejas y viejos ſe vienem a con- 6 13 | feſſar y a reconciliar que apenas puede el c\~o 14 | feſſor juzgar, ſi es peccado venial lo que di- 15 | zen. 
Sino los abſuelue van deſconſoladas, 16 | y ſi las abſuelue, queda con ſcrupulo de auer 17 | abſuelto ſin materia ſufficiente. Para eſto ab 18 | ſueluale deſta manera. Si vere peccato habes 19 | & confeſſus es. Ego te abſoluo, ſi n\~o habes 20 | non. Aſsi lo enſe\~na el Maeſtro fray Bartho- 21 | lome de Medina en ſu ſumma cap. 12. Vega 22 | lib. I. caſo. 353. 23 | ¶ En algunas partes ſe hazen ya tan perezo- 7 24 | ſos los naturales para venir a confeſſarſe la- 25 | Quareſma, que ſino ſe tiene gran cuenta en 26 | preuenirlos deſde el Domingo antes de la ſe 27 | mana que vengan por ſus varrios a confeſſar 28 | ſe, no vienen. Y ſi los miniſtros a premian a 29 | C 1 los 30 | 31 | -------------------------------------------------------------------------------- /sample_images/advertencias/pl_blac_047_00040-800.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/sample_images/advertencias/pl_blac_047_00040-800.jpg -------------------------------------------------------------------------------- /sample_images/advertencias/pl_blac_047_00040-800.txt: -------------------------------------------------------------------------------- 1 | 2 | Advertencias para 3 | a los mandones a que los traygan, ellos he- 4 | chan mano, de los primeros que topan por la 5 | calle, o tianguez, y ſi les mandan aguardar 6 | para que pienſen bien ſus peccados, y ſi quie 7 | ra hagan algunos actos de contriction, ſucce- 8 | de que quando acuerda el confeſſor ya ſe han 9 | ydo y nunca mas bueluen. 
Pues confeſſarlos 10 | ſin preceder deuida penitencia y dìlig\~ete exa 11 | men de ſu conſciencia, ya ſe vee lo que diz\~e 12 | los Doctores que la confeſsion del que no hi 13 | zo la deuida diligencia para examinar bien 14 | ſu conſciencia, por lo qual ſe le oluido alg\~u 15 | peccado, o peccados mortales, es invalido y 16 | neceſſario repet\~eda ſino ſino es ìn articulo mor 17 | tis, que tunc excuſatur pœ ſi conſitea- 18 | tur fine pręuia examìnatìone. Para eſto diſ- 19 | pongale el confeſſor lo mejor que pudiere y 20 | ſupiere, por que ſuppletur deffectus examinis 21 | per interrogationem prudentis confeſſarij: 22 | præſertim vrg\~ete cauſa, & qu\~ado ruſtici eti\~a 23 | moniti neſciunt præmeditari peccata, como 24 | dize Nauarro y otros que alega Henrico H\~e 25 | riquez tom. I. lib. 2. de pęnit\~ecia cap. 5. §. I. 26 | ¶ Muchas vezes va vn ſacerdote por vn ca- 8 27 | mino, y llamanle a confeſſar a vn indio que 28 | eſta malo, y no ſabe el ſacerdote mucha len- 29 | gua 30 | 31 | -------------------------------------------------------------------------------- /sample_images/advertencias/pl_blac_047_00041-800.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/sample_images/advertencias/pl_blac_047_00041-800.jpg -------------------------------------------------------------------------------- /sample_images/advertencias/pl_blac_047_00041-800.txt: -------------------------------------------------------------------------------- 1 | los confeſſores 10 2 | 3 | gua, que hara Confieſſele no hauiendo muy 4 | cerca otro ſacerdote que lo pueda confeſſar, 5 | vieno ſe\~nales de contricion en el : que por 6 | pocos peccados que le entienda, aunque de- 7 | xe de entender otros muchos, baſta para po- 8 | derle abſoluer, y aquella alma queda reme- 9 | diada. 
Por que ſi eſtaua attrita, con el ſacra- 10 | mento ſe haze contrita, y por el conſiguiente 11 | digna de vida eterna. Y concederle ha la in- 12 | dulgencia de la Bulla [teniendola] para \~q no 13 | ſe detenga en el purgatorio. Y eſto aunque 14 | no eſte muy enfermo, pues la experi\~ecia nos 15 | enſe\~na quanto los ſuele apreſurar la enferme 16 | dad y lleuarſe los en no nada. Pues fray Lu 17 | ys Lopez y fray Manuel Rodriguez, tom. I. 18 | cap. 61. conl. 3. num. 3 dizen que el confeſ- 19 | ſor Caſtellano que no ſabe la lenga Fr\~ace- 20 | la ſi no alguna coſa della, puede confeſſar al 21 | Frances \~q en ſu lengua ſe confieſſa con el, a\~u 22 | \~q ſea fuera del articulo de la muerte. Y no ſe 23 | qual es el ſacerdote \~q puede c\~ofeſſar en ſemejan 24 | te neceſsidad, particularmente los religioſos 25 | a quien ſiruen tambi\~e eſtos pobres naturales. 26 | ¶ Com\~umente diz\~e los doctores \~q nadie eſ 9 27 | ta obligado a c\~ofeſſarſe por interprete eſt\~ado 28 | C 2 con 29 | 30 | -------------------------------------------------------------------------------- /sample_images/english/184101040058.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/sample_images/english/184101040058.jpg -------------------------------------------------------------------------------- /sample_images/english/184101040060.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/sample_images/english/184101040060.jpg -------------------------------------------------------------------------------- /sample_images/english/184101040062.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/sample_images/english/184101040062.jpg 
-------------------------------------------------------------------------------- /sbt-launch-0.13.8.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/sbt-launch-0.13.8.jar -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/data/Document.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.data; 2 | 3 | import java.util.List; 4 | 5 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils.PixelType; 6 | 7 | /** 8 | * @author Dan Garrette (dhgarrette@gmail.com) 9 | */ 10 | public interface Document { 11 | public String baseName(); 12 | public PixelType[][][] loadLineImages(); 13 | public String[][] loadDiplomaticTextLines(); 14 | public String[][] loadNormalizedTextLines(); 15 | public List loadNormalizedText(); 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/data/FirstFolioRawImageLoader.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.data; 2 | 3 | import java.io.File; 4 | import java.io.FilenameFilter; 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils; 9 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils.ConnectedComponentProcessor; 10 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils.PixelType; 11 | import edu.berkeley.cs.nlp.ocular.preprocessing.Binarizer; 12 | import edu.berkeley.cs.nlp.ocular.preprocessing.LineExtractor; 13 | import tberg.murphy.arrays.a; 14 | import tberg.murphy.fileio.f; 15 | import tberg.murphy.threading.BetterThreader; 16 | 17 | public class FirstFolioRawImageLoader { 18 | 19 | public static class FirstFolioRawImageDocument implements Document { 20 
| private final String baseName; 21 | final PixelType[][][] observations; 22 | 23 | public FirstFolioRawImageDocument(String inputPath, String baseName, int lineHeight, double binarizeThreshold) { 24 | this.baseName = baseName; 25 | double[][] levels = ImageUtils.getLevels(f.readImage(inputPath+"/"+baseName)); 26 | ConnectedComponentProcessor ccprocBig = new ConnectedComponentProcessor() { 27 | public void process(double[][] levels, List connectedComponent) { 28 | if (connectedComponent.size() > 1000) { 29 | for (int[] pixel : connectedComponent) { 30 | levels[pixel[0]][pixel[1]] = 255.0; 31 | } 32 | } 33 | } 34 | }; 35 | ImageUtils.processConnectedComponents(levels, 50.0, ccprocBig); 36 | Binarizer.binarizeGlobal(binarizeThreshold, levels); 37 | ConnectedComponentProcessor ccprocSmall = new ConnectedComponentProcessor() { 38 | public void process(double[][] levels, List connectedComponent) { 39 | if (connectedComponent.size() < 20 || connectedComponent.size() > 1000) { 40 | for (int[] pixel : connectedComponent) { 41 | levels[pixel[0]][pixel[1]] = 255.0; 42 | } 43 | } 44 | } 45 | }; 46 | ImageUtils.processConnectedComponents(levels, 127.0, ccprocSmall); 47 | 48 | int padHeight = 0; 49 | double[][] topPadLevels = new double[levels.length][]; 50 | for (int i=0; i lines = LineExtractor.extractLines(topPadLevels); 53 | observations = new PixelType[lines.size()][][]; 54 | for (int i=0; i= 0) { 56 | observations[i] = ImageUtils.getPixelTypes(ImageUtils.resampleImage(ImageUtils.makeImage(lines.get(i)), lineHeight)); 57 | } else { 58 | observations[i] = ImageUtils.getPixelTypes(ImageUtils.makeImage(lines.get(i))); 59 | } 60 | } 61 | } 62 | 63 | public PixelType[][][] loadLineImages() { 64 | return observations; 65 | } 66 | 67 | public String[][] loadDiplomaticTextLines() { 68 | return null; 69 | } 70 | 71 | public String[][] loadNormalizedTextLines() { 72 | return null; 73 | } 74 | 75 | public List loadNormalizedText() { 76 | return null; 77 | } 78 | 79 | public String 
package edu.berkeley.cs.nlp.ocular.data;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import edu.berkeley.cs.nlp.ocular.model.CharacterTemplate;
import edu.berkeley.cs.nlp.ocular.util.FileUtil;

/**
 * Entry point for lazily loading image documents (single image files or
 * individual pdf pages) from one or more input directories.  Documents are
 * discovered and ordered up front, but their pixel data is only read when
 * first requested.
 *
 * NOTE(review): reconstructed from a mangled extraction; generic type
 * parameters and the import list were inferred from usage -- verify against
 * the upstream repository.
 */
public class LazyRawImageLoader {

	/** Load with default settings: uniform line height, 0.12 binarization threshold, no cropping. */
	public static List<Document> loadDocuments(String inputPath, String extractedLinesPath, int numDocs, int numDocsToSkip) {
		return loadDocuments(inputPath, extractedLinesPath, numDocs, numDocsToSkip, true, 0.12, false);
	}

	/** Single-directory convenience overload; delegates to the multi-directory version. */
	public static List<Document> loadDocuments(String inputPath, String extractedLinesPath, int numDocs, int numDocsToSkip, boolean uniformLineHeight, double binarizeThreshold, boolean crop) {
		return loadDocuments(Arrays.asList(inputPath), extractedLinesPath, numDocs, numDocsToSkip, uniformLineHeight, binarizeThreshold, crop);
	}

	/** Load with default settings from several directories. */
	public static List<Document> loadDocuments(List<String> inputPaths, String extractedLinesPath, int numDocs, int numDocsToSkip) {
		return loadDocuments(inputPaths, extractedLinesPath, numDocs, numDocsToSkip, true, 0.12, false);
	}

	/**
	 * Discover all documents under the given paths, then keep up to `numDocs`
	 * of them after skipping the first `numDocsToSkip` (numDocs <= 0 means
	 * "no limit").
	 */
	public static List<Document> loadDocuments(List<String> inputPaths, String extractedLinesPath, int numDocs, int numDocsToSkip, boolean uniformLineHeight, double binarizeThreshold, boolean crop) {
		List<Document> discovered = new ArrayList<Document>();
		for (String inputPath : inputPaths) {
			discovered.addAll(loadDocumentsFromDir(inputPath, extractedLinesPath, uniformLineHeight, binarizeThreshold, crop));
		}

		// Clamp both counts so a short document list never causes an out-of-bounds access.
		int actualNumDocsToSkip = Math.min(discovered.size(), numDocsToSkip);
		int actualNumDocsToUse = Math.min(discovered.size() - actualNumDocsToSkip, numDocs <= 0 ? Integer.MAX_VALUE : numDocs);
		System.out.println("Using "+actualNumDocsToUse+" documents (skipping "+actualNumDocsToSkip+")");
		for (int docNum = 0; docNum < actualNumDocsToSkip; ++docNum) {
			System.out.println(" Skipping the first "+numDocsToSkip+" documents: " + discovered.get(docNum).baseName());
		}

		List<Document> documents = new ArrayList<Document>();
		for (int docNum = actualNumDocsToSkip; docNum < actualNumDocsToSkip + actualNumDocsToUse; ++docNum) {
			Document lazyDoc = discovered.get(docNum);
			System.out.println(" Using " + lazyDoc.baseName());
			documents.add(lazyDoc);
		}
		return documents;
	}

	/**
	 * Recursively collect image and pdf files under `inputPath` and wrap each
	 * file (or each pdf page) in a lazy Document.  Results are sorted by
	 * base name so document order is deterministic.
	 */
	private static List<Document> loadDocumentsFromDir(String inputPath, String extractedLinesPath, boolean uniformLineHeight, double binarizeThreshold, boolean crop) {
		// A negative height tells the document to keep each line's natural height.
		int lineHeight = uniformLineHeight ? CharacterTemplate.LINE_HEIGHT : -1;

		File dir = new File(inputPath);
		System.out.println("Reading data from [" + dir + "], which " + (dir.exists() ? "exists" : "does not exist"));

		List<Document> lazyDocs = new ArrayList<Document>();
		for (File f : FileUtil.recursiveFiles(dir)) {
			String name = f.getName();
			if (name.endsWith(".txt")) {
				continue; // transcription files are not page images
			}
			if (name.endsWith(".pdf")) {
				// One lazy document per pdf page (pages are 1-indexed).
				int numPages = PdfImageReader.numPagesInPdf(f);
				for (int pageNumber = 1; pageNumber <= numPages; ++pageNumber) {
					lazyDocs.add(new LazyRawPdfImageDocument(f, pageNumber, inputPath, lineHeight, binarizeThreshold, crop, extractedLinesPath));
				}
			}
			else {
				lazyDocs.add(new LazyRawSingleImageDocument(f, inputPath, lineHeight, binarizeThreshold, crop, extractedLinesPath));
			}
		}

		Collections.sort(lazyDocs, new Comparator<Document>() {
			public int compare(Document o1, Document o2) {
				return o1.baseName().compareTo(o2.baseName());
			}
		});

		return lazyDocs;
	}
}
package edu.berkeley.cs.nlp.ocular.data;

import java.awt.image.BufferedImage;
import java.io.File;

import edu.berkeley.cs.nlp.ocular.util.FileUtil;

/**
 * A lazily-loaded document backed by a single page of a pdf file.  The page
 * image is rendered only when first requested (the superclass caches the
 * result for later use).
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class LazyRawPdfImageDocument extends LazyRawImageDocument {
	private final File pdfFile;
	private final int pageNumber; // starts at one!

	public LazyRawPdfImageDocument(File pdfFile, int pageNumber, String inputPath, int lineHeight, double binarizeThreshold, boolean crop, String extractedLinesPath) {
		super(inputPath, lineHeight, binarizeThreshold, crop, extractedLinesPath);
		this.pdfFile = pdfFile;
		this.pageNumber = pageNumber;
	}

	/** Render this document's pdf page to an image (invoked once, lazily). */
	protected BufferedImage doLoadBufferedImage() {
		System.out.println("Extracting text line images from " + pdfFile + ", page " + pageNumber);
		return PdfImageReader.readPdfPageAsImage(pdfFile, pageNumber);
	}

	protected File file() { return pdfFile; }

	/** Filename stem used when writing extracted line images. */
	protected String preext() { return new File(baseName()).getName(); }

	/** Extracted line images are always written as png. */
	protected String ext() { return "png"; }

	/** Unique name: the pdf path (sans extension) plus a zero-padded page suffix. */
	public String baseName() {
		return FileUtil.withoutExtension(pdfFile.getPath()) + "_pdf_page" + String.format("%05d", pageNumber);
	}
}
package edu.berkeley.cs.nlp.ocular.data;

import java.awt.image.BufferedImage;
import java.io.File;

import edu.berkeley.cs.nlp.ocular.util.FileUtil;
import tberg.murphy.fileio.f;

/**
 * A lazily-loaded document backed by a single image file.  The image is read
 * from disk only when first requested (the superclass caches the result for
 * later use).
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class LazyRawSingleImageDocument extends LazyRawImageDocument {
	private final File file;

	public LazyRawSingleImageDocument(File file, String inputPath, int lineHeight, double binarizeThreshold, boolean crop, String extractedLinesPath) {
		super(inputPath, lineHeight, binarizeThreshold, crop, extractedLinesPath);
		this.file = file;
	}

	/** Read the backing image file (invoked once, lazily). */
	protected BufferedImage doLoadBufferedImage() {
		System.out.println("Extracting text line images from " + file);
		return f.readImage(file.getPath());
	}

	protected File file() { return file; }

	/** Filename stem used when writing extracted line images. */
	protected String preext() { return FileUtil.withoutExtension(file.getName()); }

	/** Extracted lines keep the source image's own extension. */
	protected String ext() { return FileUtil.extension(file.getName()); }

	/** The image's path identifies the document. */
	public String baseName() {
		return file.getPath();
	}
}
package edu.berkeley.cs.nlp.ocular.data;

import java.awt.Graphics2D;
import java.awt.Image;
import java.awt.Rectangle;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;

import com.sun.pdfview.PDFFile;
import com.sun.pdfview.PDFPage;

/**
 * Renders pdf pages to BufferedImages via the PDFRenderer library.
 *
 * Fixes over the previous version: the RandomAccessFile is now closed in a
 * finally block (it used to leak the file handle whenever pdf parsing or
 * rendering threw), the thrice-duplicated open/map/parse sequence is factored
 * into a single helper, and the Graphics2D used for rendering is disposed so
 * its native resources are released.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class PdfImageReader {

	/** Memory-map the whole file and parse it as a pdf. */
	private static PDFFile openPdf(RandomAccessFile raf) throws IOException {
		FileChannel channel = raf.getChannel();
		ByteBuffer buf = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size());
		return new PDFFile(buf);
	}

	/**
	 * @param pdfFile Path to the pdf file.
	 * @return the number of pages in the pdf
	 * @throws RuntimeException wrapping any IOException
	 */
	public static int numPagesInPdf(File pdfFile) {
		try {
			RandomAccessFile raf = new RandomAccessFile(pdfFile, "r");
			try {
				return openPdf(raf).getNumPages();
			}
			finally {
				raf.close(); // fix: close even when pdf parsing throws
			}
		}
		catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Render every page of the pdf, in page order.
	 *
	 * @param pdfFile Path to the pdf file.
	 * @return one rendered image per page
	 */
	public static List<BufferedImage> readPdfAsImages(File pdfFile) {
		try {
			RandomAccessFile raf = new RandomAccessFile(pdfFile, "r");
			try {
				PDFFile pdf = openPdf(raf);
				List<BufferedImage> images = new ArrayList<BufferedImage>();
				for (int pageNumber = 1; pageNumber <= pdf.getNumPages(); ++pageNumber) {
					images.add(readPage(pdf, pageNumber));
				}
				return images;
			}
			finally {
				raf.close(); // fix: close even when rendering throws
			}
		}
		catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * @param pdfFile
	 *          Path to the pdf file.
	 * @param pageNumber
	 *          One-based page number to read
	 * @return the rendered page image
	 * @throws RuntimeException if pageNumber is less than 1, or wrapping any IOException
	 */
	public static BufferedImage readPdfPageAsImage(File pdfFile, int pageNumber) {
		if (pageNumber < 1)
			throw new RuntimeException("page numbering starts with 1; '" + pageNumber + "' given");
		try {
			RandomAccessFile raf = new RandomAccessFile(pdfFile, "r");
			try {
				return readPage(openPdf(raf), pageNumber);
			}
			finally {
				raf.close(); // fix: close even when rendering throws
			}
		}
		catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

	/** Render one page into an RGB buffered image, scaled up 2.5x. */
	private static BufferedImage readPage(PDFFile pdf, int pageNumber) {
		double scale = 2.5; // because otherwise the image comes out really tiny
		PDFPage page = pdf.getPage(pageNumber);
		Rectangle rect = new Rectangle(0, 0, (int) page.getBBox().getWidth(), (int) page.getBBox().getHeight());
		BufferedImage bufferedImage = new BufferedImage((int)(rect.width * scale), (int)(rect.height * scale), BufferedImage.TYPE_INT_RGB);
		Image image = page.getImage((int)(rect.width * scale), (int)(rect.height * scale), rect, null, true, true);
		Graphics2D bufImageGraphics = bufferedImage.createGraphics();
		try {
			bufImageGraphics.drawImage(image, 0, 0, null);
		}
		finally {
			bufImageGraphics.dispose(); // fix: release native graphics resources
		}
		return bufferedImage;
	}
}
(int)(rect.height * scale), BufferedImage.TYPE_INT_RGB); 88 | Image image = page.getImage((int)(rect.width * scale), (int)(rect.height * scale), rect, null, true, true); 89 | Graphics2D bufImageGraphics = bufferedImage.createGraphics(); 90 | bufImageGraphics.drawImage(image, 0, 0, null); 91 | return bufferedImage; 92 | } 93 | } 94 | 95 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/data/RawImageLoader.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.data; 2 | 3 | import tberg.murphy.fileio.f; 4 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils; 5 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils.PixelType; 6 | 7 | import java.io.File; 8 | import java.io.FilenameFilter; 9 | import java.util.Arrays; 10 | import java.util.List; 11 | 12 | import edu.berkeley.cs.nlp.ocular.preprocessing.Binarizer; 13 | import edu.berkeley.cs.nlp.ocular.preprocessing.Cropper; 14 | import edu.berkeley.cs.nlp.ocular.preprocessing.LineExtractor; 15 | import edu.berkeley.cs.nlp.ocular.preprocessing.Straightener; 16 | import tberg.murphy.threading.BetterThreader; 17 | 18 | /** 19 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 20 | */ 21 | public class RawImageLoader { 22 | 23 | public static class RawImageDocument implements Document { 24 | private final String baseName; 25 | final PixelType[][][] observations; 26 | 27 | public RawImageDocument(String inputPath, String baseName, int lineHeight, double binarizeThreshold) { 28 | this.baseName = baseName; 29 | double[][] levels = ImageUtils.getLevels(f.readImage(inputPath+"/"+baseName)); 30 | double[][] rotLevels = Straightener.straighten(levels); 31 | double[][] cropLevels = Cropper.crop(rotLevels, binarizeThreshold); 32 | Binarizer.binarizeGlobal(binarizeThreshold, cropLevels); 33 | List lines = LineExtractor.extractLines(cropLevels); 34 | observations = new 
PixelType[lines.size()][][]; 35 | for (int i=0; i= 0) { 37 | observations[i] = ImageUtils.getPixelTypes(ImageUtils.resampleImage(ImageUtils.makeImage(lines.get(i)), lineHeight)); 38 | } else { 39 | observations[i] = ImageUtils.getPixelTypes(ImageUtils.makeImage(lines.get(i))); 40 | } 41 | } 42 | } 43 | 44 | public PixelType[][][] loadLineImages() { 45 | return observations; 46 | } 47 | 48 | public String[][] loadDiplomaticTextLines() { 49 | return null; 50 | } 51 | 52 | public String[][] loadNormalizedTextLines() { 53 | return null; 54 | } 55 | 56 | public List loadNormalizedText() { 57 | return null; 58 | } 59 | 60 | public String baseName() { 61 | return baseName; 62 | } 63 | 64 | } 65 | 66 | public static List loadDocuments(final String inputPath, final int lineHeight, final double binarizeThreshold, final int numThreads) { 67 | System.out.println("Extracting text line images from dataset "+inputPath); 68 | File dir = new File(inputPath); 69 | final String[] dirList = dir.list(new FilenameFilter() { 70 | public boolean accept(File dir, String name) { 71 | if (name.startsWith(".")) { // ignore hidden files 72 | return false; 73 | } 74 | else if (!name.endsWith(".png") && !name.endsWith(".jpg")) { 75 | return false; 76 | } 77 | return true; 78 | } 79 | }); 80 | final Document[] docs = new Document[dirList.length]; 81 | BetterThreader.Function func = new BetterThreader.Function(){public void call(Integer i, Object ignore){ 82 | String baseName = dirList[i]; 83 | docs[i] = new RawImageDocument(inputPath, baseName, lineHeight, binarizeThreshold); 84 | }}; 85 | BetterThreader threader = new BetterThreader(func, numThreads); 86 | for (int i=0; i= 0) { 38 | observations[i] = ImageUtils.getPixelTypes(ImageUtils.resampleImage(f.readImage(imgPathPrefix + i + imgNameSuffix), lineHeight)); 39 | } else { 40 | observations[i] = ImageUtils.getPixelTypes(f.readImage(imgPathPrefix + i + imgNameSuffix)); 41 | } 42 | } catch (Exception e) { 43 | throw new RuntimeException("Couldn't 
read doc from: " + imgPathPrefix + i + imgNameSuffix); 44 | } 45 | } 46 | return observations; 47 | } 48 | 49 | public String[][] loadDiplomaticTextLines() { 50 | File textFile = new File(textPath); 51 | String[][] text = (!textFile.exists() ? null : f.readDocumentByCharacter(textPath, numLines)); 52 | return text; 53 | } 54 | 55 | public String[][] loadNormalizedTextLines() { 56 | return null; 57 | } 58 | 59 | public List loadNormalizedText() { 60 | return null; 61 | } 62 | 63 | public String baseName() { 64 | String[] split = imgPathPrefix.split("/"); 65 | String baseNamePlusHyphen = split[split.length-1]; 66 | return baseNamePlusHyphen.substring(0, baseNamePlusHyphen.length()-1); 67 | } 68 | 69 | public boolean useLongS() { 70 | return useLongS; 71 | } 72 | } 73 | 74 | public static List loadDocuments(String inputPath, int lineHeight) { 75 | List lines = f.readLines(inputPath); 76 | List docs = new ArrayList(); 77 | File inputFile = new File(inputPath); 78 | for (String line : lines) { 79 | if (line.trim().equals("")) continue; 80 | String[] split = line.split("\\s+"); 81 | docs.add(new TextAndLineImagesDocument(inputFile.getParentFile().getAbsolutePath()+"/"+split[0], split[1], inputFile.getParentFile().getAbsolutePath()+"/"+split[2], Boolean.parseBoolean(split[3]), Integer.parseInt(split[4]), lineHeight)); 82 | } 83 | return docs; 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/data/textreader/BasicTextReader.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.data.textreader; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * @author Dan Garrette (dhgarrette@gmail.com) 8 | */ 9 | public class BasicTextReader implements TextReader { 10 | 11 | private boolean treatBackslashAsEscape; 12 | 13 | public BasicTextReader(boolean treatBackslashAsEscape) { 14 | 
package edu.berkeley.cs.nlp.ocular.data.textreader;

import java.util.ArrayList;
import java.util.List;

/**
 * Default TextReader: normalizes quote and tab conventions, then splits a
 * line into diacritic-normalized character strings.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class BasicTextReader implements TextReader {

	// When false, backslashes in the input are escaped up front so they are
	// treated as literal characters rather than escape-sequence markers.
	private boolean treatBackslashAsEscape;

	public BasicTextReader(boolean treatBackslashAsEscape) {
		this.treatBackslashAsEscape = treatBackslashAsEscape;
	}

	/** Backslashes are treated as escapes by default. */
	public BasicTextReader() {
		this.treatBackslashAsEscape = true;
	}

	/** Read several lines; produces one character list per input line. */
	public List<List<String>> readCharacters(List<String> lines) {
		List<List<String>> characterLines = new ArrayList<List<String>>();
		for (String l : lines) {
			characterLines.add(readCharacters(l));
		}
		return characterLines;
	}

	public List<String> readCharacters(String line) {
		if (!treatBackslashAsEscape) {
			line = line.replace("\\", "\\\\"); // make every backslash literal
		}

		// Normalize LaTeX-style quotes and tabs before splitting.
		line = line.replace("``", "\"");
		line = line.replace("''", "\"");
		line = line.replace("\t", " ");

		// Split characters and convert each to its diacritic-normalized form.
		List<String> normalizedChars = new ArrayList<String>();
		for (String c : Charset.readNormalizeCharacters(line)) {
			normalizedChars.add(c);
		}
		return normalizedChars;
	}

	public String toString() {
		return "BasicTextReader(" + treatBackslashAsEscape + ")";
	}
}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import java.util.Collection;

import tberg.murphy.indexer.HashMapIndexer;
import tberg.murphy.indexer.Indexer;

/**
 * An Indexer over character strings that normalizes each character via
 * Charset before delegating to a HashMapIndexer, so equivalent encodings of
 * the same character always map to one index.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class CharIndexer implements Indexer<String> {
	private static final long serialVersionUID = 3212987272223100239L;

	private Indexer<String> delegate;

	public CharIndexer() {
		delegate = new HashMapIndexer<String>();
	}

	public boolean contains(String object) {
		return delegate.contains(Charset.normalizeChar(object));
	}

	public int getIndex(String object) {
		return delegate.getIndex(Charset.normalizeChar(object));
	}

	/** Index every element of the array (normalizing each one). */
	public void index(String[] vect) {
		for (String x : vect) {
			getIndex(x);
		}
	}

	// Pure delegation below; normalization is irrelevant for these.
	public boolean locked() { return delegate.locked(); }
	public void lock() { delegate.lock(); }
	public int size() { return delegate.size(); }
	public String getObject(int index) { return delegate.getObject(index); }
	public void forgetIndexLookup() { delegate.forgetIndexLookup(); }
	public Collection<String> getObjects() { return delegate.getObjects(); }
}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import java.util.ArrayList;
import java.util.List;

/**
 * Rewrites round 's' characters as long-s (ſ) wherever early-modern
 * typography would use one: whenever a letter follows, except that in the
 * sequence "ſsi" the middle 's' stays round.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class ConvertLongSTextReader implements TextReader {

	private TextReader delegate;

	public ConvertLongSTextReader(TextReader delegate) {
		this.delegate = delegate;
	}

	public List<String> readCharacters(String line) {
		List<String> chars = new ArrayList<String>();
		for (String c : delegate.readCharacters(line)) {
			chars.add(c);
		}

		/*
		 * Replace 's' characters with 'long-s' characters.
		 *
		 * The last character is never considered: a word-final 's' cannot be
		 * long since a long-s must be followed by a letter.
		 */
		for (int t = 0; t < chars.size() - 1; t++) {
			if (!chars.get(t).equals("s")) continue;
			String next = chars.get(t + 1);
			String nextWithoutDiacritics = Charset.removeAnyDiacriticFromChar(next);
			// After diacritic removal, only an escaped backslash may be longer than one char.
			if (nextWithoutDiacritics.length() != 1) {
				if (!nextWithoutDiacritics.equals("\\\\")) {
					throw new AssertionError("expected nextWithoutDiacritics [" + nextWithoutDiacritics + "] length() == 1");
				}
			}
			char nextWithoutDiacriticsChar = nextWithoutDiacritics.charAt(0);
			if (t > 0 && chars.get(t - 1).equals(Charset.LONG_S) && nextWithoutDiacriticsChar == 'i') {
				// "ſsi": do nothing
			}
			else if (Character.isAlphabetic(nextWithoutDiacriticsChar)) {
				chars.set(t, Charset.LONG_S);
			}
		}

		return chars;
	}

	public String toString() {
		return "ConvertLongSTextReader(" + delegate + ")";
	}
}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Randomly interchanges 'u'/'v' (and 'U'/'V') to simulate early-modern
 * printers' interchangeable use of the two letterforms.
 *
 * NOTE(review): despite the name, `flipRate` is the probability that a
 * character is KEPT as-is; with probability 1-flipRate it is swapped for its
 * partner.  The RNG is seeded with 0, so output is deterministic across runs.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class FlipUVTextReader implements TextReader {

	private double flipRate;
	private TextReader delegate;

	private Random rand = new Random(0); // fixed seed: deterministic output

	public FlipUVTextReader(double flipRate, TextReader delegate) {
		this.flipRate = flipRate;
		this.delegate = delegate;
	}

	public List<String> readCharacters(String line) {
		List<String> chars = new ArrayList<String>();
		for (String c : delegate.readCharacters(line)) {
			if (c.equals("u")) chars.add(keepOrSwap("u", "v"));
			else if (c.equals("U")) chars.add(keepOrSwap("U", "V"));
			else if (c.equals("v")) chars.add(keepOrSwap("v", "u"));
			else if (c.equals("V")) chars.add(keepOrSwap("V", "U"));
			else chars.add(c);
		}
		return chars;
	}

	/** Draw once: keep `original` with probability flipRate, else use `partner`. */
	private String keepOrSwap(String original, String partner) {
		return rand.nextDouble() < flipRate ? original : partner;
	}

	public String toString() {
		return "FlipUVTextReader(" + flipRate + ", " + delegate + ")";
	}
}
public List readCharacters(String line) { 18 | List chars = new ArrayList(); 19 | for (String c : delegate.readCharacters(line)) { 20 | chars.add(Charset.removeAnyDiacriticFromChar(c)); 21 | } 22 | return chars; 23 | } 24 | 25 | public String toString() { 26 | return "RemoveAllDiacriticsTextReader(" + delegate + ")"; 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/data/textreader/ReplaceSomeTextReader.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.data.textreader; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.Iterator; 6 | import java.util.List; 7 | 8 | import edu.berkeley.cs.nlp.ocular.util.CollectionHelper; 9 | import edu.berkeley.cs.nlp.ocular.util.Tuple2; 10 | import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2; 11 | import tberg.murphy.fileio.f; 12 | 13 | /** 14 | * @author Dan Garrette (dhgarrette@gmail.com) 15 | */ 16 | public class ReplaceSomeTextReader implements TextReader { 17 | 18 | private final List, List>, Integer>> rules; 19 | private final TextReader delegate; 20 | private final int[] occurrences; 21 | 22 | /** 23 | * @param delegate 24 | * @param rules <, each> Replace `input` by `output` every `each` occurrences 25 | */ 26 | public ReplaceSomeTextReader(List, List>, Integer>> rules, TextReader delegate) { 27 | this.rules = rules; 28 | this.delegate = delegate; 29 | this.occurrences = new int[rules.size()]; 30 | } 31 | 32 | public List readCharacters(String line) { 33 | List result = delegate.readCharacters(line); 34 | for (int i = 0; i < rules.size(); ++i) { 35 | Tuple2, List>, Integer> r = rules.get(i); 36 | List input = r._1._1; 37 | List output = r._1._2; 38 | int each = r._2; 39 | List newResult = new ArrayList(); 40 | for (int j = 0; j < input.size() - 1; ++j) { 41 | // add some buffer to the end so sliding goes to the 
package edu.berkeley.cs.nlp.ocular.data.textreader;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import edu.berkeley.cs.nlp.ocular.util.CollectionHelper;
import edu.berkeley.cs.nlp.ocular.util.Tuple2;
import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2;
import tberg.murphy.fileio.f;

/**
 * Applies replacement rules of the form "replace `input` with `output` on
 * every `each`-th occurrence".  Occurrence counts persist across calls, so
 * the replacement cadence carries over from line to line.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class ReplaceSomeTextReader implements TextReader {

	private final List<Tuple2<Tuple2<List<String>, List<String>>, Integer>> rules;
	private final TextReader delegate;
	// Running occurrence count for each rule, parallel to `rules`.
	private final int[] occurrences;

	/**
	 * @param rules ((input, output), each): replace `input` by `output` every `each` occurrences
	 * @param delegate
	 */
	public ReplaceSomeTextReader(List<Tuple2<Tuple2<List<String>, List<String>>, Integer>> rules, TextReader delegate) {
		this.rules = rules;
		this.delegate = delegate;
		this.occurrences = new int[rules.size()];
	}

	public List<String> readCharacters(String line) {
		List<String> result = delegate.readCharacters(line);
		for (int i = 0; i < rules.size(); ++i) {
			Tuple2<Tuple2<List<String>, List<String>>, Integer> rule = rules.get(i);
			List<String> input = rule._1._1;
			List<String> output = rule._1._2;
			int each = rule._2;

			// Pad the tail with nulls so the sliding window can start at every
			// original position.
			for (int j = 0; j < input.size() - 1; ++j) {
				result.add(null);
			}

			List<String> newResult = new ArrayList<String>();
			Iterator<List<String>> windows = CollectionHelper.sliding(result, input.size());
			while (windows.hasNext()) {
				List<String> window = windows.next();
				if (window.equals(input)) {
					if (occurrences[i] % each == each - 1) {
						newResult.addAll(output); // replace this occurrence of `input`
						for (int j = 0; j < input.size() - 1; ++j) {
							windows.next(); // skip the windows covering the rest of `input`
						}
					}
					else {
						newResult.add(window.get(0)); // not this occurrence: keep as-is
					}
					++occurrences[i];
				}
				else {
					newResult.add(window.get(0));
				}
			}
			result = newResult;
		}
		return result;
	}

	/** Load rules from a file of tab-separated triples: input, output, each. */
	public static List<Tuple2<Tuple2<List<String>, List<String>>, Integer>> loadRulesFromFile(String path) {
		List<Tuple2<Tuple2<List<String>, List<String>>, Integer>> result = new ArrayList<Tuple2<Tuple2<List<String>, List<String>>, Integer>>();
		for (String line : f.readLines(path)) {
			if (line.trim().isEmpty()) continue;
			String[] parts = line.split("\t");
			if (parts.length != 3) throw new RuntimeException("line does not contain 3 parts. found: " + Arrays.asList(parts));
			result.add(Tuple2(Tuple2(Charset.readNormalizeCharacters(parts[0]), Charset.readNormalizeCharacters(parts[1])), Integer.valueOf(parts[2])));
		}
		return result;
	}

	public String toString() {
		return "ReplaceSomeTextReader(rules=..., " + delegate + ")";
	}
}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import java.util.List;

/**
 * Turns raw text lines into lists of normalized character strings; concrete
 * implementations may also filter or rewrite characters along the way.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public interface TextReader {

	/**
	 * @param line A line of text, possibly containing diacritics (precomposed, composed, or escaped).
	 * @return A list of normalized characters.
	 */
	public List<String> readCharacters(String line);

}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Filters the delegate's output, keeping only whitelisted characters (space
 * is always allowed).  Optionally, a diacritic-bearing character whose base
 * form is whitelisted is also kept.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class WhitelistCharacterSetTextReader implements TextReader {

	private Set<String> allValidCharacters = new HashSet<String>();
	private boolean disregardDiacritics;
	private TextReader delegate;

	/**
	 * @param validCharacters The set of characters that are allowed.
	 * Any other character will be skipped.
	 * @param disregardDiacritics If true, then a character with a diacritic
	 * will be considered valid even if only its non-diacritic version is in
	 * the validCharcters set.
	 * @param delegate
	 */
	public WhitelistCharacterSetTextReader(Set<String> validCharacters, boolean disregardDiacritics, TextReader delegate) {
		if (validCharacters.isEmpty()) {
			throw new RuntimeException("validCharacters is empty in WhitelistCharacterSetTextReader constructor");
		}

		for (String c : validCharacters) {
			allValidCharacters.add(Charset.normalizeChar(c));
		}
		allValidCharacters.add(Charset.SPACE); // space is always valid

		this.disregardDiacritics = disregardDiacritics;
		this.delegate = delegate;
	}

	/** Diacritics are NOT disregarded by default. */
	public WhitelistCharacterSetTextReader(Set<String> validCharacters, TextReader delegate) {
		this(validCharacters, false, delegate);
	}

	public List<String> readCharacters(String line) {
		List<String> kept = new ArrayList<String>();
		for (String c : delegate.readCharacters(line)) {
			if (allValidCharacters.contains(c)) {
				kept.add(c);
			}
			else if (disregardDiacritics && allValidCharacters.contains(Charset.removeAnyDiacriticFromChar(c))) {
				kept.add(c); // note: the original diacritic-bearing form is kept
			}
		}
		return kept;
	}

	public String toString() {
		return "WhitelistCharacterSetTextReader(" + delegate + ")";
	}
}
package edu.berkeley.cs.nlp.ocular.eval;

import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2;

import java.io.File;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import java.util.Map;
import java.util.Set;

import edu.berkeley.cs.nlp.ocular.data.Document;
import edu.berkeley.cs.nlp.ocular.eval.Evaluator.EvalSuffStats;
import edu.berkeley.cs.nlp.ocular.font.Font;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphSubstitutionModel;
import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel;
import edu.berkeley.cs.nlp.ocular.main.FonttrainTranscribeShared.OutputFormat;
import edu.berkeley.cs.nlp.ocular.model.CharacterTemplate;
import edu.berkeley.cs.nlp.ocular.model.DecodeState;
import edu.berkeley.cs.nlp.ocular.model.DecoderEM;
import edu.berkeley.cs.nlp.ocular.model.em.DenseBigramTransitionModel;
import edu.berkeley.cs.nlp.ocular.train.FontTrainer;
import edu.berkeley.cs.nlp.ocular.util.Tuple2;
import tberg.murphy.indexer.Indexer;

/**
 * Transcribe all documents, write their results to files, and evaluate the
 * results.  Per-document evaluation suffix statistics are accumulated and,
 * when the input path is a directory, written to aggregate summary files.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class BasicMultiDocumentTranscriber implements MultiDocumentTranscriber {
	private List<Document> documents;
	private String inputDocPath;
	private String outputPath;
	private Set<OutputFormat> outputFormats;
	private DecoderEM decoderEM;
	private SingleDocumentEvaluatorAndOutputPrinter docOutputPrinterAndEvaluator;
	private Indexer<String> charIndexer;
	// When true, a document that throws during decoding is logged and skipped
	// instead of aborting the whole run.
	private boolean skipFailedDocs;

	public BasicMultiDocumentTranscriber(
			List<Document> documents, String inputDocPath, String outputPath, Set<OutputFormat> outputFormats,
			DecoderEM decoderEM,
			SingleDocumentEvaluatorAndOutputPrinter documentOutputPrinterAndEvaluator,
			Indexer<String> charIndexer,
			boolean skipFailedDocs) {
		this.documents = documents;
		this.inputDocPath = inputDocPath;
		this.outputPath = outputPath;
		this.outputFormats = outputFormats;
		this.decoderEM = decoderEM;
		this.docOutputPrinterAndEvaluator = documentOutputPrinterAndEvaluator;
		this.charIndexer = charIndexer;
		this.skipFailedDocs = skipFailedDocs;
	}

	/** Transcribe outside of any training loop (iteration and batch zero). */
	public void transcribe(Font font, CodeSwitchLanguageModel lm, GlyphSubstitutionModel gsm) {
		transcribe(0, 0, font, lm, gsm);
	}

	public void transcribe(int iter, int batchId, Font font, CodeSwitchLanguageModel lm, GlyphSubstitutionModel gsm) {
		int numDocs = documents.size();
		CharacterTemplate[] templates = FontTrainer.loadTemplates(font, charIndexer);
		DenseBigramTransitionModel backwardTransitionModel = new DenseBigramTransitionModel(lm);

		double totalJointLogProb = 0.0;
		List<Tuple2<String, Map<String, EvalSuffStats>>> allDiplomaticEvals = new ArrayList<Tuple2<String, Map<String, EvalSuffStats>>>();
		List<Tuple2<String, Map<String, EvalSuffStats>>> allNormalizedEvals = new ArrayList<Tuple2<String, Map<String, EvalSuffStats>>>();
		for (int docNum = 0; docNum < numDocs; ++docNum) {
			Document doc = documents.get(docNum);
			System.out.println((iter > 0 ? "Training iteration "+iter+", " : "") + (batchId > 0 ? "batch "+batchId+", " : "") + "Transcribing eval document "+(docNum+1)+" of "+numDocs+": "+doc.baseName() + " " + (new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(Calendar.getInstance().getTime())));

			try {
				// The E-step decode yields the best transcription states plus the joint log prob.
				Tuple2<DecodeState[][], Double> decodeResults = decoderEM.computeEStep(doc, false, lm, gsm, templates, backwardTransitionModel);
				final DecodeState[][] decodeStates = decodeResults._1;
				totalJointLogProb += decodeResults._2;

				// _1 holds the diplomatic eval, _2 the normalized eval; either may be null.
				Tuple2<Map<String, EvalSuffStats>, Map<String, EvalSuffStats>> evals = docOutputPrinterAndEvaluator.evaluateAndPrintTranscription(iter, batchId, doc, decodeStates, inputDocPath, outputPath, outputFormats, lm);
				if (evals._1 != null) allDiplomaticEvals.add(Tuple2(doc.baseName(), evals._1));
				if (evals._2 != null) allNormalizedEvals.add(Tuple2(doc.baseName(), evals._2));
			} catch (RuntimeException e) {
				if (skipFailedDocs) {
					System.err.println("DOCUMENT FAILED! Skipping " + doc.baseName());
					e.printStackTrace();
				} else {
					throw e;
				}
			}
		}
		double avgLogProb = totalJointLogProb / numDocs;
		System.out.println("Iteration "+iter+", batch "+batchId+": eval avg joint log prob: " + avgLogProb);

		// Aggregate summary files are only written when the input was a directory.
		if (new File(inputDocPath).isDirectory()) {
			String preext = "eval";
			String outputFilenameBase = outputPath + "/all_transcriptions/" + new File(inputDocPath).getName() + "/" + preext;
			if (iter > 0) outputFilenameBase += "_iter-" + iter;
			if (batchId > 0) outputFilenameBase += "_batch-" + batchId;
			if (!allDiplomaticEvals.isEmpty())
				EvalPrinter.printEvaluation(allDiplomaticEvals, outputFilenameBase + "_diplomatic.txt");
			if (!allNormalizedEvals.isEmpty())
				EvalPrinter.printEvaluation(allNormalizedEvals, outputFilenameBase + "_normalized.txt");
		}
	}

}
-------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/eval/ErrorSampler.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.Collections; 6 | import java.util.List; 7 | import java.util.Random; 8 | 9 | import edu.berkeley.cs.nlp.ocular.eval.MarkovEditDistanceComputer.EditDistanceParams; 10 | import tberg.murphy.fileio.f; 11 | import tberg.murphy.tuple.Pair; 12 | 13 | /** 14 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 15 | */ 16 | public class ErrorSampler { 17 | 18 | public static class Error implements Comparable { 19 | public final int docIdx; 20 | public final int lineIdx; 21 | public final int guessTokenIdx; 22 | public final String guess; 23 | public final String gold; 24 | 25 | public static final String INSERTION = ""; 26 | public static final String DELETION = ""; 27 | 28 | public Error(int docIdx, int lineIdx, int guessColumn, String guess, String gold) { 29 | this.docIdx = docIdx; 30 | this.lineIdx = lineIdx; 31 | this.guessTokenIdx = guessColumn; 32 | this.guess = guess; 33 | this.gold = gold; 34 | } 35 | 36 | @Override 37 | public int compareTo(Error e1) { 38 | if (this.docIdx != e1.docIdx) { 39 | return this.docIdx - e1.docIdx; 40 | } else if (this.lineIdx != e1.lineIdx) { 41 | return this.lineIdx - e1.lineIdx; 42 | } 43 | return this.guessTokenIdx - e1.guessTokenIdx; 44 | } 45 | 46 | public String toString() { 47 | return "Doc " + docIdx + ", line " + lineIdx + ", guess idx " + guessTokenIdx + ": guess = " + guess + ", gold = " + gold; 48 | } 49 | 50 | } 51 | 52 | public static void main(String[] args) { 53 | List errors = aggregateWordErrors(args); 54 | final int NUM_ERRORS = 50; 55 | Collections.shuffle(errors, new Random(0)); 56 | List selectedErrors = errors.subList(0, Math.min(errors.size(), 
NUM_ERRORS)); 57 | Collections.sort(selectedErrors); 58 | for (int i = 0; i < selectedErrors.size(); i++) { 59 | System.out.println(selectedErrors.get(i).toString()); 60 | } 61 | } 62 | 63 | public static List aggregateWordErrors(String[] fileNames) { 64 | List allErrors = new ArrayList(); 65 | for (int fileIdx = 0; fileIdx < fileNames.length; fileIdx++) { 66 | String fileName = fileNames[fileIdx]; 67 | Pair,List> goldGuessLines = getGoldGuessLinesFromOutput(fileName); 68 | List goldLines = goldGuessLines.getFirst(); 69 | List guessLines = goldGuessLines.getSecond(); 70 | assert goldLines.size() == guessLines.size(); 71 | for (int i = 0; i < goldLines.size(); i++) { 72 | String goldStr = goldLines.get(i).replaceAll("\\|", "s"); 73 | String guessStr = guessLines.get(i).replaceAll("\\|", "s"); 74 | Form guessForm = Form.wordsAsGlyphs(Arrays.asList(guessStr.split("\\s+"))); 75 | Form goldForm = Form.wordsAsGlyphs(Arrays.asList(goldStr.split("\\s+"))); 76 | EditDistanceParams params = EditDistanceParams.getStandardParams(guessForm, goldForm, false); 77 | MarkovEditDistanceComputer medc = new MarkovEditDistanceComputer(params); 78 | AlignedFormPair alignedPair = medc.runEditDistance(); 79 | assert alignedPair.trg.length() == goldForm.length(); 80 | int srcGuessIdx = 0; 81 | int trgGoldIdx = 0; 82 | for (Operation op : alignedPair.ops) { 83 | switch (op) { 84 | case EQUAL: 85 | srcGuessIdx++; 86 | trgGoldIdx++; 87 | break; 88 | case SUBST: 89 | allErrors.add(new Error(fileIdx, i, srcGuessIdx, guessForm.charAt(srcGuessIdx).toString(), goldForm.charAt(trgGoldIdx).toString())); 90 | srcGuessIdx++; 91 | trgGoldIdx++; 92 | break; 93 | case INSERT: 94 | allErrors.add(new Error(fileIdx, i, srcGuessIdx, Error.INSERTION, goldForm.charAt(trgGoldIdx).toString())); 95 | trgGoldIdx++; 96 | break; 97 | case DELETE: 98 | allErrors.add(new Error(fileIdx, i, srcGuessIdx, guessForm.charAt(srcGuessIdx).toString(), Error.DELETION)); 99 | srcGuessIdx++; 100 | break; 101 | } 102 | } 103 | } 
104 | System.out.println("Processed file " + fileNames[fileIdx] + " with " + goldLines.size() + " lines, cumulative errors = " + allErrors.size()); 105 | } 106 | return allErrors; 107 | } 108 | 109 | public static Pair,List> getGoldGuessLinesFromOutput(String outFile) { 110 | List lines = f.readLines(outFile); 111 | List guessLines = new ArrayList(); 112 | List goldLines = new ArrayList(); 113 | for (int i = 0; i < lines.size(); i++) { 114 | String currLine = lines.get(i).trim(); 115 | if (i % 3 == 0 && currLine.equals("")) { 116 | break; 117 | } 118 | switch (i % 3) { 119 | case 0: guessLines.add(currLine); 120 | break; 121 | case 1: goldLines.add(currLine); 122 | break; 123 | case 2: assert currLine.equals(""); 124 | break; 125 | } 126 | } 127 | return Pair.makePair(goldLines, guessLines); 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/eval/EvalPrinter.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | import java.util.HashMap; 4 | import java.util.List; 5 | import java.util.Map; 6 | 7 | import edu.berkeley.cs.nlp.ocular.eval.Evaluator.EvalSuffStats; 8 | import edu.berkeley.cs.nlp.ocular.util.FileHelper; 9 | import edu.berkeley.cs.nlp.ocular.util.Tuple2; 10 | 11 | /** 12 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 13 | * @author Dan Garrette (dhgarrette@gmail.com) 14 | */ 15 | public class EvalPrinter { 16 | 17 | public static void printEvaluation(List>> allEvals, String outputPath) { 18 | Map totalSuffStats = new HashMap(); 19 | StringBuffer buf = new StringBuffer(); 20 | buf.append("All evals:\n"); 21 | for (Tuple2> docNameAndEvals : allEvals) { 22 | String docName = docNameAndEvals._1; 23 | Map evals = docNameAndEvals._2; 24 | buf.append("Document: " + docName + "\n"); 25 | buf.append(Evaluator.renderEval(evals) + "\n"); 26 | for (String evalType : 
evals.keySet()) { 27 | EvalSuffStats eval = evals.get(evalType); 28 | EvalSuffStats totalEval = totalSuffStats.get(evalType); 29 | if (totalEval == null) { 30 | totalEval = new EvalSuffStats(); 31 | totalSuffStats.put(evalType, totalEval); 32 | } 33 | totalEval.increment(eval); 34 | } 35 | } 36 | 37 | buf.append("\nMacro-avg total eval:\n"); 38 | buf.append(Evaluator.renderEval(totalSuffStats) + "\n"); 39 | 40 | FileHelper.writeString(outputPath, buf.toString()); 41 | System.out.println("\n" + outputPath); 42 | System.out.println(buf.toString()); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/eval/Form.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.Iterator; 6 | import java.util.List; 7 | 8 | import edu.berkeley.cs.nlp.ocular.data.textreader.Charset; 9 | import edu.berkeley.cs.nlp.ocular.util.Tuple2; 10 | 11 | /** 12 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 13 | */ 14 | public class Form implements Comparable
{ 15 | 16 | private final List glyphs; 17 | 18 | public Form(List glyphs) { 19 | this.glyphs = glyphs; 20 | } 21 | 22 | public static Form charsAsGlyphs(String str) { 23 | return charsAsGlyphs(str, true); 24 | } 25 | 26 | /** 27 | * 28 | * @param str 29 | * @param charIncludesDiacritic If false, the diacritic will be scored separately from the base character. 30 | * @return 31 | */ 32 | public static Form charsAsGlyphs(String str, boolean charIncludesDiacritic) { 33 | List glyphs = new ArrayList(); 34 | for (String c : Charset.readNormalizeCharacters(str)) { 35 | if (charIncludesDiacritic) { 36 | glyphs.add(new Glyph(c)); 37 | } 38 | else { 39 | Tuple2> letterAndNormalDiacritics = Charset.normalizeCharSeparateDiacritics(c); 40 | Collections.sort(letterAndNormalDiacritics._2); 41 | for (String diacritic : letterAndNormalDiacritics._2) { 42 | glyphs.add(new Glyph(diacritic)); 43 | } 44 | glyphs.add(new Glyph(letterAndNormalDiacritics._1)); 45 | } 46 | } 47 | return new Form(glyphs); 48 | } 49 | 50 | public static Form wordsAsGlyphs(List words) { 51 | List glyphs = new ArrayList(); 52 | for (int i = 0; i < words.size(); i++) { 53 | glyphs.add(new Glyph(words.get(i))); 54 | } 55 | return new Form(glyphs); 56 | } 57 | 58 | public Form substring(int start) { 59 | return substring(start, length()); 60 | } 61 | 62 | public Form substring(int start, int end) { 63 | return new Form(glyphs.subList(start, end)); 64 | } 65 | 66 | public int length() { 67 | return glyphs.size(); 68 | } 69 | 70 | public Glyph charAt(int index) { 71 | return glyphs.get(index); 72 | } 73 | 74 | public Form append(Form other) { 75 | List newGlyphs = new ArrayList(); 76 | newGlyphs.addAll(this.glyphs); 77 | newGlyphs.addAll(other.glyphs); 78 | return new Form(newGlyphs); 79 | } 80 | 81 | @Override 82 | public boolean equals(Object other) { 83 | if (other == null || !(other instanceof Form)) { 84 | return false; 85 | } 86 | return this.glyphs.equals(((Form)other).glyphs); 87 | } 88 | 89 | @Override 90 
| public int hashCode() { 91 | return this.glyphs.hashCode(); 92 | } 93 | 94 | @Override 95 | public String toString() { 96 | String ret = ""; 97 | for (Glyph glyph : glyphs) { 98 | ret += glyph.toString(); 99 | } 100 | return ret; 101 | } 102 | 103 | public String toStringWithSpaces() { 104 | String ret = ""; 105 | for (Glyph glyph : glyphs) { 106 | ret += glyph.toString() + " "; 107 | } 108 | return ret; 109 | } 110 | 111 | @Override 112 | public int compareTo(Form o) { 113 | return compareCollections(this.glyphs, o.glyphs); 114 | } 115 | 116 | public static > int compareCollections(Iterable col1, Iterable col2) { 117 | Iterator first = col1.iterator(); 118 | Iterator second = col2.iterator(); 119 | while (first.hasNext() && second.hasNext()) { 120 | int result = first.next().compareTo(second.next()); 121 | if (result != 0) { 122 | return result; 123 | } 124 | } 125 | if (!first.hasNext() && !second.hasNext()) { 126 | return 0; 127 | } 128 | // Longer one comes second 129 | return (first.hasNext() ? 
1 : -1); 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/eval/Glyph.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | /** 4 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 5 | */ 6 | public class Glyph implements Comparable { 7 | 8 | public final String glyph; 9 | 10 | public Glyph(String glyph) { 11 | this.glyph = glyph; 12 | } 13 | 14 | @Override 15 | public boolean equals(Object other) { 16 | if (other == null || !(other instanceof Glyph)) { 17 | return false; 18 | } 19 | return this.glyph.equals(((Glyph)other).glyph); 20 | } 21 | 22 | @Override 23 | public int hashCode() { 24 | return glyph.hashCode(); 25 | } 26 | 27 | @Override 28 | public String toString() { 29 | return glyph; 30 | } 31 | 32 | @Override 33 | public int compareTo(Glyph o) { 34 | return this.glyph.compareTo(o.glyph); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/eval/LmPerplexity.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | import java.util.List; 4 | 5 | import edu.berkeley.cs.nlp.ocular.data.textreader.Charset; 6 | import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel; 7 | import edu.berkeley.cs.nlp.ocular.util.CollectionHelper; 8 | 9 | /** 10 | * @author Hannah Alpert-Abrams (halperta@gmail.com) 11 | * @author Dan Garrette (dhgarrette@gmail.com) 12 | */ 13 | public class LmPerplexity { 14 | 15 | private CodeSwitchLanguageModel lm; 16 | 17 | private final int spaceIndex; 18 | 19 | public LmPerplexity(CodeSwitchLanguageModel lm) { 20 | this.lm = lm; 21 | this.spaceIndex = lm.getCharacterIndexer().getIndex(Charset.SPACE); 22 | } 23 | 24 | public double perplexity(List 
viterbiNormalizedTranscriptionCharIndices, List viterbiNormalizedTranscriptionLangIndices) { 25 | double logTotalProbability = 0.0; 26 | for (int i=0; i viterbiNormalizedTranscriptionCharIndices, List viterbiNormalizedTranscriptionLangIndices) { 43 | int startPoint = findStartPoint(i, curL, viterbiNormalizedTranscriptionLangIndices); 44 | int[] context = CollectionHelper.intListToArray(viterbiNormalizedTranscriptionCharIndices.subList(startPoint, i)); 45 | return lm.get(curL).getCharNgramProb(context, curC); 46 | } 47 | 48 | private int findStartPoint(int i, int curL, List viterbiNormalizedTranscriptionLangIndices) { 49 | int startPoint = i; 50 | while (startPoint > 0 && getLangIndex(viterbiNormalizedTranscriptionLangIndices, startPoint-1) == curL && i-startPoint < lm.get(curL).getMaxOrder()-1) { 51 | --startPoint; 52 | } 53 | return startPoint; 54 | } 55 | 56 | private double getLangTransitionProb(int i, int curL, List viterbiNormalizedTranscriptionCharIndices, List viterbiNormalizedTranscriptionLangIndices) { 57 | if (i > 0) { 58 | int prevC = viterbiNormalizedTranscriptionCharIndices.get(i-1); 59 | int prevL = getLangIndex(viterbiNormalizedTranscriptionLangIndices, i-1); 60 | if (prevC != spaceIndex) { 61 | if (prevL != curL) throw new RuntimeException("Characters cannot change languages mid-word."); 62 | return 1.0; 63 | } 64 | else { 65 | return lm.languageTransitionProb(prevL, curL); 66 | } 67 | } 68 | else { 69 | return lm.languagePrior(curL); 70 | } 71 | } 72 | 73 | private int getLangIndex(List viterbiNormalizedTranscriptionLangIndices, int i) { 74 | int curL = viterbiNormalizedTranscriptionLangIndices.get(i); 75 | if (curL < 0) { 76 | if (this.lm.getLanguageIndexer().size() == 1) 77 | curL = 0; 78 | else if (i > 0) 79 | throw new RuntimeException("curl="+curL+", i="+i); 80 | } 81 | return curL; 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- 
/src/main/java/edu/berkeley/cs/nlp/ocular/eval/MultiDocumentTranscriber.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | import edu.berkeley.cs.nlp.ocular.font.Font; 4 | import edu.berkeley.cs.nlp.ocular.gsm.GlyphSubstitutionModel; 5 | import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel; 6 | 7 | /** 8 | * @author Dan Garrette (dhgarrette@gmail.com) 9 | */ 10 | public interface MultiDocumentTranscriber { 11 | 12 | public void transcribe(Font font, CodeSwitchLanguageModel lm, GlyphSubstitutionModel gsm); 13 | public void transcribe(int iter, int batchId, Font font, CodeSwitchLanguageModel lm, GlyphSubstitutionModel gsm); 14 | 15 | /** 16 | * No-op evaluator implementation 17 | */ 18 | public static class NoOpMultiDocumentTranscriber implements MultiDocumentTranscriber { 19 | public void transcribe(Font font, CodeSwitchLanguageModel lm, GlyphSubstitutionModel gsm) {} 20 | public void transcribe(int iter, int batchId, Font font, CodeSwitchLanguageModel lm, GlyphSubstitutionModel gsm) {} 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/eval/Operation.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 8 | */ 9 | public enum Operation { 10 | 11 | EQUAL, SUBST, INSERT, DELETE; 12 | 13 | public static String opToString(Operation op) { 14 | switch (op) { 15 | case EQUAL: return "="; 16 | case SUBST: return "S"; 17 | case INSERT : return "I"; 18 | case DELETE : return "D"; 19 | default : throw new RuntimeException("Bad op: " + op); 20 | } 21 | } 22 | 23 | public static String opsToString(List ops) { 24 | String opsStr = ""; 25 | for (Operation op : ops) { 26 | opsStr += 
opToString(op); 27 | } 28 | return opsStr; 29 | } 30 | 31 | public static Operation charToOp(char opChar) { 32 | switch (opChar) { 33 | case '=': return EQUAL; 34 | case 'S': return SUBST; 35 | case 'I': return INSERT; 36 | case 'D': return DELETE; 37 | default : throw new RuntimeException("Bad op string: " + opChar); 38 | } 39 | } 40 | 41 | public static List stringToOps(String opsStr) { 42 | List ops = new ArrayList(); 43 | for (int i = 0; i < opsStr.length(); i++) { 44 | ops.add(charToOp(opsStr.charAt(i))); 45 | } 46 | return ops; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/eval/SingleDocumentEvaluatorAndOutputPrinter.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.eval; 2 | 3 | import java.util.Map; 4 | import java.util.Set; 5 | 6 | import edu.berkeley.cs.nlp.ocular.data.Document; 7 | import edu.berkeley.cs.nlp.ocular.eval.Evaluator.EvalSuffStats; 8 | import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel; 9 | import edu.berkeley.cs.nlp.ocular.main.FonttrainTranscribeShared.OutputFormat; 10 | import edu.berkeley.cs.nlp.ocular.model.DecodeState; 11 | import edu.berkeley.cs.nlp.ocular.util.Tuple2; 12 | 13 | /** 14 | * @author Dan Garrette (dhgarrette@gmail.com) 15 | */ 16 | public interface SingleDocumentEvaluatorAndOutputPrinter { 17 | 18 | public Tuple2,Map> evaluateAndPrintTranscription(int iter, int batchId, 19 | Document doc, 20 | DecodeState[][] decodeStates, 21 | String inputDocPath, String outputPath, Set outputFormats, 22 | CodeSwitchLanguageModel lm); 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/font/Font.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.font; 2 | 3 | import java.io.Serializable; 4 | import 
java.util.Map; 5 | 6 | import edu.berkeley.cs.nlp.ocular.model.CharacterTemplate; 7 | 8 | /** 9 | * @author Dan Garrette (dhgarrette@gmail.com) 10 | */ 11 | public class Font implements Serializable { 12 | private static final long serialVersionUID = 1L; 13 | 14 | public final Map charTemplates; 15 | 16 | public Font(Map charTemplates) { 17 | this.charTemplates = charTemplates; 18 | } 19 | 20 | public CharacterTemplate get(String character) { 21 | return charTemplates.get(character); 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/gsm/GlyphChar.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.gsm; 2 | 3 | import java.io.Serializable; 4 | 5 | import tberg.murphy.indexer.Indexer; 6 | 7 | /** 8 | * @author Dan Garrette (dhgarrette@gmail.com) 9 | */ 10 | public class GlyphChar implements Serializable { 11 | private static final long serialVersionUID = 1L; 12 | 13 | public enum GlyphType { 14 | ELISION_TILDE, // this glyph is marked with a tilde indicating that some subsequent letter have been elided 15 | TILDE_ELIDED, // this (empty) glyph appears after an "elision tilde" 16 | FIRST_ELIDED, // this (empty) glyph results from the elision of the first letter of a word 17 | DOUBLED, // this glyph marks an empty LM character whose glyph is a duplicate of the next glyph, which is just a rendering of its LM character 18 | //RMRGN_HPHN_DROP, // this glyph marks a right-margin line-breaking hyphen is not printed 19 | ELIDED, // this (empty) glyph results from the elision a character 20 | NORMAL_CHAR }; // 21 | 22 | public final int templateCharIndex; 23 | public final GlyphType glyphType; 24 | 25 | public GlyphChar(int templateCharIndex, GlyphType glyphType) { 26 | this.templateCharIndex = templateCharIndex; 27 | this.glyphType = glyphType; 28 | } 29 | 30 | public boolean isElided() { 31 | switch 
(glyphType) { 32 | case TILDE_ELIDED: 33 | case FIRST_ELIDED: 34 | case ELIDED: 35 | return true; 36 | default: 37 | return false; 38 | } 39 | } 40 | 41 | public boolean equals(Object o) { 42 | if (this == o) return true; 43 | if (!(o instanceof GlyphChar)) return false; 44 | final GlyphChar gc = (GlyphChar) o; 45 | return templateCharIndex == gc.templateCharIndex && glyphType == gc.glyphType; 46 | } 47 | 48 | public int hashCode() { 49 | return 29 * templateCharIndex + 17 * (glyphType.ordinal()); 50 | } 51 | 52 | public String toString() { 53 | return "GlyphChar(templateCharIndex="+templateCharIndex+", glyphType="+glyphType+")"; 54 | } 55 | 56 | public String toString(Indexer charIndexer) { 57 | return "GlyphChar("+charIndexer.getObject(templateCharIndex)+", "+glyphType+")"; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/gsm/GlyphSubstitutionModel.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.gsm; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * @author Dan Garrette (dhgarrette@gmail.com) 7 | */ 8 | public interface GlyphSubstitutionModel extends Serializable { 9 | 10 | public double glyphProb(int language, int lmChar, GlyphChar glyphChar); 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/gsm/NoSubGlyphSubstitutionModel.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.gsm; 2 | 3 | import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar.GlyphType; 4 | 5 | /** 6 | * @author Dan Garrette (dhgarrette@gmail.com) 7 | */ 8 | public class NoSubGlyphSubstitutionModel implements GlyphSubstitutionModel { 9 | private static final long serialVersionUID = 1L; 10 | 11 | public NoSubGlyphSubstitutionModel() { 12 | } 13 | 14 | public double 
glyphProb(int language, int lmChar, GlyphChar glyphChar) { 15 | return (glyphChar.glyphType == GlyphType.NORMAL_CHAR && lmChar == glyphChar.templateCharIndex) ? 1.0 : 0.0; 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/CodeSwitchLanguageModel.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | import java.io.Serializable; 4 | 5 | import tberg.murphy.indexer.Indexer; 6 | 7 | /** 8 | * @author Dan Garrette (dhgarrette@gmail.com) 9 | */ 10 | public interface CodeSwitchLanguageModel extends LanguageModel, Serializable { 11 | 12 | public Indexer getLanguageIndexer(); 13 | 14 | public SingleLanguageModel get(int language); 15 | public double languagePrior(int language); 16 | public double languageTransitionProb(int fromLanguage, int destinationLanguage); 17 | public double getProbKeepSameLanguage(); 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/CountDb.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | /** 4 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 5 | */ 6 | public interface CountDb { 7 | 8 | public long getNumTokens(); 9 | 10 | public int getNumBigramTypes(); 11 | 12 | public int currSize(); 13 | 14 | public int totalSize(); 15 | 16 | public long[] getKeys(); 17 | 18 | public int getCount(long key, CountType countType); 19 | 20 | public int getCount(NgramWrapper ngram, CountType countType); 21 | 22 | public void incrementBigramTypes(); 23 | 24 | /** 25 | * @return The old count of the ngram (pre-update), but only if we do token counts 26 | */ 27 | public int incrementCount(NgramWrapper ngram, CountType countType); 28 | 29 | public void maybeResize(); 30 | 31 | public String 
getStringAnalysis(); 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/CountType.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | /** 4 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 5 | */ 6 | public enum CountType 7 | { 8 | TOKEN_INDEX(0), 9 | HISTORY_TYPE_INDEX(1), 10 | LOWER_ORDER_TYPE_INDEX(2), 11 | LOWER_ORDER_TYPE_NORMALIZER(3); 12 | 13 | private final int index; 14 | 15 | private CountType(int index) { 16 | this.index = index; 17 | } 18 | 19 | public int getIndex() { 20 | return index; 21 | } 22 | } -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/InterpolatingSingleLanguageModel.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | import java.util.List; 4 | import java.util.Set; 5 | 6 | import edu.berkeley.cs.nlp.ocular.util.ArrayHelper; 7 | import edu.berkeley.cs.nlp.ocular.util.Tuple2; 8 | import tberg.murphy.indexer.Indexer; 9 | 10 | /** 11 | * @author Dan Garrette (dhgarrette@gmail.com) 12 | */ 13 | public class InterpolatingSingleLanguageModel implements SingleLanguageModel { 14 | private static final long serialVersionUID = 1L; 15 | 16 | private SingleLanguageModel[] subModels; 17 | private double[] interpWeights; 18 | private int numModels; 19 | 20 | private Indexer charIndexer = null; 21 | private Set activeCharacters = null; 22 | private int maxOrder = -1; 23 | 24 | public InterpolatingSingleLanguageModel(List> subModelsAndinterpWeights) { 25 | numModels = subModelsAndinterpWeights.size(); 26 | 27 | subModels = new SingleLanguageModel[numModels]; 28 | interpWeights = new double[numModels]; 29 | 30 | double totalInterpWeight = 0.0; 31 | for (int i = 0; i < numModels; ++i) { 32 | Tuple2 modelAndWeight 
= subModelsAndinterpWeights.get(i); 33 | subModels[i] = modelAndWeight._1; 34 | interpWeights[i] = modelAndWeight._2; 35 | totalInterpWeight += interpWeights[i]; 36 | 37 | if (charIndexer == null) { 38 | charIndexer = subModels[i].getCharacterIndexer(); 39 | activeCharacters = subModels[i].getActiveCharacters(); 40 | int thisMaxOrder = subModels[i].getMaxOrder(); 41 | if (thisMaxOrder > maxOrder) 42 | maxOrder = thisMaxOrder; 43 | } else if (charIndexer != subModels[i].getCharacterIndexer()) { 44 | throw new RuntimeException("Sub-models don't all use the same character indexer"); 45 | } else if (activeCharacters != subModels[i].getActiveCharacters()) { 46 | throw new RuntimeException("Sub-models don't all use the same active-character set"); 47 | } 48 | } 49 | for (int i = 0; i < numModels; ++i) { 50 | interpWeights[i] /= totalInterpWeight; 51 | } 52 | } 53 | 54 | @Override 55 | public double getCharNgramProb(int[] context, int c) { 56 | double probSum = 0.0; 57 | for (int i = 0; i < numModels; ++i) { 58 | int[] shrunkenContext = subModels[i].shrinkContext(context); // context may be different for different submodels 59 | probSum += subModels[i].getCharNgramProb(shrunkenContext, c) * interpWeights[i]; 60 | } 61 | return probSum; 62 | } 63 | 64 | @Override 65 | public Indexer getCharacterIndexer() { 66 | return charIndexer; 67 | } 68 | 69 | @Override 70 | public Set getActiveCharacters() { 71 | return activeCharacters; 72 | } 73 | 74 | @Override 75 | public int getMaxOrder() { 76 | return maxOrder; 77 | } 78 | 79 | @Override 80 | public int[] shrinkContext(int[] originalContext) { 81 | int[] newContext = originalContext; 82 | while (!containsContext(newContext) && newContext.length > 0) { 83 | newContext = ArrayHelper.takeRight(newContext, newContext.length - 1); 84 | } 85 | return newContext; 86 | } 87 | 88 | @Override 89 | public boolean containsContext(int[] context) { 90 | for (SingleLanguageModel slm : subModels) { 91 | if (slm.containsContext(context)) { 92 | 
return true; 93 | } 94 | } 95 | return false; 96 | } 97 | 98 | public SingleLanguageModel getSubModel(int i) { 99 | return subModels[i]; 100 | } 101 | 102 | } 103 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/LanguageModel.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | import tberg.murphy.indexer.Indexer; 4 | 5 | /** 6 | * @author Dan Garrette (dhgarrette@gmail.com) 7 | */ 8 | public interface LanguageModel { 9 | 10 | public double getCharNgramProb(int[] context, int c); 11 | 12 | public Indexer getCharacterIndexer(); 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/LongArrWrapper.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | import java.io.Serializable; 4 | import java.util.Arrays; 5 | 6 | /** 7 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 8 | */ 9 | public class LongArrWrapper implements Serializable { 10 | private static final long serialVersionUID = 5942433644698840887L; 11 | public final long[] arr; 12 | 13 | public LongArrWrapper(long[] arr) { 14 | this.arr = arr; 15 | } 16 | 17 | @Override 18 | public boolean equals(Object other) { 19 | if (other == null || !(other instanceof LongArrWrapper)) { 20 | return false; 21 | } 22 | return Arrays.equals(this.arr, ((LongArrWrapper)other).arr); 23 | } 24 | 25 | @Override 26 | public int hashCode() { 27 | return Arrays.hashCode(this.arr); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/LongNgram.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | import 
tberg.murphy.indexer.Indexer; 4 | 5 | /** 6 | * Contains code for carrying out operations on trigrams encoded as longs. 7 | * Can be instantiated, but also has static methods so that the code can be 8 | * used without creating the object. 9 | * 10 | * Indices are packed into a long using BITS_PER_WORD bits per index, 11 | * up to MAX_ORDER indices. BITS_PER_WORD * MAX_ORDER must be <= 64 (use = at your own risk...) 12 | * When indices are in the long, 1 is added to each of them so that lower-order 13 | * n-grams (with 0s) can be differentiated from n-grams with the first character in the indexer 14 | * in them. 15 | * 16 | * @author Greg Durrett (gdurrett@cs.berkeley.edu) 17 | */ 18 | public class LongNgram { 19 | 20 | // 128 characters should be enough, this lets us do a 9-gram 21 | public static final int BITS_PER_WORD = 7; 22 | public static final int MAX_ORDER = 9; 23 | 24 | public static long[] convertToLong(int[] ngram) { 25 | return convertToLong(ngram, 0, ngram.length); 26 | } 27 | 28 | public static long[] convertToLong(int[] ngram, int start, int end) { 29 | // Add MAX_ORDER-1 to round up 30 | int numLongs = (end - start + MAX_ORDER-1)/MAX_ORDER; 31 | long[] longs = new long[numLongs]; 32 | int longIdx = numLongs - 1; 33 | for (int i = end; i > start; i -= MAX_ORDER) { 34 | longs[longIdx] = Ngram.convertToLong(ngram, Math.max(start, i - MAX_ORDER), i); 35 | longIdx--; 36 | } 37 | return longs; 38 | } 39 | 40 | public static int[] convertToIntArr(long[] ngram) { 41 | int[] arr = new int[LongNgram.getActualOrder(ngram)]; 42 | int ngramIdx = arr.length - 1; 43 | for (int longIdx = ngram.length - 1; longIdx >= 0; longIdx--) { 44 | int[] curr = Ngram.convertToIntArr(ngram[longIdx]); 45 | for (int i = curr.length - 1; i >= 0; i--) { 46 | arr[ngramIdx] = curr[i]; 47 | ngramIdx--; 48 | } 49 | } 50 | return arr; 51 | } 52 | 53 | // TODO: I think these methods work but they don't do clipping to arbitrary orders, 54 | // and I think it's easier to just 55 | // 
public static long[] getLowerOrder(long[] ngram) { 56 | // return LongNgram.getLowerOrder(ngram, LongNgram.getActualOrder(ngram)); 57 | // } 58 | // 59 | // public static long[] getLowerOrder(long[] ngram, int order) { 60 | // if (order % MAX_ORDER == 1) { 61 | // long[] newNgram = new long[ngram.length-1]; 62 | // System.arraycopy(ngram, 1, newNgram, 0, ngram.length-1); 63 | // return newNgram; 64 | // } else { 65 | // long[] newNgram = new long[ngram.length]; 66 | // System.arraycopy(ngram, 0, newNgram, 0, ngram.length); 67 | // newNgram[0] = Ngram.getLowerOrder(ngram[0]); 68 | // return newNgram; 69 | // } 70 | // } 71 | // 72 | // public static long[] getHistory(long[] ngram) { 73 | // return LongNgram.getHistory(ngram, LongNgram.getActualOrder(ngram)); 74 | // } 75 | // 76 | // public static long[] getHistory(long[] ngram, int order) { 77 | // long lowOrderMask = (1L << ((long)BITS_PER_WORD)) - 1L; 78 | // long[] newNgram; 79 | // int newNgramIdx; 80 | // long carryOver; 81 | // if (order % MAX_ORDER == 1) { 82 | // newNgram = new long[ngram.length-1]; 83 | // newNgramIdx = 0; 84 | // carryOver = ngram[0]; 85 | // } else { 86 | // newNgram = new long[ngram.length]; 87 | // newNgramIdx = 1; 88 | // carryOver = ngram[0] & lowOrderMask; 89 | // newNgram[0] = ngram[0] >>> BITS_PER_WORD; 90 | // } 91 | // for (int i = 1; i < ngram.length; i++) { 92 | // newNgram[newNgramIdx] = ngram[i] >>> BITS_PER_WORD + carryOver << (BITS_PER_WORD * (MAX_ORDER - 1)); 93 | // newNgramIdx++; 94 | // carryOver = ngram[i] & lowOrderMask; 95 | // } 96 | // return newNgram; 97 | // } 98 | // 99 | // public static long[] getLowerOrderHistory(long[] ngram) { 100 | // return LongNgram.getLowerOrderHistory(ngram, LongNgram.getActualOrder(ngram)); 101 | // } 102 | // 103 | // public static long[] getLowerOrderHistory(long[] ngram, int order) { 104 | // return LongNgram.getLowerOrder(LongNgram.getHistory(ngram, order), order - 1); 105 | // } 106 | 107 | public static int 
getActualOrder(long[] ngram) { 108 | if (ngram.length == 0) { 109 | return 0; 110 | } else { 111 | return (ngram.length - 1) * MAX_ORDER + Ngram.getActualOrder(ngram[0]); 112 | } 113 | } 114 | 115 | public static String toString(int[] ngram, Indexer indexer) { 116 | return LongNgram.toString(LongNgram.convertToLong(ngram), indexer); 117 | } 118 | 119 | public static String toString(long[] ngram, Indexer indexer) { 120 | int order = LongNgram.getActualOrder(ngram); 121 | String ngramStr = ""; 122 | for (int i = 0; i < ngram.length; i++) { 123 | ngramStr += Ngram.getNgramStr(ngram[i], indexer); 124 | } 125 | return "[" + order + ":" + ngramStr + "]"; 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/Ngram.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | import tberg.murphy.indexer.Indexer; 4 | 5 | /** 6 | * Contains code for carrying out operations on trigrams encoded as longs. 7 | * Can be instantiated, but also has static methods so that the code can be 8 | * used without creating the object. 9 | * 10 | * Indices are packed into a long using BITS_PER_WORD bits per index, 11 | * up to MAX_ORDER indices. BITS_PER_WORD * MAX_ORDER must be <= 64 (use = at your own risk...) 12 | * When indices are in the long, 1 is added to each of them so that lower-order 13 | * n-grams (with 0s) can be differentiated from n-grams with the first character in the indexer 14 | * in them. 
15 | * 16 | * @author Greg Durrett (gdurrett@cs.berkeley.edu) 17 | */ 18 | public class Ngram { 19 | 20 | // 128 characters should be enough, this lets us do a 9-gram 21 | public static final int BITS_PER_WORD = 7; 22 | public static final int MAX_ORDER = 9; 23 | public static final int[] CONVERTER = new int[MAX_ORDER]; 24 | 25 | private static int encodeWord(int rawWord) { 26 | return rawWord + 1; 27 | } 28 | 29 | private static int decodeWord(int encodedWord) { 30 | return encodedWord - 1; 31 | } 32 | 33 | public static long convertToLong(int[] ngram) { 34 | return convertToLong(ngram, 0, ngram.length); 35 | } 36 | 37 | public static long convertToLong(int[] ngram, int start, int end) { 38 | long l = 0; 39 | for (int i = start; i < end; i++) 40 | l = (l << BITS_PER_WORD) + encodeWord(ngram[i]); 41 | return l; 42 | } 43 | 44 | public static int[] convertToIntArr(long ngram) { 45 | // assert Ngram.getActualOrder(ngram) == MAX_ORDER : "Ngram of less than max order: " 46 | // + Ngram.toString(ngram) + ", order: " + Ngram.getActualOrder(ngram); 47 | int[] arr = new int[Ngram.getActualOrder(ngram)]; 48 | int i = 0; 49 | long wordMask = (1L << BITS_PER_WORD) - 1; 50 | while (ngram != 0) { 51 | arr[arr.length - 1 - i] = decodeWord((int) (ngram & wordMask)); 52 | i++; 53 | ngram = Ngram.getHistory(ngram); 54 | } 55 | return arr; 56 | } 57 | 58 | public static long getLowerOrder(long ngram) { 59 | return Ngram.getLowerOrder(ngram, Ngram.getActualOrder(ngram)); 60 | } 61 | 62 | public static long getLowerOrder(long ngram, int order) { 63 | long mask = (1L << ((order - 1) * BITS_PER_WORD)) - 1L; 64 | return mask & ngram; 65 | } 66 | 67 | public static long getHistory(long ngram) { 68 | return Ngram.getHistory(ngram, Ngram.getActualOrder(ngram)); 69 | } 70 | 71 | public static long getHistory(long ngram, int order) { 72 | long mask = ((1L << (((long) order - 1) * BITS_PER_WORD)) - 1L) << BITS_PER_WORD; 73 | return (mask & ngram) >> BITS_PER_WORD; 74 | } 75 | 76 | public 
static long getLowerOrderHistory(long ngram) { 77 | return Ngram.getLowerOrderHistory(ngram, Ngram.getActualOrder(ngram)); 78 | } 79 | 80 | public static long getLowerOrderHistory(long ngram, int order) { 81 | return Ngram.getLowerOrder(Ngram.getHistory(ngram, order), order - 1); 82 | } 83 | 84 | // public static long addWordAndShift(long ngram, int word) { 85 | // long mask = (1L << (((long) MAX_ORDER - 1) * BITS_PER_WORD)) - 1L << BITS_PER_WORD; 86 | // return ((ngram << BITS_PER_WORD) & mask) + encodeWord(word); 87 | // } 88 | 89 | public static int getActualOrder(long ngram) { 90 | for (int i = MAX_ORDER - 1; i >= 0; i--) { 91 | long mask = (1L << (((long) i) * BITS_PER_WORD)) - 1L; 92 | if ((ngram & mask) != ngram) 93 | return i + 1; 94 | } 95 | return 0; 96 | } 97 | 98 | public static String toString(int[] ngram, Indexer indexer) { 99 | return Ngram.toString(Ngram.convertToLong(ngram), indexer); 100 | } 101 | 102 | public static String toString(long ngram, Indexer indexer) { 103 | return "[" + Ngram.getActualOrder(ngram) + ":" + getNgramStr(ngram, indexer) + "]"; 104 | } 105 | 106 | public static String getNgramStr(long ngram, Indexer indexer) { 107 | String string = ""; 108 | int order = Ngram.getActualOrder(ngram); 109 | for (int i = 0; i < order; i++) { 110 | long mask = (1L << BITS_PER_WORD) - 1L; 111 | string = indexer.getObject(decodeWord((int) (ngram & mask))) + string; 112 | ngram = ngram >> BITS_PER_WORD; 113 | } 114 | return string; 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/NgramWrapper.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | /** 4 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 5 | */ 6 | public class NgramWrapper { 7 | 8 | public int[] ngram; 9 | public int start; 10 | public int end; 11 | 12 | private NgramWrapper() { 13 | 
this.ngram = null; 14 | this.start = -1; 15 | this.end = -1; 16 | } 17 | 18 | public static NgramWrapper getNew(int[] ngram, int start, int end) { 19 | NgramWrapper next = new NgramWrapper(); 20 | next.changeNgramWrapper(ngram, start, end); 21 | return next; 22 | } 23 | 24 | private void changeNgramWrapper(int[] ngram, int start, int end) { 25 | this.ngram = ngram; 26 | this.start = start; 27 | this.end = end; 28 | } 29 | 30 | public int getOrder() { 31 | return end - start; 32 | } 33 | 34 | public NgramWrapper getLowerOrder() { 35 | return getNew(ngram, start + 1, end); 36 | } 37 | 38 | public NgramWrapper getLowerOrder(int order) { 39 | return getNew(ngram, end - order, end); 40 | } 41 | 42 | public NgramWrapper getHistory() { 43 | return getNew(ngram, start, end - 1); 44 | } 45 | 46 | public long getLongRep() { 47 | return Ngram.convertToLong(ngram, start, end); 48 | } 49 | 50 | public long[] getLongerRep() { 51 | return LongNgram.convertToLong(ngram, start, end); 52 | } 53 | 54 | public String toString() { 55 | String str = "["; 56 | for (int i = start; i < end; i++) { 57 | str += ngram[i] + ", "; 58 | } 59 | if (str.length() == 1) { 60 | return str + "]"; 61 | } else { 62 | return str.substring(0, str.length() - 2) + "]"; 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/lm/SingleLanguageModel.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.lm; 2 | 3 | import java.io.Serializable; 4 | import java.util.Set; 5 | 6 | import edu.berkeley.cs.nlp.ocular.lm.LanguageModel; 7 | 8 | /** 9 | * @author Dan Garrette (dhgarrette@gmail.com) 10 | */ 11 | public interface SingleLanguageModel extends LanguageModel, Serializable { 12 | 13 | public Set getActiveCharacters(); 14 | public int getMaxOrder(); 15 | public int[] shrinkContext(int[] originalContext); 16 | public boolean containsContext(int[] 
package edu.berkeley.cs.nlp.ocular.lm;

import tberg.murphy.indexer.Indexer;

import java.util.Set;

import edu.berkeley.cs.nlp.ocular.lm.SingleLanguageModel;

/**
 * A language model that assigns equal probability to every "active" character
 * and zero probability to all others, ignoring the context entirely.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class UniformLanguageModel implements SingleLanguageModel {
	private static final long serialVersionUID = 398523984923L;

	final private Set<Integer> activeCharacters; // indices (into charIndexer) of characters with non-zero probability
	final private Indexer<String> charIndexer;
	final private int maxOrder;
	final private boolean[] isActive; // fast membership test, indexed by character index
	final private double prob; // uniform probability mass given to each active character

	public UniformLanguageModel(Set<Integer> activeCharacters, Indexer<String> charIndexer, int maxOrder) {
		this.activeCharacters = activeCharacters;
		this.charIndexer = charIndexer;
		this.maxOrder = maxOrder;

		isActive = new boolean[charIndexer.size()];
		for (int c : activeCharacters) {
			isActive[c] = true;
		}
		// NOTE(review): if activeCharacters is empty this is a division by zero
		// (prob becomes Infinity); presumably callers always pass a non-empty
		// set — confirm.
		this.prob = 1.0 / activeCharacters.size();
	}

	public Set<Integer> getActiveCharacters() {
		return activeCharacters;
	}

	// Context is irrelevant to a uniform model, so it is returned unchanged.
	public int[] shrinkContext(int[] context) {
		return context;
	}

	// Every context is "known", since the distribution ignores it.
	public boolean containsContext(int[] context) {
		return true;
	}

	// Uniform over active characters; zero for inactive ones.
	public double getCharNgramProb(int[] context, int c) {
		if (isActive[c])
			return prob;
		else
			return 0.0;
	}

	public Indexer<String> getCharacterIndexer() {
		return charIndexer;
	}

	public int getMaxOrder() {
		return maxOrder;
	}

}
package edu.berkeley.cs.nlp.ocular.main;

import java.util.List;

import edu.berkeley.cs.nlp.ocular.data.Document;
import edu.berkeley.cs.nlp.ocular.data.LazyRawImageLoader;

/**
 * Entry point that performs line extraction only: loads the input page images
 * and extracts their line images (written to -extractedLinesPath), without
 * doing any transcription or training.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class ExtractLinesOnly extends LineExtractionOptions {

	public static void main(String[] args) {
		System.out.println("ExtractLinesOnly");
		ExtractLinesOnly main = new ExtractLinesOnly();
		main.doMain(main, args);
	}

	protected void validateOptions() {
		super.validateOptions();
		// Unlike other runnables, the extraction output location is mandatory
		// here — extracted lines are this program's only product.
		if (extractedLinesPath == null) throw new IllegalArgumentException("-extractedLinesPath is required.");
	}

	public void run(List<String> commandLineArgs) {
		List<String> inputDocPathList = getInputDocPathList();
		List<Document> inputDocuments = LazyRawImageLoader.loadDocuments(inputDocPathList, extractedLinesPath, numDocs, numDocsToSkip, uniformLineHeight, binarizeThreshold, crop);
		if (inputDocuments.isEmpty()) throw new NoDocumentsFoundException();
		// Forcing each lazy document to materialize its line images is what
		// actually triggers the extraction (and writing) work.
		for (Document doc : inputDocuments) {
			doc.loadLineImages();
		}
	}

}
package edu.berkeley.cs.nlp.ocular.main;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.List;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import edu.berkeley.cs.nlp.ocular.gsm.BasicGlyphSubstitutionModel.BasicGlyphSubstitutionModelFactory;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphSubstitutionModel;
import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel;
import tberg.murphy.fig.Option;
import tberg.murphy.indexer.Indexer;

/**
 * Entry point that initializes a uniform glyph substitution model (GSM) from a
 * trained language model and serializes it (gzipped) to disk.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class InitializeGlyphSubstitutionModel extends OcularRunnable {

	@Option(gloss = "Path to the language model file (so that it knows which characters to create images for).")
	public static String inputLmPath = null; // Required.

	// Fixed gloss: this option is the GSM output path, not a font path.
	@Option(gloss = "Output glyph substitution model file path.")
	public static String outputGsmPath = null; // Required.

	@Option(gloss = "The default number of counts that every glyph gets in order to smooth the glyph substitution model estimation.")
	public static double gsmSmoothingCount = 1.0;

	@Option(gloss = "gsmElisionSmoothingCountMultiplier.")
	public static double gsmElisionSmoothingCountMultiplier = 100.0;

	@Option(gloss = "Exponent on GSM scores.")
	public static double gsmPower = 4.0;

	public static void main(String[] args) {
		System.out.println("InitializeGlyphSubstitutionModel");
		InitializeGlyphSubstitutionModel main = new InitializeGlyphSubstitutionModel();
		main.doMain(main, args);
	}

	protected void validateOptions() {
		if (inputLmPath == null) throw new IllegalArgumentException("-inputLmPath not set");
		if (outputGsmPath == null) throw new IllegalArgumentException("-outputGsmPath not set");
	}

	public void run(List<String> commandLineArgs) {
		final CodeSwitchLanguageModel lm = InitializeLanguageModel.readCodeSwitchLM(inputLmPath);
		final Indexer<String> charIndexer = lm.getCharacterIndexer();
		final Indexer<String> langIndexer = lm.getLanguageIndexer();
		Set<String>[] activeCharacterSets = FonttrainTranscribeShared.makeActiveCharacterSets(lm);

		// These factory arguments are unused when merely building a uniform model.
		int minCountsForEvalGsm = 0;
		String outputPath = null;

		BasicGlyphSubstitutionModelFactory factory = new BasicGlyphSubstitutionModelFactory(
				gsmSmoothingCount, gsmElisionSmoothingCountMultiplier,
				langIndexer, charIndexer,
				activeCharacterSets, gsmPower, minCountsForEvalGsm, outputPath);

		System.out.println("Initializing a uniform GSM.");
		GlyphSubstitutionModel gsm = factory.uniform();

		// Typo fix: "intialized" -> "initialized" in the status message.
		System.out.println("Writing initialized gsm to " + outputGsmPath);
		writeGSM(gsm, outputGsmPath);
	}

	/**
	 * Deserializes a gzipped GlyphSubstitutionModel from gsmPath.
	 * @throws RuntimeException wrapping any I/O or deserialization failure,
	 *         or if the file does not exist.
	 */
	public static GlyphSubstitutionModel readGSM(String gsmPath) {
		ObjectInputStream in = null;
		try {
			File file = new File(gsmPath);
			if (!file.exists()) {
				throw new RuntimeException("Serialized GlyphSubstitutionModel file " + gsmPath + " not found");
			}
			in = new ObjectInputStream(new GZIPInputStream(new FileInputStream(file)));
			return (GlyphSubstitutionModel) in.readObject();
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			if (in != null)
				try { in.close(); } catch (IOException e) { throw new RuntimeException(e); }
		}
	}

	/**
	 * Serializes the model, gzipped, to gsmPath, creating parent directories
	 * as needed.
	 */
	public static void writeGSM(GlyphSubstitutionModel gsm, String gsmPath) {
		ObjectOutputStream out = null;
		try {
			new File(gsmPath).getAbsoluteFile().getParentFile().mkdirs();
			out = new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(gsmPath)));
			out.writeObject(gsm);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			if (out != null)
				try { out.close(); } catch (IOException e) { throw new RuntimeException(e); }
		}
	}

}
package edu.berkeley.cs.nlp.ocular.main;

import java.io.File;
import java.util.Arrays;
import java.util.List;

import tberg.murphy.fig.Option;
import tberg.murphy.fileio.f;

/**
 * Shared command-line options for all runnables that read document images and
 * extract text lines from them.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public abstract class LineExtractionOptions extends OcularRunnable {

	// Main Options

	@Option(gloss = "Path to the directory that contains the input document images. The entire directory will be searched recursively for any files that do not end in `.txt` (and that do not start with `.`). Files will be processed in lexicographical order.")
	public static String inputDocPath = null; // Either inputDocPath or inputDocListPath is required.

	@Option(gloss = "Path to a file that contains a list of paths to images files that should be used. The file should contain one path per line. These paths will be searched in order. Each path may point to either a file or a directory, which will be searched recursively for any files that do not end in `.txt` (and that do not start with `.`). Paths will be processed in the order given in the file, and each path will be searched in lexicographical order.")
	public static String inputDocListPath = null; // Either inputDocPath or inputDocListPath is required.

	@Option(gloss = "Number of documents (pages) to use, counting alphabetically. Ignore or use 0 to use all documents. Default: Use all documents.")
	public static int numDocs = Integer.MAX_VALUE;

	@Option(gloss = "Number of training documents (pages) to skip over, counting alphabetically. Useful, in combination with -numDocs, if you want to break a directory of documents into several chunks.")
	public static int numDocsToSkip = 0;

	@Option(gloss = "Path of the directory where the line-extraction images should be read/written. If the line files exist here, they will be used; if not, they will be extracted and then written here. Useful if: 1) you plan to run Ocular on the same documents multiple times and you want to save some time by not re-extracting the lines, or 2) you use an alternate line extractor (such as Tesseract) to pre-process the document. If ignored, the document will simply be read from the original document image file, and no line images will be written.")
	public static String extractedLinesPath = null; // Don't read or write line image files.

	// Line Extraction Options

	@Option(gloss = "Quantile to use for pixel value thresholding. (High values mean more black pixels.)")
	public static double binarizeThreshold = 0.12;

	@Option(gloss = "Crop pages?")
	public static boolean crop = true;

	@Option(gloss = "Scale all lines to have the same height?")
	public static boolean uniformLineHeight = true;



	protected void validateOptions() {
		// Exactly one of the two input-path options must be given.
		if ((inputDocPath == null) == (inputDocListPath == null)) throw new IllegalArgumentException("Either -inputDocPath or -inputDocListPath is required.");
		if (inputDocPath != null)
			for (String path : inputDocPath.split("[\\s,;:]+"))
				if (!new File(path).exists()) throw new IllegalArgumentException("inputDocPath "+path+" does not exist [looking in "+(new File(".").getAbsolutePath())+"]");
		if (inputDocListPath != null && !new File(inputDocListPath).exists()) throw new IllegalArgumentException("-inputDocListPath "+inputDocListPath+" does not exist [looking in "+(new File(".").getAbsolutePath())+"]");
		if (numDocsToSkip < 0) throw new IllegalArgumentException("-numDocsToSkip must be >= 0. Was "+numDocsToSkip+".");
	}

	/**
	 * The document paths to process: either -inputDocPath split on delimiters,
	 * or the lines of the -inputDocListPath file.
	 *
	 * BUG FIX: previously split on "[\\s+,;:]" — that puts the `+` *inside* the
	 * character class (matching a literal '+') and does not collapse delimiter
	 * runs, so e.g. "a, b" produced an empty path entry. Now uses the same
	 * regex that validateOptions() validates against: "[\\s,;:]+".
	 */
	protected static List<String> getInputDocPathList() {
		return inputDocPath != null ? Arrays.asList(inputDocPath.split("[\\s,;:]+")) : f.readLines(inputDocListPath);
	}


}
package edu.berkeley.cs.nlp.ocular.main;

import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.List;

import tberg.murphy.fig.OptionsParser;

/**
 * Base class for Ocular's command-line entry points: parses and validates
 * options, runs the subclass's job, and reports wall-clock timing.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public abstract class OcularRunnable {

	// NOTE(review): SimpleDateFormat is not thread-safe; appears fine here since
	// formatting only happens from doMain's thread — confirm if threading is added.
	private SimpleDateFormat sdf = new SimpleDateFormat("MM/dd/yyyy HH:mm:ss");

	/** Parse args into static @Option fields, validate, run the job, and print start/end times. */
	final protected void doMain(OcularRunnable main, String[] args) {
		System.out.println(toArgListString(args));
		long startTime = System.currentTimeMillis();
		printStartTime(startTime);
		OptionsParser parser = new OptionsParser();
		parser.doRegisterAll(new Object[] { main });
		if (!parser.doParse(args)) System.exit(1);
		main.validateOptions();
		main.run(Arrays.asList(args));
		long endTime = System.currentTimeMillis();
		printEndTime(startTime, endTime);
	}

	abstract protected void run(List<String> commandLineArgs);

	/** Subclasses throw IllegalArgumentException here for bad or missing options. */
	abstract protected void validateOptions();

	private static String toArgListString(String[] args) {
		// StringBuilder instead of StringBuffer: no synchronization is needed
		// for this single-threaded, method-local buffer.
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < args.length; ++i) {
			sb.append(" " + args[i]);
			if (i % 2 != 0) // args come in "-flag value" pairs; newline after each pair
				sb.append("\n");
		}
		return sb.toString();
	}

	private void printEndTime(long startTime, long endTime) {
		System.out.println("\n"+ formatElapsedTime(endTime - startTime) + " elapsed. Completed at "+sdf.format(new Date(endTime)));
	}

	private void printStartTime(long startTime) {
		System.out.println("Started job at "+sdf.format(new Date(startTime))+"\n");
	}

	/**
	 * Format a millisecond duration as HH:MM:SS. (Renamed from the misleading
	 * convertSecondsToAmountOfTimeString — the argument is milliseconds.)
	 */
	private String formatElapsedTime(long millis) {
		long seconds = millis / 1000;
		long s = seconds % 60;
		long m = (seconds / 60) % 60;
		long h = (seconds / (60 * 60));
		return String.format("%02d:%02d:%02d", h,m,s);
	}

}
package edu.berkeley.cs.nlp.ocular.main;

import java.util.List;
import java.util.Set;

import edu.berkeley.cs.nlp.ocular.data.Document;
import edu.berkeley.cs.nlp.ocular.data.LazyRawImageLoader;
import edu.berkeley.cs.nlp.ocular.eval.BasicSingleDocumentEvaluatorAndOutputPrinter;
import edu.berkeley.cs.nlp.ocular.eval.MultiDocumentTranscriber;
import edu.berkeley.cs.nlp.ocular.eval.SingleDocumentEvaluatorAndOutputPrinter;
import edu.berkeley.cs.nlp.ocular.font.Font;
import edu.berkeley.cs.nlp.ocular.gsm.BasicGlyphSubstitutionModel.BasicGlyphSubstitutionModelFactory;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphSubstitutionModel;
import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel;
import edu.berkeley.cs.nlp.ocular.model.DecoderEM;
import edu.berkeley.cs.nlp.ocular.train.FontTrainer;
import edu.berkeley.cs.nlp.ocular.train.TrainingRestarter;
import edu.berkeley.cs.nlp.ocular.util.FileUtil;
import tberg.murphy.fig.Option;
import tberg.murphy.indexer.Indexer;

/**
 * Entry point for font training: runs EM over the input documents to learn a
 * font (and optionally updated LM/GSM), writing models to the output paths.
 *
 * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu)
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class TrainFont extends FonttrainTranscribeShared {

	@Option(gloss = "Number of iterations of EM to use for font learning.")
	public static int numEMIters = 3;

	@Option(gloss = "If true, the font trainer will find the latest completed iteration in the outputPath and load it in order to pick up training from that point. Convenient if a training run crashes when only partially completed.")
	public static boolean continueFromLastCompleteIteration = false;

	@Option(gloss = "When using -evalInputDocPath, the font trainer will perform an evaluation every `evalFreq` iterations. Default: Evaluate only after all iterations have completed.")
	public static int evalFreq = Integer.MAX_VALUE;


	public static void main(String[] args) {
		System.out.println("TrainFont");
		TrainFont main = new TrainFont();
		main.doMain(main, args);
	}

	protected void validateOptions() {
		super.validateOptions();

		// BUG FIX: the exception was previously constructed but never thrown
		// ("new IllegalArgumentException(...)" with no "throw"), so a
		// non-positive -numEMIters was silently accepted.
		if (numEMIters <= 0) throw new IllegalArgumentException("-numEMIters must be a positive number.");

		if (outputFontPath == null) throw new IllegalArgumentException("-outputFontPath is required for font training.");
	}

	public void run(List<String> commandLineArgs) {
		Set outputFormats = parseOutputFormats();

		CodeSwitchLanguageModel initialLM = loadInputLM();
		Font initialFont = loadInputFont();
		BasicGlyphSubstitutionModelFactory gsmFactory = makeGsmFactory(initialLM);
		GlyphSubstitutionModel initialGSM = loadInitialGSM(gsmFactory);

		Indexer<String> charIndexer = initialLM.getCharacterIndexer();
		Indexer<String> langIndexer = initialLM.getLanguageIndexer();

		DecoderEM decoderEM = makeDecoder(charIndexer);

		boolean evalCharIncludesDiacritic = true;
		SingleDocumentEvaluatorAndOutputPrinter documentOutputPrinterAndEvaluator = new BasicSingleDocumentEvaluatorAndOutputPrinter(charIndexer, langIndexer, allowGlyphSubstitution, evalCharIncludesDiacritic, commandLineArgs);

		List<String> inputDocPathList = getInputDocPathList();
		List<Document> inputDocuments = LazyRawImageLoader.loadDocuments(inputDocPathList, extractedLinesPath, numDocs, numDocsToSkip, uniformLineHeight, binarizeThreshold, crop);
		if (inputDocuments.isEmpty()) throw new NoDocumentsFoundException();
		if (updateDocBatchSize > 0 && inputDocuments.size() < updateDocBatchSize) throw new RuntimeException("The number of available documents is less than -updateDocBatchSize!");

		String newInputDocPath = FileUtil.lowestCommonPath(inputDocPathList);

		MultiDocumentTranscriber evalSetEvaluator = makeEvalSetEvaluator(charIndexer, decoderEM, documentOutputPrinterAndEvaluator);
		new FontTrainer().trainFont(
				inputDocuments,
				initialFont, initialLM, initialGSM,
				continueFromLastCompleteIteration ? new TrainingRestarter() : null,
				outputFontPath, outputLmPath, outputGsmPath,
				decoderEM,
				gsmFactory, documentOutputPrinterAndEvaluator,
				// Batch size 0 (or negative) means "use all documents per update".
				numEMIters, updateDocBatchSize > 0 ? updateDocBatchSize : inputDocuments.size(), false, true,
				numMstepThreads,
				newInputDocPath, outputPath, outputFormats,
				evalSetEvaluator, evalFreq, evalBatches,
				skipFailedDocs);
	}

}
/*
 * FROM:
 * http://core0.staticworld.net/downloads/idge/imported/article/jvw/2006/10/gridlayout2.java
 *
 */

package edu.berkeley.cs.nlp.ocular.main.gui;

import java.awt.*;

/**
 * Grid Layout which allows components of different sizes: each column is as
 * wide as its widest component and each row as tall as its tallest, and the
 * whole grid is scaled to fill the parent.
 *
 * @author
 */
public class GridLayout2 extends GridLayout {
	private static final long serialVersionUID = 1L;

	public GridLayout2() {
		this(1, 0, 0, 0);
	}

	public GridLayout2(int rows, int cols) {
		this(rows, cols, 0, 0);
	}

	public GridLayout2(int rows, int cols, int hgap, int vgap) {
		super(rows, cols, hgap, vgap);
	}

	public Dimension preferredLayoutSize(Container parent) {
		return layoutSize(parent, true);
	}

	public Dimension minimumLayoutSize(Container parent) {
		// BUG FIX: this method used to unconditionally System.err.println its
		// name (a stray debug print; the analogous prints in the other two
		// methods were commented out). Removed.
		return layoutSize(parent, false);
	}

	/**
	 * Shared implementation of preferred/minimumLayoutSize (the two methods
	 * were previously duplicated line-for-line except for the size getter).
	 * Column widths are the max (preferred|minimum) width in each column; row
	 * heights likewise.
	 */
	private Dimension layoutSize(Container parent, boolean preferred) {
		synchronized (parent.getTreeLock()) {
			Insets insets = parent.getInsets();
			int ncomponents = parent.getComponentCount();
			int nrows = getRows();
			int ncols = getColumns();
			// One of rows/cols is fixed by the constructor; derive the other
			// from the component count (rounding up).
			if (nrows > 0) {
				ncols = (ncomponents + nrows - 1) / nrows;
			}
			else {
				nrows = (ncomponents + ncols - 1) / ncols;
			}
			int[] w = new int[ncols];
			int[] h = new int[nrows];
			for (int i = 0; i < ncomponents; i++) {
				int r = i / ncols;
				int c = i % ncols;
				Component comp = parent.getComponent(i);
				Dimension d = preferred ? comp.getPreferredSize() : comp.getMinimumSize();
				if (w[c] < d.width) {
					w[c] = d.width;
				}
				if (h[r] < d.height) {
					h[r] = d.height;
				}
			}
			int nw = 0;
			for (int j = 0; j < ncols; j++) {
				nw += w[j];
			}
			int nh = 0;
			for (int i = 0; i < nrows; i++) {
				nh += h[i];
			}
			return new Dimension(insets.left + insets.right + nw + (ncols - 1) * getHgap(),
					insets.top + insets.bottom + nh + (nrows - 1) * getVgap());
		}
	}

	public void layoutContainer(Container parent) {
		//System.err.println("layoutContainer");
		synchronized (parent.getTreeLock()) {
			Insets insets = parent.getInsets();
			int ncomponents = parent.getComponentCount();
			int nrows = getRows();
			int ncols = getColumns();
			if (ncomponents == 0) {
				return;
			}
			if (nrows > 0) {
				ncols = (ncomponents + nrows - 1) / nrows;
			}
			else {
				nrows = (ncomponents + ncols - 1) / ncols;
			}
			int hgap = getHgap();
			int vgap = getVgap();
			// scaling factors: stretch/shrink preferred sizes to fill the parent
			Dimension pd = preferredLayoutSize(parent);
			double sw = (1.0 * parent.getWidth()) / pd.width;
			double sh = (1.0 * parent.getHeight()) / pd.height;
			// scale each component's preferred size, then take per-column/row maxes
			int[] w = new int[ncols];
			int[] h = new int[nrows];
			for (int i = 0; i < ncomponents; i++) {
				int r = i / ncols;
				int c = i % ncols;
				Component comp = parent.getComponent(i);
				Dimension d = comp.getPreferredSize();
				d.width = (int) (sw * d.width);
				d.height = (int) (sh * d.height);
				if (w[c] < d.width) {
					w[c] = d.width;
				}
				if (h[r] < d.height) {
					h[r] = d.height;
				}
			}
			// place components column-major within their computed cells
			for (int c = 0, x = insets.left; c < ncols; c++) {
				for (int r = 0, y = insets.top; r < nrows; r++) {
					int i = r * ncols + c;
					if (i < ncomponents) {
						parent.getComponent(i).setBounds(x, y, w[c], h[r]);
					}
					y += h[r] + vgap;
				}
				x += w[c] + hgap;
			}
		}
	}
}
-------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/model/DecodeState.java: --------------------------------------------------------------------------------
package edu.berkeley.cs.nlp.ocular.model;

import edu.berkeley.cs.nlp.ocular.model.transition.SparseTransitionModel.TransitionState;

/**
 * Immutable record pairing one transition state with the rendering
 * measurements chosen for it during decoding.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class DecodeState {
	public final TransitionState ts;
	public final int charAndPadWidth;
	public final int charWidth;
	public final int padWidth;
	public final int exposure;
	public final int verticalOffset;

	public DecodeState(TransitionState ts, int charAndPadWidth, int padWidth, int exposure, int verticalOffset) {
		this.ts = ts;
		this.verticalOffset = verticalOffset;
		this.exposure = exposure;
		this.padWidth = padWidth;
		this.charAndPadWidth = charAndPadWidth;
		// Derived field: the character's own width excludes the padding
		// portion of the combined width.
		this.charWidth = charAndPadWidth - padWidth;
	}

}
-------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/model/TransitionStateType.java: --------------------------------------------------------------------------------
package edu.berkeley.cs.nlp.ocular.model;

/**
 * The kinds of states the transition model distinguishes.
 *
 * Naming suggests TMPL is a character-template state and the others are
 * left/right margin states with hyphenation (HPHN) variants — the constants'
 * semantics are not defined in this file, so that reading should be
 * confirmed against the transition-model code.
 *
 * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu)
 */
public enum TransitionStateType {
	TMPL, LMRGN, LMRGN_HPHN, RMRGN, RMRGN_HPHN_INIT, RMRGN_HPHN
}
-------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/model/em/DefaultInnerLoop.java: --------------------------------------------------------------------------------
package edu.berkeley.cs.nlp.ocular.model.em;

import edu.berkeley.cs.nlp.ocular.model.CharacterTemplate;
import tberg.murphy.gpu.CudaUtil;

/**
 * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu)
 */
// NOTE(review): extraction-garbled span, kept byte-identical. The text of
// DefaultInnerLoop.compute(...) is truncated mid-expression ("for (int i=0; i>")
// and jumps directly into the SparseTransitionModel interface; generic type
// parameters were also stripped throughout this dump. Recover these files from
// the original repository before editing this region.
public class DefaultInnerLoop implements EmissionCacheInnerLoop { 10 | 11 | int numThreads; 12 | float[][] whiteTemplates; 13 | float[][] blackTemplates; 14 | int[] templateNumIndices; 15 | int[] templateIndicesOffsets; 16 | int maxTemplateWidth; 17 | int minTemplateWidth; 18 | 19 | public DefaultInnerLoop(int numThreads) { 20 | this.numThreads = numThreads; 21 | } 22 | 23 | public void startup(float[][] whiteTemplates, float[][] blackTemplates, int[] templateNumIndices, int[] templateIndicesOffsets, int minTemplateWidth, int maxTemplateWidth, int maxSequenceLength, int totalTemplateNumIndices) { 24 | this.whiteTemplates = whiteTemplates; 25 | this.blackTemplates = blackTemplates; 26 | this.templateNumIndices = templateNumIndices; 27 | this.templateIndicesOffsets = templateIndicesOffsets; 28 | this.maxTemplateWidth = maxTemplateWidth; 29 | this.minTemplateWidth = minTemplateWidth; 30 | } 31 | 32 | public void shutdown() { 33 | } 34 | 35 | public void compute(final float[] scores, final float[] whiteObservations, final float[] blackObservations, final int sequenceLength) { 36 | for (int tw=minTemplateWidth; tw<=maxTemplateWidth; ++tw) { 37 | float[] whiteTemplatesForWidth = whiteTemplates[tw-minTemplateWidth]; 38 | float[] blackTemplateForWidth = blackTemplates[tw-minTemplateWidth]; 39 | for (int t=0; t<(sequenceLength-tw)+1; ++t) { 40 | for (int i=0; i> forwardTransitions(); 33 | public Collection> nextLineStartStates(); 34 | public double endLogProb(); 35 | } 36 | 37 | public Collection> startStates(); 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/output/HtmlOutputWriter.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.output; 2 | 3 | import java.io.File; 4 | import java.util.List; 5 | 6 | import edu.berkeley.cs.nlp.ocular.data.textreader.Charset; 7 | import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar; 8 
// NOTE(review): extraction-garbled span, kept byte-identical. The HTML tag
// text inside HtmlOutputWriter's string literals (e.g. the arguments to
// outputBuffer.append("...")) was stripped by the extraction, and generic
// parameters (Indexer<String>, List<DecodeState>[]) are missing. These string
// literals are runtime output and cannot be reconstructed from this dump —
// recover HtmlOutputWriter.java from the repository before editing.
| import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar.GlyphType; 9 | import edu.berkeley.cs.nlp.ocular.model.DecodeState; 10 | import edu.berkeley.cs.nlp.ocular.model.transition.SparseTransitionModel.TransitionState; 11 | import edu.berkeley.cs.nlp.ocular.util.FileUtil; 12 | import tberg.murphy.fileio.f; 13 | import tberg.murphy.indexer.Indexer; 14 | 15 | /** 16 | * @author Dan Garrette (dhgarrette@gmail.com) 17 | */ 18 | public class HtmlOutputWriter { 19 | 20 | private Indexer charIndexer; 21 | private Indexer langIndexer; 22 | 23 | public HtmlOutputWriter(Indexer charIndexer, Indexer langIndexer) { 24 | this.charIndexer = charIndexer; 25 | this.langIndexer = langIndexer; 26 | } 27 | 28 | public void write(int numLines, List[] viterbiTransStates, String imgFilename, String outputFilenameBase) { 29 | String htmlOutputFilename = outputFilenameBase + ".html"; 30 | 31 | StringBuffer outputBuffer = new StringBuffer(); 32 | outputBuffer.append("\n"); 33 | outputBuffer.append("\n"); 34 | outputBuffer.append("\n"); 35 | outputBuffer.append("
 \n"); 36 | 37 | String[] colors = new String[] { "Black", "Red", "Blue", "Olive", "Orange", "Magenta", "Lime", "Cyan", "Purple", "Green", "Brown" }; 38 | 39 | int prevLanguage = -1; 40 | for (int line = 0; line < numLines; ++line) { 41 | for (DecodeState ds : viterbiTransStates[line]) { 42 | TransitionState ts = ds.ts; 43 | int lmChar = ts.getLmCharIndex(); 44 | GlyphChar glyph = ts.getGlyphChar(); 45 | int glyphChar = glyph.templateCharIndex; 46 | String sglyphChar = Charset.unescapeChar(charIndexer.getObject(glyphChar)); 47 | 48 | int currLanguage = ts.getLanguageIndex(); 49 | if (currLanguage != prevLanguage) { 50 | outputBuffer.append(""); 51 | } 52 | 53 | if (lmChar != glyphChar || glyph.glyphType != GlyphType.NORMAL_CHAR) { 54 | String norm = Charset.unescapeChar(charIndexer.getObject(lmChar)); 55 | String dipl = (glyph.glyphType == GlyphType.DOUBLED ? "2x"+sglyphChar : glyph.isElided() ? "" : sglyphChar); 56 | outputBuffer.append("[" + norm + "/" + dipl + "]"); 57 | } 58 | else { 59 | outputBuffer.append(sglyphChar); 60 | } 61 | 62 | prevLanguage = currLanguage; 63 | } 64 | outputBuffer.append("
 \n"); 65 | } 66 | outputBuffer.append("
 
 
 
 \n"); 67 | for (int i = -1; i < langIndexer.size(); ++i) { 68 | outputBuffer.append("" + (i < 0 ? "none" : langIndexer.getObject(i)) + "
 \n"); 69 | } 70 | 71 | outputBuffer.append("
 \n"); 72 | outputBuffer.append("\n"); 73 | outputBuffer.append("
 \n"); 74 | outputBuffer.append("\n"); 75 | outputBuffer.append("\n\n\n"); 76 | outputBuffer.append("\n\n\n\n\n"); 77 | String outputString = outputBuffer.toString(); 78 | 79 | System.out.println("Writing html output to " + htmlOutputFilename); 80 | f.writeString(htmlOutputFilename, outputString); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/preprocessing/Binarizer.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.preprocessing; 2 | 3 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils; 4 | 5 | /** 6 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 7 | */ 8 | public class Binarizer { 9 | 10 | public static boolean isBinary(double[][] levels) { 11 | int[] histogram = new int[(int) ImageUtils.MAX_LEVEL+1]; 12 | for (int i=0; i 0) nonZeroEntries++; 20 | } 21 | return nonZeroEntries <= 2; 22 | } 23 | 24 | public static void binarizeAlreadyBinary(double[][] levels) { 25 | double min = Double.POSITIVE_INFINITY; 26 | double max = Double.NEGATIVE_INFINITY; 27 | for (double[] vals : levels) { 28 | for (double val : vals) { 29 | min = Math.min(val, min); 30 | max = Math.max(val, max); 31 | } 32 | } 33 | double threshold = (max + min) / 2.0; 34 | for (int i = 0; i < levels.length; i++) { 35 | for (int j = 0; j < levels[i].length; j++) { 36 | if (levels[i][j] <= threshold) { 37 | levels[i][j] = 0; 38 | } else { 39 | levels[i][j] = ImageUtils.MAX_LEVEL; 40 | } 41 | } 42 | } 43 | } 44 | 45 | public static void binarizeGlobal(double blackPercential, double[][] levels) { 46 | if (isBinary(levels)) { 47 | binarizeAlreadyBinary(levels); 48 | return; 49 | } 50 | 51 | int[] histogram = new int[(int) ImageUtils.MAX_LEVEL+1]; 52 | int total = 0; 53 | for (int i=0; i= rank) { 66 | threshold = v; 67 | break; 68 | } 69 | } 70 | for (int i = 0; i < levels.length; i++) { 71 | for (int j = 0; j < 
// NOTE(review): kept byte-identical. This span (tail of Binarizer,
// LineExtractor, ManualStackCropperPrep, Straightener, head of Test) has had
// generic type parameters stripped by extraction (e.g. "List result" was
// presumably List<double[][]>, "List> segments" List<Pair<Integer,Integer>>),
// and Straightener.verticalTotalVariation is truncated mid-loop ("for (int
// i=0; i 1) {" jumps into main). Also note the suspicious typo
// "blackPercential" (presumably "blackPercentile") in the public API — renaming
// would break callers, so it is only flagged here. Recover originals from the
// repository before editing.
levels[i].length; j++) { 72 | if (levels[i][j] <= threshold) { 73 | levels[i][j] = 0; 74 | } else { 75 | levels[i][j] = ImageUtils.MAX_LEVEL; 76 | } 77 | } 78 | } 79 | } 80 | 81 | public static void binarizeLocal(double blackPercential, double radiusFactor, double[][] levels) { 82 | if (isBinary(levels)) { 83 | binarizeAlreadyBinary(levels); 84 | return; 85 | } 86 | 87 | int radius = (int) (levels.length * radiusFactor); 88 | 89 | int dWidth = (int) Math.ceil((double) levels.length / radius); 90 | int dHeight = (int) Math.ceil((double) levels[0].length / radius); 91 | double[][] thresholds = new double[dWidth][dHeight]; 92 | for (int di=0; di= rank) { 111 | threshold = v; 112 | break; 113 | } 114 | } 115 | thresholds[di][dj] = threshold; 116 | } 117 | } 118 | } 119 | 120 | 121 | for (int i = 0; i < levels.length; i++) { 122 | for (int j = 0; j < levels[i].length; j++) { 123 | if (levels[i][j] <= thresholds[i/radius][j/radius]) { 124 | levels[i][j] = 0; 125 | } else { 126 | levels[i][j] = ImageUtils.MAX_LEVEL; 127 | } 128 | } 129 | } 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/preprocessing/LineExtractor.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.preprocessing; 2 | 3 | import java.io.File; 4 | import java.io.FilenameFilter; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils; 9 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils.ConnectedComponentProcessor; 10 | import edu.berkeley.cs.nlp.ocular.preprocessing.VerticalProfile.VerticalSegmentation; 11 | import tberg.murphy.fileio.f; 12 | import tberg.murphy.tuple.Pair; 13 | 14 | /** 15 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 16 | */ 17 | public class LineExtractor { 18 | 19 | public static List extractLines(double[][] levels) { 20 | 
VerticalProfile verticalProfile = new VerticalProfile(levels); 21 | VerticalModel trainedModel = verticalProfile.runEM(5, 100); 22 | // trainedModel.freezeSizeParams(1); 23 | VerticalSegmentation viterbiSegments = verticalProfile.decode(trainedModel); 24 | // ImageUtils.display(Visualizer.renderLineExtraction(levels, viterbiSegments)); 25 | 26 | List result = new ArrayList(); 27 | 28 | int topDist = 29; 29 | int botDist = 11; 30 | List> segments = viterbiSegments.retrieveLineBoundaries(); 31 | List baselines = viterbiSegments.retrieveBaselines(); 32 | for (int s=0; s= levels[0].length){ 41 | // if (pos < 0 || pos >= levels[0].length || pos < upper-5 || pos >= lower+5){ 42 | line[x][t] = ImageUtils.MAX_LEVEL; 43 | } else { 44 | line[x][t] = levels[x][pos]; 45 | } 46 | } 47 | } 48 | for (int b=0; b= levels[0].length){ 52 | // if (pos < 0 || pos >= levels[0].length || pos < upper-5 || pos >= lower+5){ 53 | line[x][topDist+b] = ImageUtils.MAX_LEVEL; 54 | } else { 55 | line[x][topDist+b] = levels[x][pos]; 56 | } 57 | } 58 | } 59 | result.add(line); 60 | } 61 | 62 | // List> lineBoundaries = viterbiSegments.retrieveLineBoundaries(); 63 | // for (Pair boundary : lineBoundaries) { 64 | // double[][] line = new double[levels.length][boundary.getSecond().intValue() - boundary.getFirst().intValue()]; 65 | // for (int y = boundary.getFirst().intValue(); y < boundary.getSecond().intValue(); y++) { 66 | // for (int x = 0; x < levels.length; x++) { 67 | // line[x][y-boundary.getFirst()] = levels[x][y]; 68 | // } 69 | // } 70 | // result.add(line); 71 | // } 72 | 73 | System.out.println("Extractor returned " + result.size() + " line images"); 74 | return result; 75 | } 76 | 77 | public static void main(String[] args) { 78 | String path = "/Users/tberg/Desktop/F-tem/seg_extraction/"; 79 | File dir = new File(path); 80 | for (String name : dir.list(new FilenameFilter() { 81 | public boolean accept(File dir, String name) { 82 | return name.endsWith(".png") || name.endsWith(".jpg"); 
83 | } 84 | })) { 85 | double[][] levels = ImageUtils.getLevels(f.readImage(path+"/"+name)); 86 | ConnectedComponentProcessor ccprocBig = new ConnectedComponentProcessor() { 87 | public void process(double[][] levels, List connectedComponent) { 88 | if (connectedComponent.size() > 1000) { 89 | for (int[] pixel : connectedComponent) { 90 | levels[pixel[0]][pixel[1]] = 255.0; 91 | } 92 | } 93 | } 94 | }; 95 | ImageUtils.processConnectedComponents(levels, 50.0, ccprocBig); 96 | Binarizer.binarizeGlobal(0.13, levels); 97 | ConnectedComponentProcessor ccprocSmall = new ConnectedComponentProcessor() { 98 | public void process(double[][] levels, List connectedComponent) { 99 | if (connectedComponent.size() < 20 || connectedComponent.size() > 1000) { 100 | for (int[] pixel : connectedComponent) { 101 | levels[pixel[0]][pixel[1]] = 255.0; 102 | } 103 | } 104 | } 105 | }; 106 | ImageUtils.processConnectedComponents(levels, 127.0, ccprocSmall); 107 | List lines = extractLines(levels); 108 | for (double[][] line : lines) { 109 | ImageUtils.display(ImageUtils.makeImage(line)); 110 | } 111 | } 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/preprocessing/ManualStackCropperPrep.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.preprocessing; 2 | 3 | import java.io.File; 4 | import java.io.FilenameFilter; 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils; 9 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils.ConnectedComponentProcessor; 10 | import tberg.murphy.fileio.f; 11 | 12 | public class ManualStackCropperPrep { 13 | 14 | public static void main(String[] args) { 15 | String path = args[0]; 16 | double binarizeThresh = 0.1; 17 | if (args.length > 1) { 18 | binarizeThresh = Double.parseDouble(args[1]); 19 | } 20 | File dir = new 
File(path); 21 | String[] names = dir.list(new FilenameFilter() { 22 | public boolean accept(File dir, String name) { 23 | return name.endsWith(".png") || name.endsWith(".jpg"); 24 | } 25 | }); 26 | Arrays.sort(names); 27 | File oddDirCol1 = new File(path + "/odd_col1"); 28 | File oddDirCol2 = new File(path + "/odd_col2"); 29 | oddDirCol1.mkdirs(); 30 | oddDirCol2.mkdirs(); 31 | File evenDirCol1 = new File(path + "/even_col1"); 32 | File evenDirCol2 = new File(path + "/even_col2"); 33 | evenDirCol1.mkdirs(); 34 | evenDirCol2.mkdirs(); 35 | File dirExtr = new File(path + "/col_extraction"); 36 | dirExtr.mkdirs(); 37 | boolean odd = false; 38 | for (String name : names) { 39 | double[][] levels = ImageUtils.getLevels(f.readImage(path+"/"+name)); 40 | ConnectedComponentProcessor ccprocBig = new ConnectedComponentProcessor() { 41 | public void process(double[][] levels, List connectedComponent) { 42 | if (connectedComponent.size() > 1000) { 43 | for (int[] pixel : connectedComponent) { 44 | levels[pixel[0]][pixel[1]] = 255.0; 45 | } 46 | } 47 | } 48 | }; 49 | ImageUtils.processConnectedComponents(levels, 50.0, ccprocBig); 50 | Binarizer.binarizeGlobal(binarizeThresh, levels); 51 | ConnectedComponentProcessor ccprocSmall = new ConnectedComponentProcessor() { 52 | public void process(double[][] levels, List connectedComponent) { 53 | if (connectedComponent.size() < 20 || connectedComponent.size() > 500) { 54 | for (int[] pixel : connectedComponent) { 55 | levels[pixel[0]][pixel[1]] = 255.0; 56 | } 57 | } 58 | } 59 | }; 60 | ImageUtils.processConnectedComponents(levels, 127.0, ccprocSmall); 61 | double[][] rotLevels = Straightener.straighten(levels); 62 | String baseName = (name.lastIndexOf('.') == -1) ? name : name.substring(0, name.lastIndexOf('.')); 63 | f.writeImage((odd ? oddDirCol1.getAbsolutePath() : evenDirCol1.getAbsolutePath()) +"/"+ baseName + "_col1.png", ImageUtils.makeImage(rotLevels)); 64 | f.writeImage((odd ? 
oddDirCol2.getAbsolutePath() : evenDirCol2.getAbsolutePath()) +"/"+ baseName + "_col2.png", ImageUtils.makeImage(rotLevels)); 65 | odd = !odd; 66 | } 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/preprocessing/Straightener.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.preprocessing; 2 | 3 | import java.awt.image.BufferedImage; 4 | import java.io.File; 5 | import java.io.FilenameFilter; 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils; 10 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils.ConnectedComponentProcessor; 11 | import tberg.murphy.fileio.f; 12 | 13 | /** 14 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 15 | */ 16 | public class Straightener { 17 | 18 | private static final double MIN_ANGLE_RADIANS = -0.05; 19 | private static final double MAX_ANGLE_RADIANS = 0.05; 20 | private static final int ANGLE_SAMPLE_POINTS = 20; 21 | 22 | public static double[][] straighten(double[][] levels) { 23 | BufferedImage image = ImageUtils.makeImage(levels); 24 | double maxTotalVar = Double.NEGATIVE_INFINITY; 25 | double bestAngle = Double.NEGATIVE_INFINITY; 26 | for (int i=0; i maxTotalVar) { 32 | maxTotalVar = totalVar; 33 | bestAngle = angle; 34 | } 35 | } 36 | return ImageUtils.getLevels(ImageUtils.rotateImage(ImageUtils.makeImage(levels), bestAngle)); 37 | } 38 | 39 | private static double verticalTotalVariation(double[][] levels) { 40 | double[] horizontalAvg = new double[levels[0].length]; 41 | for (int i=0; i 1) { 57 | binarizeThresh = Double.parseDouble(args[1]); 58 | } 59 | File dir = new File(path); 60 | String[] names = dir.list(new FilenameFilter() { 61 | public boolean accept(File dir, String name) { 62 | return name.endsWith(".png") || name.endsWith(".jpg"); 63 | } 64 | }); 65 | Arrays.sort(names); 
66 | File straightDir = new File(path + "/straight"); 67 | straightDir.mkdirs(); 68 | for (String name : names) { 69 | double[][] levels = ImageUtils.getLevels(f.readImage(path+"/"+name)); 70 | ConnectedComponentProcessor ccprocBig = new ConnectedComponentProcessor() { 71 | public void process(double[][] levels, List connectedComponent) { 72 | if (connectedComponent.size() > 1000) { 73 | for (int[] pixel : connectedComponent) { 74 | levels[pixel[0]][pixel[1]] = 255.0; 75 | } 76 | } 77 | } 78 | }; 79 | ImageUtils.processConnectedComponents(levels, 50.0, ccprocBig); 80 | Binarizer.binarizeGlobal(binarizeThresh, levels); 81 | ConnectedComponentProcessor ccprocSmall = new ConnectedComponentProcessor() { 82 | public void process(double[][] levels, List connectedComponent) { 83 | if (connectedComponent.size() < 20 || connectedComponent.size() > 500) { 84 | for (int[] pixel : connectedComponent) { 85 | levels[pixel[0]][pixel[1]] = 255.0; 86 | } 87 | } 88 | } 89 | }; 90 | ImageUtils.processConnectedComponents(levels, 127.0, ccprocSmall); 91 | double[][] rotLevels = Straightener.straighten(levels); 92 | String baseName = (name.lastIndexOf('.') == -1) ? 
name : name.substring(0, name.lastIndexOf('.')); 93 | f.writeImage(straightDir.getAbsolutePath() +"/"+ baseName + ".png", ImageUtils.makeImage(rotLevels)); 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/preprocessing/Test.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.preprocessing; 2 | 3 | import java.awt.image.BufferedImage; 4 | import java.io.File; 5 | 6 | import edu.berkeley.cs.nlp.ocular.data.PdfImageReader; 7 | import edu.berkeley.cs.nlp.ocular.image.ImageUtils; 8 | import tberg.murphy.fileio.f; 9 | 10 | /** 11 | * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) 12 | */ 13 | public class Test { 14 | 15 | public static void main(String[] args) { 16 | // String path = "sample_images/multilingual/"; 17 | // String path = "/Users/dhg/Desktop/pl/"; 18 | // File dir = new File(path); 19 | // for (String name : dir.list()) { 20 | // double[][] levels = ImageUtils.getLevels(f.readImage(path+"/"+name)); 21 | // double[][] rotLevels = Straightener.straighten(levels); 22 | // Binarizer.binarizeGlobal(0.08, rotLevels); 23 | // ImageUtils.display(ImageUtils.makeImage(rotLevels)); 24 | // 25 | // 26 | // // double[][] cropLevels = Cropper.crop(rotLevels); 27 | // // ImageUtils.display(ImageUtils.makeImage(cropLevels)); 28 | 29 | 30 | { 31 | File file = new File("sample_images/multilingual/pl_blac_047_00039-800.jpg"); 32 | BufferedImage image = f.readImage(file.getPath()); 33 | double[][] levels = ImageUtils.getLevels(image); 34 | double[][] rotLevels = Straightener.straighten(levels); 35 | Binarizer.binarizeGlobal(0.08, rotLevels); 36 | ImageUtils.display(ImageUtils.makeImage(rotLevels)); 37 | double[][] cropLevels = Cropper.crop(rotLevels, 0.12); 38 | ImageUtils.display(ImageUtils.makeImage(cropLevels)); 39 | } 40 | 41 | { 42 | File file = new 
File("sample_images/multilingual/adv.pdf"); 43 | BufferedImage image = PdfImageReader.readPdfPageAsImage(file, 1); 44 | double[][] levels = ImageUtils.getLevels(image); 45 | double[][] rotLevels = Straightener.straighten(levels); 46 | Binarizer.binarizeGlobal(0.08, rotLevels); 47 | ImageUtils.display(ImageUtils.makeImage(rotLevels)); 48 | double[][] cropLevels = Cropper.crop(rotLevels, 0.12); 49 | ImageUtils.display(ImageUtils.makeImage(cropLevels)); 50 | } 51 | 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/train/ModelPathMaker.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.train; 2 | 3 | /** 4 | * @author Dan Garrette (dhgarrette@gmail.com) 5 | */ 6 | public class ModelPathMaker { 7 | 8 | public static String makeFontDir(String outputPath) { 9 | return outputPath + "/font/"; 10 | } 11 | public static String makeFontPath(String outputPath, int iter, int batch) { 12 | return makeFontDir(outputPath) + makeOutputFilePrefix(iter, batch) + ".fontser"; 13 | } 14 | public static String makeFontFilenameRegex() { 15 | return makeOutputFilePrefixRegex() + ".fontser"; 16 | } 17 | 18 | public static String makeLmDir(String outputPath) { 19 | return outputPath + "/lm/"; 20 | } 21 | public static String makeLmPath(String outputPath, int iter, int batch) { 22 | return makeLmDir(outputPath) + makeOutputFilePrefix(iter, batch) + ".lmser"; 23 | } 24 | public static String makeLmFilenameRegex() { 25 | return makeOutputFilePrefixRegex() + ".lmser"; 26 | } 27 | 28 | public static String makeGsmDir(String outputPath) { 29 | return outputPath + "/gsm/"; 30 | } 31 | public static String makeGsmPath(String outputPath, int iter, int batch) { 32 | return makeGsmDir(outputPath) + makeOutputFilePrefix(iter, batch) + ".gsmser"; 33 | } 34 | public static String makeGsmFilenameRegex() { 35 | return 
makeOutputFilePrefixRegex() + ".gsmser"; 36 | } 37 | 38 | private static String makeOutputFilePrefix(int iter, int batch) { 39 | return "retrained_iter-"+iter+"_batch-"+batch; 40 | } 41 | public static String makeOutputFilePrefixRegex() { 42 | return "retrained_iter-(\\d+)_batch-(\\d+)"; 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/train/TrainingRestarter.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.train; 2 | 3 | import static edu.berkeley.cs.nlp.ocular.train.ModelPathMaker.makeFontPath; 4 | import static edu.berkeley.cs.nlp.ocular.train.ModelPathMaker.makeGsmPath; 5 | import static edu.berkeley.cs.nlp.ocular.train.ModelPathMaker.makeLmPath; 6 | import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2; 7 | import static edu.berkeley.cs.nlp.ocular.util.Tuple3.Tuple3; 8 | 9 | import java.io.File; 10 | 11 | import edu.berkeley.cs.nlp.ocular.font.Font; 12 | import edu.berkeley.cs.nlp.ocular.gsm.GlyphSubstitutionModel; 13 | import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel; 14 | import edu.berkeley.cs.nlp.ocular.main.InitializeFont; 15 | import edu.berkeley.cs.nlp.ocular.main.InitializeGlyphSubstitutionModel; 16 | import edu.berkeley.cs.nlp.ocular.main.InitializeLanguageModel; 17 | import edu.berkeley.cs.nlp.ocular.util.Tuple2; 18 | import edu.berkeley.cs.nlp.ocular.util.Tuple3; 19 | 20 | /** 21 | * @author Dan Garrette (dhgarrette@gmail.com) 22 | */ 23 | public class TrainingRestarter { 24 | 25 | /** 26 | * If requested, try and pick up where we left off 27 | */ 28 | public Tuple2> getRestartModels( 29 | Font inputFont, CodeSwitchLanguageModel inputLm, GlyphSubstitutionModel inputGsm, 30 | boolean updateLM, boolean updateGsm, String outputPath, 31 | int numEMIters, int numUsableDocs, int updateDocBatchSize, boolean noUpdateIfBatchTooSmall) { 32 | 33 | int 
// NOTE(review): middle of TrainingRestarter.getRestartModels, kept
// byte-identical (the method begins on the previous dump line and its helper
// continues on the next). Logic visible here: scan iterations 1..numEMIters
// for an existing font file of the iteration's last batch; then reload font,
// and optionally lm/gsm, from the last completed iteration.
lastCompletedIteration = 0; 34 | String fontPath = null; 35 | int lastBatchNumOfIteration = getLastBatchNumOfIteration(numUsableDocs, updateDocBatchSize, noUpdateIfBatchTooSmall); 36 | for (int iter = 1; iter <= numEMIters; ++iter) { 37 | fontPath = makeFontPath(outputPath, iter, lastBatchNumOfIteration); 38 | if (new File(fontPath).exists()) { 39 | lastCompletedIteration = iter; 40 | } 41 | } 42 | 43 | Font newFont = inputFont; 44 | CodeSwitchLanguageModel newLm = inputLm; 45 | GlyphSubstitutionModel newGsm = inputGsm; 46 | 47 | if (lastCompletedIteration == numEMIters) { 48 | System.out.println("All iterations are already complete!"); 49 | } 50 | else if (lastCompletedIteration > 0) { 51 | System.out.println("Last completed iteration: "+lastCompletedIteration); 52 | if (fontPath != null) { 53 | String lastFontPath = makeFontPath(outputPath, lastCompletedIteration, lastBatchNumOfIteration); 54 | System.out.println("  Loading font of last completed iteration: "+lastFontPath); 55 | newFont = InitializeFont.readFont(lastFontPath); 56 | } 57 | if (updateLM) { 58 | String lastLmPath = makeLmPath(outputPath, lastCompletedIteration, lastBatchNumOfIteration); 59 | System.out.println("  Loading lm of last completed iteration: "+lastLmPath); 60 | newLm = InitializeLanguageModel.readCodeSwitchLM(lastLmPath); 61 | } 62 | if (updateGsm) { 63 | String lastGsmPath = makeGsmPath(outputPath, lastCompletedIteration, lastBatchNumOfIteration); 64 | System.out.println("  Loading gsm of last completed iteration: "+lastGsmPath); 65 | newGsm = InitializeGlyphSubstitutionModel.readGSM(lastGsmPath); 66 | } 67 | } 68 | else { 69 | System.out.println("No completed iterations found"); 70 | } 71 | 72 | return Tuple2(lastCompletedIteration, Tuple3(newFont,newLm,newGsm)); 73 | } 74 | 75 | private int getLastBatchNumOfIteration(int numUsableDocs, int updateDocBatchSize, boolean noUpdateIfBatchTooSmall) { 76 | int completedBatchesInIteration = 0; 77 | int currentBatchSize = 0; 78 | for (int docNum = 
0; docNum < numUsableDocs; ++docNum) { 79 | ++currentBatchSize; 80 | if (FontTrainer.isBatchComplete(numUsableDocs, docNum, currentBatchSize, updateDocBatchSize, noUpdateIfBatchTooSmall)) { 81 | ++completedBatchesInIteration; 82 | } 83 | } 84 | return completedBatchesInIteration; 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/util/ArrayHelper.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.util; 2 | 3 | import java.util.Arrays; 4 | 5 | /** 6 | * @author Dan Garrette (dhgarrette@gmail.com) 7 | */ 8 | public class ArrayHelper { 9 | 10 | public static int sum(int[] xs) { 11 | int result = 0; 12 | for (int x : xs) result += x; 13 | return result; 14 | } 15 | 16 | public static double sum(double[] xs) { 17 | double result = 0.0; 18 | for (double x : xs) result += x; 19 | return result; 20 | } 21 | 22 | public static double avg(int[] xs) { 23 | if (xs.length == 0) return 0.0; 24 | else return ((double)sum(xs)) / xs.length; 25 | } 26 | 27 | public static double avg(double[] xs) { 28 | if (xs.length == 0) return 0.0; 29 | else return sum(xs) / xs.length; 30 | } 31 | 32 | public static int max(int... xs) { 33 | if (xs.length == 0) throw new RuntimeException("ArrayHelper.max cannot be used on an empty array."); 34 | int v = Integer.MIN_VALUE; 35 | for (int x : xs) { 36 | if (x > v) v = x; 37 | } 38 | return v; 39 | } 40 | 41 | public static double max(double... xs) { 42 | if (xs.length == 0) throw new RuntimeException("ArrayHelper.max cannot be used on an empty array."); 43 | double v = Double.MIN_VALUE; 44 | for (double x : xs) { 45 | if (x > v) v = x; 46 | } 47 | return v; 48 | } 49 | 50 | public static int min(int... 
xs) { 51 | if (xs.length == 0) throw new RuntimeException("ArrayHelper.min cannot be used on an empty array."); 52 | int v = Integer.MAX_VALUE; 53 | for (int x : xs) 54 | if (x < v) v = x; 55 | return v; 56 | } 57 | 58 | public static double min(double... xs) { 59 | if (xs.length == 0) throw new RuntimeException("ArrayHelper.min cannot be used on an empty array."); 60 | double v = Double.MAX_VALUE; 61 | for (double x : xs) 62 | if (x < v) v = x; 63 | return v; 64 | } 65 | 66 | public static int[] prepend(int c, int[] vec1) { 67 | int[] result = new int[vec1.length + 1]; 68 | if (vec1.length > 0) System.arraycopy(vec1, 0, result, 1, vec1.length); 69 | result[0] = c; 70 | return result; 71 | } 72 | 73 | public static A[] append(A[] vec1, A c) { 74 | A[] result = Arrays.copyOf(vec1, vec1.length + 1); 75 | result[result.length - 1] = c; 76 | return result; 77 | } 78 | 79 | public static int[] take(int[] vec1, int n) { 80 | int n2 = Math.min(vec1.length, n); 81 | int[] result = new int[n2]; 82 | if (vec1.length > 0) System.arraycopy(vec1, 0, result, 0, n2); 83 | return result; 84 | } 85 | 86 | public static int[] takeRight(int[] vec1, int n) { 87 | int n2 = Math.min(vec1.length, n); 88 | int[] result = new int[n2]; 89 | if (vec1.length > 0) System.arraycopy(vec1, vec1.length - n2, result, 0, n2); 90 | return result; 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/edu/berkeley/cs/nlp/ocular/util/FileHelper.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.util; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.File; 5 | import java.io.FileOutputStream; 6 | import java.io.IOException; 7 | import java.io.OutputStreamWriter; 8 | 9 | /** 10 | * @author Dan Garrette (dhgarrette@gmail.com) 11 | */ 12 | public class FileHelper { 13 | 14 | public static void writeString(String path, String str) { 15 | BufferedWriter out = 
package edu.berkeley.cs.nlp.ocular.util;

import java.util.List;

/**
 * Static helper methods for strings.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class StringHelper {

	/** Render every character of {@code s} as a {@code \\uXXXX} escape. */
	public static String toUnicode(String s) {
		//if (s.length() != 1) throw new RuntimeException("toUnicode input must be a single character");
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < s.length(); ++i)
			sb.append(toUnicode(s.charAt(i)));
		return sb.toString();
	}

	/** Render {@code c} as a {@code \\uXXXX} escape (the OR with 0x10000 forces 4 hex digits). */
	public static String toUnicode(char c) {
		return "\\u" + Integer.toHexString(c | 0x10000).substring(1);
	}

	/** First {@code n} characters of {@code s} (the whole string if shorter; "" if n <= 0). */
	public static String take(String s, int n) {
		if (n <= 0)
			return "";
		else if (n < s.length())
			return s.substring(0, n);
		else
			return s;
	}

	/** {@code s} without its first {@code n} characters ("" if n >= length; unchanged if n <= 0). */
	public static String drop(String s, int n) {
		if (n <= 0)
			return s;
		else if (n < s.length())
			return s.substring(n);
		else
			return "";
	}

	/**
	 * Last character of {@code s}, as a one-character string.
	 * @throws IllegalArgumentException if {@code s} is empty
	 */
	public static String last(String s) {
		if (s.isEmpty()) throw new IllegalArgumentException("cannot get `last` of empty string");
		return s.substring(s.length() - 1);
	}

	/** Concatenate the given strings. */
	public static String join(String... xs) {
		StringBuilder sb = new StringBuilder();
		for (String x : xs)
			sb.append(x);
		return sb.toString();
	}

	/** Concatenate the given strings.  (Generic parameter restored; it was lost in extraction.) */
	public static String join(List<String> xs) {
		StringBuilder sb = new StringBuilder();
		for (String x : xs)
			sb.append(x);
		return sb.toString();
	}

	/** Concatenate the given strings, separated by {@code sep}; "" for an empty list. */
	public static String join(List<String> xs, String sep) {
		int sepLen = sep.length();
		StringBuilder sb = new StringBuilder();
		for (String x : xs)
			sb.append(x).append(sep);
		// Trim the trailing separator appended by the loop above.
		return sb.length() > 0 ? sb.delete(sb.length() - sepLen, sb.length()).toString() : "";
	}

	/** Null-safe equality: true iff both are null or both are equal. */
	public static boolean equals(String a, String b) {
		if (a == null)
			return b == null;
		else
			return a.equals(b);
	}

	/** Length of the longest common prefix of {@code a} and {@code b}. */
	public static int longestCommonPrefix(String a, String b) {
		int i = 0;
		char[] as = a.toCharArray();
		char[] bs = b.toCharArray();
		int aLen = as.length;
		int bLen = bs.length;
		while (i < aLen && i < bLen && as[i] == bs[i])
			++i;
		return i;
	}

}
package edu.berkeley.cs.nlp.ocular.util;

import java.io.Serializable;
import java.util.Comparator;

/**
 * An immutable pair.  (Generic parameters restored; they were lost in extraction.)
 *
 * @author Dan Klein
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class Tuple2<A1, A2> implements Serializable {
	static final long serialVersionUID = 52;

	public final A1 _1;
	public final A2 _2;

	public boolean equals(Object o) {
		if (this == o)
			return true;
		if (!(o instanceof Tuple2))
			return false;

		@SuppressWarnings("rawtypes")
		final Tuple2 tuple = (Tuple2) o;

		if (_1 != null ? !_1.equals(tuple._1) : tuple._1 != null)
			return false;
		if (_2 != null ? !_2.equals(tuple._2) : tuple._2 != null)
			return false;

		return true;
	}

	public int hashCode() {
		int result;
		result = (_1 != null ? _1.hashCode() : 0);
		result = 29 * result + (_2 != null ? _2.hashCode() : 0);
		return result;
	}

	public String toString() {
		return "(" + _1 + ", " + _2 + ")";
	}

	public Tuple2(A1 _1, A2 _2) {
		this._1 = _1;
		this._2 = _2;
	}

	/** Static factory, so callers can write {@code Tuple2(a, b)} via static import. */
	public static <A1, A2> Tuple2<A1, A2> Tuple2(A1 _1, A2 _2) {
		return new Tuple2<A1, A2>(_1, _2);
	}

	/** Orders pairs by the first element, breaking ties with the second. */
	public static class LexicographicTuple2Comparator<A1, A2> implements Comparator<Tuple2<A1, A2>> {
		Comparator<A1> _1Comparator;
		Comparator<A2> _2Comparator;

		public int compare(Tuple2<A1, A2> tuple1, Tuple2<A1, A2> tuple2) {
			int _1Compare = _1Comparator.compare(tuple1._1, tuple2._1);
			if (_1Compare != 0)
				return _1Compare;
			return _2Comparator.compare(tuple1._2, tuple2._2);
		}

		public LexicographicTuple2Comparator(Comparator<A1> _1Comparator, Comparator<A2> _2Comparator) {
			this._1Comparator = _1Comparator;
			this._2Comparator = _2Comparator;
		}
	}

	/** Lexicographic order via the elements' natural ordering. */
	public static class DefaultLexicographicTuple2Comparator<A1 extends Comparable<A1>, A2 extends Comparable<A2>>
			implements Comparator<Tuple2<A1, A2>> {

		public int compare(Tuple2<A1, A2> x, Tuple2<A1, A2> y) {
			int _1Compare = x._1.compareTo(y._1);
			if (_1Compare != 0) {
				return _1Compare;
			}
			// BUG FIX: was `y._2.compareTo(y._2)`, which compares y to itself and
			// always returns 0, so second elements never broke ties.
			return x._2.compareTo(y._2);
		}

	}

}
package edu.berkeley.cs.nlp.ocular.util;

import java.io.Serializable;
import java.util.Comparator;

/**
 * An immutable triple.  (Generic parameters restored; they were lost in extraction.)
 *
 * @author Dan Klein
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class Tuple3<A1, A2, A3> implements Serializable {
	static final long serialVersionUID = 53;

	public final A1 _1;
	public final A2 _2;
	public final A3 _3;

	public boolean equals(Object o) {
		if (this == o)
			return true;
		if (!(o instanceof Tuple3))
			return false;

		@SuppressWarnings("rawtypes")
		final Tuple3 tuple = (Tuple3) o;

		if (_1 != null ? !_1.equals(tuple._1) : tuple._1 != null)
			return false;
		if (_2 != null ? !_2.equals(tuple._2) : tuple._2 != null)
			return false;
		if (_3 != null ? !_3.equals(tuple._3) : tuple._3 != null)
			return false;

		return true;
	}

	public int hashCode() {
		int result;
		result = (_1 != null ? _1.hashCode() : 0);
		result = 29 * result + (_2 != null ? _2.hashCode() : 0);
		result = 31 * result + (_3 != null ? _3.hashCode() : 0);
		return result;
	}

	public String toString() {
		return "(" + _1 + ", " + _2 + ", " + _3 + ")";
	}

	public Tuple3(A1 _1, A2 _2, A3 _3) {
		this._1 = _1;
		this._2 = _2;
		this._3 = _3;
	}

	/** Static factory, so callers can write {@code Tuple3(a, b, c)} via static import. */
	public static <A1, A2, A3> Tuple3<A1, A2, A3> Tuple3(A1 _1, A2 _2, A3 _3) {
		return new Tuple3<A1, A2, A3>(_1, _2, _3);
	}

	/** Orders triples by the first element, then the second, then the third. */
	public static class LexicographicTuple3Comparator<A1, A2, A3> implements Comparator<Tuple3<A1, A2, A3>> {
		Comparator<A1> _1Comparator;
		Comparator<A2> _2Comparator;
		Comparator<A3> _3Comparator;

		public int compare(Tuple3<A1, A2, A3> tuple1, Tuple3<A1, A2, A3> tuple2) {
			int _1Compare = _1Comparator.compare(tuple1._1, tuple2._1);
			if (_1Compare != 0)
				return _1Compare;
			int _2Compare = _2Comparator.compare(tuple1._2, tuple2._2);
			if (_2Compare != 0)
				return _2Compare;
			return _3Comparator.compare(tuple1._3, tuple2._3);
		}

		public LexicographicTuple3Comparator(Comparator<A1> _1Comparator, Comparator<A2> _2Comparator, Comparator<A3> _3Comparator) {
			this._1Comparator = _1Comparator;
			this._2Comparator = _2Comparator;
			this._3Comparator = _3Comparator;
		}
	}

	/** Lexicographic order via the elements' natural ordering. */
	public static class DefaultLexicographicTuple3Comparator<A1 extends Comparable<A1>, A2 extends Comparable<A2>, A3 extends Comparable<A3>>
			implements Comparator<Tuple3<A1, A2, A3>> {

		public int compare(Tuple3<A1, A2, A3> x, Tuple3<A1, A2, A3> y) {
			int _1Compare = x._1.compareTo(y._1);
			if (_1Compare != 0) {
				return _1Compare;
			}
			int _2Compare = x._2.compareTo(y._2);
			if (_2Compare != 0) {
				return _2Compare;
			}
			return x._3.compareTo(y._3);
		}

	}

}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.ACUTE_COMBINING;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.DIAERESIS_COMBINING;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.GRAVE_COMBINING;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.MACRON_COMBINING;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.TILDE_COMBINING;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;

import java.util.Arrays;
import java.util.List;

import org.junit.Test;

/**
 * Tests for BasicTextReader.  (Stripped {@code List<String>} generics restored.)
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class BasicTextReaderTests {

	private String s1 = "ing th\\~q || | follies of thõsè, who éither ``sæek'' out th\\\"os\\`e wæys \"and\" means, which either are sq̃uccess lessons";

	@Test
	public void test_readCharacters_qtilde() {
		TextReader tr = new BasicTextReader();
		assertEquals(Arrays.asList("q" + TILDE_COMBINING), tr.readCharacters("q̃"));
		assertEquals(Arrays.asList("t", "h", "q" + TILDE_COMBINING, "r"), tr.readCharacters("thq̃r"));
		assertEquals(Arrays.asList("t", "h", "q" + TILDE_COMBINING, "r"), tr.readCharacters("th\\~qr"));
	}

	@Test
	public void test_readCharacters_stackedDiacritics() {
		TextReader tr = new BasicTextReader();
		assertEquals(Arrays.asList("n" + TILDE_COMBINING + MACRON_COMBINING + DIAERESIS_COMBINING + ACUTE_COMBINING + GRAVE_COMBINING), tr.readCharacters("\\`\\'ñ" + MACRON_COMBINING + DIAERESIS_COMBINING));
	}

	@Test
	public void test_readCharacters_dia() {
		TextReader tr = new BasicTextReader();
		List<String> r = Arrays.asList("i", "n", "g", " ", "t", "h", "q" + TILDE_COMBINING, " ", "|", "|", " ", "|", " ", "f", "o", "l", "l", "i", "e", "s", " ", "o", "f", " ", "t", "h", "o" + TILDE_COMBINING, "s", "e" + GRAVE_COMBINING, ",", " ", "w", "h", "o", " ", "e" + ACUTE_COMBINING, "i", "t", "h", "e", "r", " ", "\"", "s", "æ", "e", "k", "\"", " ", "o", "u", "t", " ", "t", "h", "o" + DIAERESIS_COMBINING, "s", "e" + GRAVE_COMBINING, " ", "w", "æ", "y", "s", " ", "\"", "a", "n", "d", "\"", " ", "m", "e", "a", "n", "s", ",", " ", "w", "h", "i", "c", "h", " ", "e", "i", "t", "h", "e", "r", " ", "a", "r", "e", " ", "s", "q" + TILDE_COMBINING, "u", "c", "c", "e", "s", "s", " ", "l", "e", "s", "s", "o", "n", "s");
		assertEquals(r, tr.readCharacters(s1));
	}

	@Test
	public void test_readCharacters_backslash() {
		TextReader tr = new BasicTextReader();
		List<String> r = Arrays.asList("t", "h", "i", "s", "\\\\", "t", "h", "a", "t", "\\\\", "t", "h", "e", "\\\\");
		assertEquals(r, tr.readCharacters("this\\\\that\\\\the\\\\"));
		try {
			List<String> r2 = tr.readCharacters("this\\that\\the\\");
			fail("Exception expected, found: [" + r2 + "]");
		} catch (RuntimeException e) {
			assertEquals("Unrecognized escape sequence: [\\t]", e.getMessage());
		}
	}

	@Test
	public void test_readCharacters_noEscapeChar() {
		BasicTextReader tr = new BasicTextReader(false);
		assertEquals(Arrays.asList("t", "h", "\\\\", "~", "q", "r", "\\\\", "\\\\", "x"), tr.readCharacters("th\\~qr\\\\x"));
	}

}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import static org.junit.Assert.assertEquals;

import java.util.Arrays;

import org.junit.Test;

import edu.berkeley.cs.nlp.ocular.util.CollectionHelper;

/**
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class BlacklistCharacterSetTextReaderTests {

	@Test
	public void test_readCharacters() {
		String s = "thi&s tha$t t$he";

		TextReader tr = new BlacklistCharacterSetTextReader(CollectionHelper.makeSet("&", "$"), new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "i", "s", " ", "t", "h", "a", "t", " ", "t", "h", "e"), tr.readCharacters(s));
	}

}

// ---- originally a separate file: CharIndexerTests.java (same package) ----

/**
 * Tests for CharIndexer.  (Stripped {@code Indexer<String>} generics restored.)
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
class CharIndexerTests {

	@org.junit.Test
	public void test() {
		tberg.murphy.indexer.Indexer<String> i = new CharIndexer();

		String ae = Charset.TILDE_ESCAPE + "a";
		String ac = "a" + Charset.TILDE_COMBINING;

		String ee = Charset.TILDE_ESCAPE + "e";
		String ec = "e" + Charset.TILDE_COMBINING;

		String ne = Charset.TILDE_ESCAPE + "n";
		String nc = "n" + Charset.TILDE_COMBINING;
		String np = "ñ";

		i.index(new String[] { "a", "b", ec });

		org.junit.Assert.assertTrue(i.contains("a"));
		org.junit.Assert.assertTrue(i.contains("b"));
		org.junit.Assert.assertTrue(i.contains(ec));
		org.junit.Assert.assertTrue(i.contains(ee));
		assertEquals(0, i.getIndex("a"));
		assertEquals("a", i.getObject(0));
		assertEquals(1, i.getIndex("b"));
		assertEquals("b", i.getObject(1));
		assertEquals(2, i.getIndex(ec));
		assertEquals(ec, i.getObject(2));
		assertEquals(2, i.getIndex(ec));
		assertEquals(3, i.size());

		org.junit.Assert.assertFalse(i.contains(ae));
		org.junit.Assert.assertFalse(i.contains(ac));
		assertEquals(3, i.getIndex(ae));
		org.junit.Assert.assertTrue(i.contains(ae));
		org.junit.Assert.assertTrue(i.contains(ac));
		assertEquals(3, i.getIndex(ac));
		org.junit.Assert.assertTrue(i.contains(ae));
		org.junit.Assert.assertTrue(i.contains(ac));
		assertEquals(4, i.size());

		org.junit.Assert.assertFalse(i.contains(ne));
		org.junit.Assert.assertFalse(i.contains(nc));
		org.junit.Assert.assertFalse(i.contains(np));
		assertEquals(4, i.getIndex(np));
		assertEquals(nc, i.getObject(4));
		org.junit.Assert.assertTrue(i.contains(ne));
		org.junit.Assert.assertTrue(i.contains(nc));
		org.junit.Assert.assertTrue(i.contains(np));
		assertEquals(4, i.getIndex(ne));
		assertEquals(4, i.getIndex(nc));
		assertEquals(nc, i.getObject(4));
		assertEquals(5, i.size());

		org.junit.Assert.assertFalse(i.locked());
		i.lock();
		org.junit.Assert.assertTrue(i.locked());
	}

}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import static org.junit.Assert.assertEquals;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.*;

import java.util.Arrays;
import java.util.List;

import org.junit.Test;

/**
 * Tests for ConvertLongSTextReader.  (Stripped {@code List<String>} generics restored.)
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class ConvertLongSTextReaderTests {

	private String s1 = "ing th\\~q || | follies of thõsè, who éither ``sæek'' out th\\\"os\\`e wæys \"and\" means, which either are sq̃uccess confession asi \\\\lessons";

	@Test
	public void test_readCharacters() {
		TextReader tr = new ConvertLongSTextReader(new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "o" + TILDE_COMBINING, "ſ", "e" + GRAVE_COMBINING), tr.readCharacters("thõsè"));
		assertEquals(Arrays.asList("ſ", "i"), tr.readCharacters("si"));
		assertEquals(Arrays.asList("ſ", "i", "n"), tr.readCharacters("sin"));
		assertEquals(Arrays.asList("a", "ſ", "i"), tr.readCharacters("asi"));
		assertEquals(Arrays.asList("ſ", "s", "i"), tr.readCharacters("ssi"));
		assertEquals(Arrays.asList("a", "ſ", "s", "i"), tr.readCharacters("assi"));
		assertEquals(Arrays.asList("ſ", "s", "i", "n"), tr.readCharacters("ssin"));
		assertEquals(Arrays.asList("a", "ſ", "s", "i", "n"), tr.readCharacters("assin"));
		List<String> r = Arrays.asList("i", "n", "g", " ", "t", "h", "q" + TILDE_COMBINING, " ", "|", "|", " ", "|", " ", "f", "o", "l", "l", "i", "e", "s", " ", "o", "f", " ", "t", "h", "o" + TILDE_COMBINING, "ſ", "e" + GRAVE_COMBINING, ",", " ", "w", "h", "o", " ", "e" + ACUTE_COMBINING, "i", "t", "h", "e", "r", " ", "\"", "ſ", "æ", "e", "k", "\"", " ", "o", "u", "t", " ", "t", "h", "o" + DIAERESIS_COMBINING, "ſ", "e" + GRAVE_COMBINING, " ", "w", "æ", "y", "s", " ", "\"", "a", "n", "d", "\"", " ", "m", "e", "a", "n", "s", ",", " ", "w", "h", "i", "c", "h", " ", "e", "i", "t", "h", "e", "r", " ", "a", "r", "e", " ", "ſ", "q" + TILDE_COMBINING, "u", "c", "c", "e", "ſ", "s", " ", "c", "o", "n", "f", "e", "ſ", "s", "i", "o", "n", " ", "a", "ſ", "i", " ", "\\\\", "l", "e", "ſ", "ſ", "o", "n", "s");
		assertEquals(r, tr.readCharacters(s1));
	}

	@Test
	public void test_readCharacters_removeDia() {
		TextReader tr = new ConvertLongSTextReader(new RemoveAllDiacriticsTextReader(new BasicTextReader()));
		List<String> r = Arrays.asList("i", "n", "g", " ", "t", "h", "q", " ", "|", "|", " ", "|", " ", "f", "o", "l", "l", "i", "e", "s", " ", "o", "f", " ", "t", "h", "o", "ſ", "e", ",", " ", "w", "h", "o", " ", "e", "i", "t", "h", "e", "r", " ", "\"", "ſ", "æ", "e", "k", "\"", " ", "o", "u", "t", " ", "t", "h", "o", "ſ", "e", " ", "w", "æ", "y", "s", " ", "\"", "a", "n", "d", "\"", " ", "m", "e", "a", "n", "s", ",", " ", "w", "h", "i", "c", "h", " ", "e", "i", "t", "h", "e", "r", " ", "a", "r", "e", " ", "ſ", "q", "u", "c", "c", "e", "ſ", "s", " ", "c", "o", "n", "f", "e", "ſ", "s", "i", "o", "n", " ", "a", "ſ", "i", " ", "\\\\", "l", "e", "ſ", "ſ", "o", "n", "s");
		assertEquals(r, tr.readCharacters(s1));
	}

}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.DIAERESIS_COMBINING;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.MACRON_COMBINING;
import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.List;

import org.junit.Test;

/**
 * Tests for RemoveAllDiacriticsTextReader.  (Stripped {@code List<String>} generics restored.)
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class RemoveAllDiacriticsTextReaderTests {

	private String s1 = "ing th\\~q || | follies of thõsè, who éither ``sæek'' out th\\\"os\\`e wæys \"and\" means, which either are sq̃uccess lessons";

	@Test
	public void test_readCharacters_qtilde_nodia() {
		TextReader tr = new RemoveAllDiacriticsTextReader(new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "q", "r"), tr.readCharacters("thq̃r"));
		assertEquals(Arrays.asList("t", "h", "q", "r"), tr.readCharacters("th\\~qr"));
	}

	@Test
	public void test_readCharacters_stackedDiacritics_nodia() {
		TextReader tr = new RemoveAllDiacriticsTextReader(new BasicTextReader());
		assertEquals(Arrays.asList("n"), tr.readCharacters("\\`\\'ñ" + MACRON_COMBINING + DIAERESIS_COMBINING));
	}

	@Test
	public void test_readCharacters_plain() {
		TextReader tr = new RemoveAllDiacriticsTextReader(new BasicTextReader());
		//assertEquals(Arrays.asList(), tr.readCharacters("tiquinhu\\-almoqu\\-ixtililia"));

		List<String> r = Arrays.asList("i", "n", "g", " ", "t", "h", "q", " ", "|", "|", " ", "|", " ", "f", "o", "l", "l", "i", "e", "s", " ", "o", "f", " ", "t", "h", "o", "s", "e", ",", " ", "w", "h", "o", " ", "e", "i", "t", "h", "e", "r", " ", "\"", "s", "æ", "e", "k", "\"", " ", "o", "u", "t", " ", "t", "h", "o", "s", "e", " ", "w", "æ", "y", "s", " ", "\"", "a", "n", "d", "\"", " ", "m", "e", "a", "n", "s", ",", " ", "w", "h", "i", "c", "h", " ", "e", "i", "t", "h", "e", "r", " ", "a", "r", "e", " ", "s", "q", "u", "c", "c", "e", "s", "s", " ", "l", "e", "s", "s", "o", "n", "s");
		assertEquals(r, tr.readCharacters(s1));

	}

}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.makeList;
import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2;
import static org.junit.Assert.assertEquals;

import java.util.List;

import org.junit.Test;

import edu.berkeley.cs.nlp.ocular.util.StringHelper;

/**
 * Tests for ReplaceSomeTextReader.  (Stripped {@code List<String>} casts restored.)
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class ReplaceSomeTextReaderTests {

	@Test
	public void test_readCharacters_1() {
		TextReader tr = new ReplaceSomeTextReader(makeList(Tuple2(Tuple2((List<String>) makeList("a", "b"), (List<String>) makeList("x", "y", "z")), 3)), new BasicTextReader());
		assertEquals("ab1ab2xyz3ab4ab5xyz6ab7ab8", StringHelper.join(tr.readCharacters("ab1ab2ab3ab4ab5ab6ab7ab8")));
	}

	@Test
	public void test_readCharacters_2() {
		TextReader tr = new ReplaceSomeTextReader(makeList(Tuple2(Tuple2((List<String>) makeList("a", "b"), (List<String>) makeList("x", "y", "z")), 4)), new BasicTextReader());
		assertEquals("ab1ab2ab3xyz4ab5ab6ab7xyz8", StringHelper.join(tr.readCharacters("ab1ab2ab3ab4ab5ab6ab7ab8")));
	}

	@Test
	public void test_readCharacters_3() {
		TextReader tr = new ReplaceSomeTextReader(makeList(Tuple2(Tuple2((List<String>) makeList("a", "b"), (List<String>) makeList("x", "y", "z")), 1)), new BasicTextReader());
		assertEquals("xyz", StringHelper.join(tr.readCharacters("ab")));
	}

	@Test
	public void test_readCharacters_4() {
		TextReader tr = new ReplaceSomeTextReader(makeList(Tuple2(Tuple2((List<String>) makeList("a", "b"), (List<String>) makeList("x", "y", "z")), 4)), new BasicTextReader());
		assertEquals("ab1ab2ab3xyz4ab5ab6ab7xyz", StringHelper.join(tr.readCharacters("ab1ab2ab3ab4ab5ab6ab7ab")));
	}

	@Test
	public void test_readCharacters_5() {
		TextReader tr = new ReplaceSomeTextReader(makeList( //
				Tuple2(Tuple2((List<String>) makeList("a", "b"), (List<String>) makeList("x", "y", "z")), 3), //
				Tuple2(Tuple2((List<String>) makeList("y", "z"), (List<String>) makeList("e")), 2)), //
				new BasicTextReader());
		assertEquals("ab1ab2xyz3ab4ab5xe6ab7ab8", StringHelper.join(tr.readCharacters("ab1ab2ab3ab4ab5ab6ab7ab8")));
	}

	@Test
	public void test_readCharacters_6() {
		TextReader tr = new ReplaceSomeTextReader(makeList(Tuple2(Tuple2((List<String>) makeList("x", "x"), (List<String>) makeList("a")), 1)), new BasicTextReader());
		assertEquals("aa", StringHelper.join(tr.readCharacters("xxxx")));
	}

	@Test
	public void test_readCharacters_7() {
		TextReader tr = new ReplaceSomeTextReader(makeList(Tuple2(Tuple2((List<String>) makeList("x", "x"), (List<String>) makeList("a", "x")), 1)), new BasicTextReader());
		assertEquals("axax", StringHelper.join(tr.readCharacters("xxxx")));
	}

}
package edu.berkeley.cs.nlp.ocular.data.textreader;

import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.ACUTE_COMBINING;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.GRAVE_COMBINING;
import static org.junit.Assert.assertEquals;

import java.util.Arrays;

import org.junit.Test;

import edu.berkeley.cs.nlp.ocular.util.CollectionHelper;

/**
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class WhitelistCharacterSetTextReaderTests {

	@Test
	public void test_readCharacters_default() {
		String s = "thi&s thá$t t$hè";
		WhitelistCharacterSetTextReader tr1 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "i", "s", "t"), new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "i", "s", " ", "t", "h", "t", " ", "t", "h"), tr1.readCharacters(s));
		WhitelistCharacterSetTextReader tr2 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "i", "s", "t", "\\'a"), new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "i", "s", " ", "t", "h", "a" + ACUTE_COMBINING, "t", " ", "t", "h"), tr2.readCharacters(s));
		WhitelistCharacterSetTextReader tr3 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "í", "s", "t"), new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "s", " ", "t", "h", "t", " ", "t", "h"), tr3.readCharacters(s));
	}

	@Test
	public void test_readCharacters_considerDiacritics() {
		String s = "thi&s thá$t t$hè";
		WhitelistCharacterSetTextReader tr1 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "i", "s", "t"), false, new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "i", "s", " ", "t", "h", "t", " ", "t", "h"), tr1.readCharacters(s));
		WhitelistCharacterSetTextReader tr2 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "i", "s", "t", "\\'a"), false, new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "i", "s", " ", "t", "h", "a" + ACUTE_COMBINING, "t", " ", "t", "h"), tr2.readCharacters(s));
		WhitelistCharacterSetTextReader tr3 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "í", "s", "t"), false, new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "s", " ", "t", "h", "t", " ", "t", "h"), tr3.readCharacters(s));
	}

	@Test
	public void test_readCharacters_disregardDiacritics() {
		String s = "thi&s thá$t t$hè";
		WhitelistCharacterSetTextReader tr1 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "i", "s", "t"), true, new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "i", "s", " ", "t", "h", "a" + ACUTE_COMBINING, "t", " ", "t", "h", "e" + GRAVE_COMBINING), tr1.readCharacters(s));
		WhitelistCharacterSetTextReader tr2 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "i", "s", "t", "\\'a"), true, new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "i", "s", " ", "t", "h", "a" + ACUTE_COMBINING, "t", " ", "t", "h", "e" + GRAVE_COMBINING), tr2.readCharacters(s));
		WhitelistCharacterSetTextReader tr3 = new WhitelistCharacterSetTextReader(CollectionHelper.makeSet("a", "e", "h", "í", "s", "t"), true, new BasicTextReader());
		assertEquals(Arrays.asList("t", "h", "s", " ", "t", "h", "a" + ACUTE_COMBINING, "t", " ", "t", "h", "e" + GRAVE_COMBINING), tr3.readCharacters(s));
	}

}
package edu.berkeley.cs.nlp.ocular.gsm;

import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.TILDE_COMBINING;
import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.makeSet;
import static org.junit.Assert.assertEquals;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import org.junit.Test;

import edu.berkeley.cs.nlp.ocular.data.textreader.Charset;
import edu.berkeley.cs.nlp.ocular.gsm.BasicGlyphSubstitutionModel.BasicGlyphSubstitutionModelFactory;
import tberg.murphy.indexer.HashMapIndexer;
import tberg.murphy.indexer.Indexer;

/**
 * Tests for BasicGlyphSubstitutionModel smoothing.
 * (Stripped {@code Indexer<String>}/{@code List<Integer>} generics restored.)
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class BasicGlyphSubstitutionModelTests {

	@Test
	public void test_getSmoothingValue() {

		double gsmSmoothingCount = 0.1;
		double gsmElisionSmoothingCountMultiplier = 500.0;
		Indexer<String> langIndexer = new HashMapIndexer<String>(); langIndexer.index(new String[] {"spanish", "latin"}); langIndexer.lock();
		String[] chars = new String[] {" ","-","a","b","c","d","e","f","k","n","o","s","\\'o"};
		Indexer<String> charIndexer = new HashMapIndexer<String>(); charIndexer.index(chars);

		List<Integer> charIndices = new ArrayList<Integer>();
		for (String c : chars) charIndices.add(charIndexer.getIndex(c));
		Set<Integer> fullCharSet = makeSet(charIndices);
		@SuppressWarnings("unchecked")
		Set<Integer>[] activeCharacterSets = new Set[] {fullCharSet, fullCharSet};
		charIndexer.getIndex("z");
		charIndexer.getIndex(Charset.LONG_S);
		for (String c : new String[] {"a","b","c","d","e","f","k","n","o","s","z"}) charIndices.add(charIndexer.getIndex(c+TILDE_COMBINING));
		charIndexer.lock();
		double gsmPower = 2.0;
		int minCountsForEvalGsm = 2;
		String outputPath = "";

		BasicGlyphSubstitutionModelFactory gsmf = new BasicGlyphSubstitutionModelFactory(
				gsmSmoothingCount,
				gsmElisionSmoothingCountMultiplier,
				langIndexer,
				charIndexer,
				activeCharacterSets,
				gsmPower,
				minCountsForEvalGsm,
				outputPath);

		assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("\\'o"), gsmf.GLYPH_ELISION_TILDE), 1e-9);
		assertEquals(gsmSmoothingCount, gsmf.getSmoothingValue(0, charIndexer.getIndex("k"), charIndexer.getIndex("k")), 1e-9);
		assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("k"), gsmf.GLYPH_FIRST_ELIDED), 1e-9);
		assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("k"), gsmf.GLYPH_FIRST_ELIDED), 1e-9);
		assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("k"), gsmf.GLYPH_TILDE_ELIDED), 1e-9);
		assertEquals(gsmSmoothingCount, gsmf.getSmoothingValue(0, charIndexer.getIndex("a"), charIndexer.getIndex("a")), 1e-9);
		assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("n"), gsmf.GLYPH_TILDE_ELIDED), 1e-9);
		assertEquals(gsmSmoothingCount, gsmf.getSmoothingValue(0, charIndexer.getIndex("a"), charIndexer.getIndex("a")), 1e-9);
		assertEquals(0.0, gsmf.getSmoothingValue(0, charIndexer.getIndex("a"), charIndexer.getIndex("z")), 1e-9);
		assertEquals(0.0, gsmf.getSmoothingValue(0, charIndexer.getIndex("a"), charIndexer.getIndex(Charset.LONG_S)), 1e-9);
		assertEquals(gsmSmoothingCount, gsmf.getSmoothingValue(0, charIndexer.getIndex("s"), charIndexer.getIndex(Charset.LONG_S)), 1e-9);

	}

}
package edu.berkeley.cs.nlp.ocular.model;

import static org.junit.Assert.assertEquals;

import java.util.Collection;
import java.util.List;

import org.junit.Test;

import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar.GlyphType;
import edu.berkeley.cs.nlp.ocular.model.transition.SparseTransitionModel.TransitionState;
import edu.berkeley.cs.nlp.ocular.train.FontTrainer;
import edu.berkeley.cs.nlp.ocular.util.Tuple2;
import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.*;
import tberg.murphy.indexer.HashMapIndexer;
import tberg.murphy.indexer.Indexer;
import edu.berkeley.cs.nlp.ocular.model.DecodeState;
import static edu.berkeley.cs.nlp.ocular.model.TransitionStateType.*;

/**
 * Tests for {@link FontTrainer#makeFullViterbiStateSeq}, which flattens a
 * per-line array of Viterbi decode states into a single state sequence.
 *
 * NOTE(review): the dump this file was recovered from had its generic type
 * parameters stripped (e.g. {@code Collection>}), which does not compile;
 * they are restored here ({@code Collection<Tuple2<TransitionState, Double>>},
 * {@code Indexer<String>}, {@code List<DecodeState>}).
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class FontTrainEMTests {

	/**
	 * Minimal stub TransitionState carrying only the fields that
	 * makeFullViterbiStateSeq inspects, plus an {@code id} so the test can
	 * identify which states survive the flattening.
	 */
	class TS implements TransitionState {
		public final int id; // test-only identifier, not part of the TransitionState contract
		private int languageIndex;
		private int lmCharIndex;
		private TransitionStateType type;
		private GlyphChar glyphChar;

		public TS(int id, int languageIndex, int lmCharIndex, TransitionStateType type, GlyphChar glyphChar) {
			this.id = id;
			this.languageIndex = languageIndex;
			this.lmCharIndex = lmCharIndex;
			this.type = type;
			this.glyphChar = glyphChar;
		}
		@Override public int getLanguageIndex() { return languageIndex; }
		@Override public int getLmCharIndex() { return lmCharIndex; }
		@Override public TransitionStateType getType() { return type; }
		@Override public GlyphChar getGlyphChar() { return glyphChar; }

		// The remaining interface methods are never exercised by this test,
		// so they return inert placeholder values.
		@Override public int getOffset() { return -1; }
		@Override public int getExposure() { return -1; }
		@Override public Collection<Tuple2<TransitionState, Double>> forwardTransitions() { return null; }
		@Override public Collection<Tuple2<TransitionState, Double>> nextLineStartStates() { return null; }
		@Override public double endLogProb() { return -1; }

		@Override public String toString() {
			return "TS(" + id + ", " + languageIndex + ", " + lmCharIndex + ", " + type + ", " + glyphChar + ")";
		}
	}

	/** Wrap a stub state in a DecodeState with zeroed widths/exposures/padding. */
	private DecodeState DS(TS ts) {
		return new DecodeState(ts, 0, 0, 0, 0);
	}

	@Test
	public void test_makeFullViterbiStateSeq() {

		Indexer<String> charIndexer = new HashMapIndexer<String>();
		charIndexer.index(new String[] { " ", "-", "a", "b", "c" });
		// Two decoded lines: the first ends in a hyphenation (RMRGN_HPHN_INIT
		// followed by RMRGN_HPHN), the second starts with LMRGN_HPHN states.
		DecodeState[][] decodeStates = new DecodeState[][] {
				new DecodeState[] {
						DS(new TS(1, -1, 0, LMRGN, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(2, -1, 0, LMRGN, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(3, -1, 0, TMPL, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(4, 1, 2, TMPL, new GlyphChar(2, GlyphType.NORMAL_CHAR))),
						DS(new TS(5, 1, 3, TMPL, new GlyphChar(3, GlyphType.NORMAL_CHAR))),
						DS(new TS(6, 1, 4, TMPL, new GlyphChar(4, GlyphType.NORMAL_CHAR))),
						DS(new TS(7, 1, 1, RMRGN_HPHN_INIT, new GlyphChar(1, GlyphType.NORMAL_CHAR))),
						DS(new TS(8, 1, 0, RMRGN_HPHN, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(9, 1, 0, RMRGN_HPHN, new GlyphChar(0, GlyphType.NORMAL_CHAR))) },
				new DecodeState[] {
						DS(new TS(10, 1, 0, LMRGN_HPHN, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(11, 1, 0, LMRGN_HPHN, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(12, 1, 0, TMPL, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(13, 1, 2, TMPL, new GlyphChar(2, GlyphType.NORMAL_CHAR))),
						DS(new TS(14, 1, 3, TMPL, new GlyphChar(3, GlyphType.NORMAL_CHAR))),
						DS(new TS(15, 1, 4, TMPL, new GlyphChar(4, GlyphType.NORMAL_CHAR))),
						DS(new TS(16, 1, 0, RMRGN, new GlyphChar(0, GlyphType.NORMAL_CHAR))),
						DS(new TS(17, 1, 0, RMRGN, new GlyphChar(0, GlyphType.NORMAL_CHAR))) }
		};
		List<DecodeState> tsSeq = FontTrainer.makeFullViterbiStateSeq(decodeStates, charIndexer);
		// Only the leading prefix of the flattened sequence is pinned here.
		List<Integer> expectedIds = makeList(2, 3, 4, 1);
		for (int i = 0; i < expectedIds.size(); ++i) {
			assertEquals(expectedIds.get(i).intValue(), ((TS) tsSeq.get(i).ts).id);
		}
	}
}
new GlyphChar(3, GlyphType.NORMAL_CHAR))), 80 | DS(new TS(15, 1, 4, TMPL, new GlyphChar(4, GlyphType.NORMAL_CHAR))), 81 | DS(new TS(16, 1, 0, RMRGN, new GlyphChar(0, GlyphType.NORMAL_CHAR))), 82 | DS(new TS(17, 1, 0, RMRGN, new GlyphChar(0, GlyphType.NORMAL_CHAR))) } 83 | }; 84 | List tsSeq = FontTrainer.makeFullViterbiStateSeq(decodeStates, charIndexer); 85 | List expectedIds = makeList(2, 3, 4, 1); 86 | for (int i = 0; i < expectedIds.size(); ++i) { 87 | assertEquals(expectedIds.get(i).intValue(), ((TS)tsSeq.get(i).ts).id); 88 | } 89 | 90 | 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/test/java/edu/berkeley/cs/nlp/ocular/util/ArrayHelperTests.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.cs.nlp.ocular.util; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import org.junit.Test; 6 | 7 | /** 8 | * @author Dan Garrette (dhgarrette@gmail.com) 9 | */ 10 | public class ArrayHelperTests { 11 | 12 | @Test 13 | public void test_sum_int() { 14 | assertEquals(225, ArrayHelper.sum(new int[] { 50, 0, 150, 25 })); 15 | assertEquals(25, ArrayHelper.sum(new int[] { 25 })); 16 | assertEquals(0, ArrayHelper.sum(new int[] { 0 })); 17 | assertEquals(0, ArrayHelper.sum(new int[] { 0, 0 })); 18 | assertEquals(0, ArrayHelper.sum(new int[0])); 19 | } 20 | 21 | @Test 22 | public void test_avg_int() { 23 | assertEquals(54.8, ArrayHelper.avg(new int[] { 50, 0, 150, 74, 0 }), 1e-9); 24 | assertEquals(67.5, ArrayHelper.avg(new int[] { 50, 150, 70, 0 }), 1e-9); 25 | assertEquals(90, ArrayHelper.avg(new int[] { 50, 150, 70 }), 1e-9); 26 | assertEquals(25.0, ArrayHelper.avg(new int[] { 25 }), 1e-9); 27 | assertEquals(0, ArrayHelper.avg(new int[] { 0 }), 1e-9); 28 | assertEquals(0, ArrayHelper.avg(new int[] { 0, 0 }), 1e-9); 29 | assertEquals(0, ArrayHelper.avg(new int[0]), 1e-9); 30 | } 31 | 32 | @Test 33 | public void test_sum_double() { 34 | assertEquals(2.25, 
ArrayHelper.sum(new double[] { 0.5, 0.0, 1.5, 0.25 }), 1e-9); 35 | assertEquals(0.25, ArrayHelper.sum(new double[] { 0.25 }), 1e-9); 36 | assertEquals(0.0, ArrayHelper.sum(new double[] { 0.0 }), 1e-9); 37 | assertEquals(0.0, ArrayHelper.sum(new double[] { 0.0, 0.0 }), 1e-9); 38 | assertEquals(0.0, ArrayHelper.sum(new double[0]), 1e-9); 39 | } 40 | 41 | @Test 42 | public void test_avg_double() { 43 | assertEquals(0.54, ArrayHelper.avg(new double[] { 0.5, 0.0, 1.5, 0.7, 0.0 }), 1e-9); 44 | assertEquals(0.675, ArrayHelper.avg(new double[] { 0.5, 1.5, 0.7, 0.0 }), 1e-9); 45 | assertEquals(0.9, ArrayHelper.avg(new double[] { 0.5, 1.5, 0.7 }), 1e-9); 46 | assertEquals(0.25, ArrayHelper.avg(new double[] { 0.25 }), 1e-9); 47 | assertEquals(0.0, ArrayHelper.avg(new double[] { 0.0 }), 1e-9); 48 | assertEquals(0.0, ArrayHelper.avg(new double[] { 0.0, 0.0 }), 1e-9); 49 | assertEquals(0.0, ArrayHelper.avg(new double[0]), 1e-9); 50 | } 51 | 52 | @Test 53 | public void test_min_int() { 54 | assertEquals(10, ArrayHelper.min(new int[] { 50, 10, 25, 150, 10, 25 })); 55 | assertEquals(25, ArrayHelper.min(new int[] { 25 })); 56 | assertEquals(20, ArrayHelper.min(new int[] { 20 })); 57 | assertEquals(20, ArrayHelper.min(new int[] { 20, 20 })); 58 | try { 59 | ArrayHelper.min(new int[0]); 60 | fail("exception expected"); 61 | } 62 | catch(RuntimeException e) { 63 | // good 64 | } 65 | } 66 | 67 | @Test 68 | public void test_prepend() { 69 | { 70 | int[] b = ArrayHelper.prepend(0, new int[] { 1, 2, 3 }); 71 | assertEquals(4, b.length); 72 | assertEquals(0, b[0]); 73 | assertEquals(1, b[1]); 74 | assertEquals(2, b[2]); 75 | assertEquals(3, b[3]); 76 | } 77 | { 78 | int[] b = ArrayHelper.prepend(0, new int[] {}); 79 | assertEquals(1, b.length); 80 | assertEquals(0, b[0]); 81 | } 82 | } 83 | 84 | @Test 85 | public void test_append() { 86 | { 87 | Integer[] b = ArrayHelper.append(new Integer[] { 0, 1, 2 }, 3); 88 | assertEquals(4, b.length); 89 | assertEquals((int) 0, (int) b[0]); 90 | 
assertEquals((int) 1, (int) b[1]); 91 | assertEquals((int) 2, (int) b[2]); 92 | assertEquals((int) 3, (int) b[3]); 93 | } 94 | { 95 | Integer[] b = ArrayHelper.append(new Integer[] {}, 0); 96 | assertEquals(1, b.length); 97 | assertEquals((int) 0, (int) b[0]); 98 | } 99 | } 100 | 101 | @Test 102 | public void test_take() { 103 | { 104 | int[] b = ArrayHelper.take(new int[] { 1, 2, 3 }, 2); 105 | assertEquals(2, b.length); 106 | assertEquals(1, b[0]); 107 | assertEquals(2, b[1]); 108 | } 109 | { 110 | int[] b = ArrayHelper.take(new int[] { 1, 2, 3 }, 3); 111 | assertEquals(3, b.length); 112 | assertEquals(1, b[0]); 113 | assertEquals(2, b[1]); 114 | assertEquals(3, b[2]); 115 | } 116 | { 117 | int[] b = ArrayHelper.take(new int[] { 1, 2, 3 }, 0); 118 | assertEquals(0, b.length); 119 | } 120 | { 121 | int[] b = ArrayHelper.take(new int[] { 1, 2, 3 }, 8); 122 | assertEquals(3, b.length); 123 | assertEquals(1, b[0]); 124 | assertEquals(2, b[1]); 125 | assertEquals(3, b[2]); 126 | } 127 | { 128 | int[] b = ArrayHelper.take(new int[] {}, 0); 129 | assertEquals(0, b.length); 130 | } 131 | { 132 | int[] b = ArrayHelper.take(new int[] {}, 2); 133 | assertEquals(0, b.length); 134 | } 135 | } 136 | 137 | @Test 138 | public void test_takeRight() { 139 | { 140 | int[] b = ArrayHelper.takeRight(new int[] { 1, 2, 3 }, 2); 141 | assertEquals(2, b.length); 142 | assertEquals(2, b[0]); 143 | assertEquals(3, b[1]); 144 | } 145 | { 146 | int[] b = ArrayHelper.takeRight(new int[] { 1, 2, 3 }, 3); 147 | assertEquals(3, b.length); 148 | assertEquals(1, b[0]); 149 | assertEquals(2, b[1]); 150 | assertEquals(3, b[2]); 151 | } 152 | { 153 | int[] b = ArrayHelper.takeRight(new int[] { 1, 2, 3 }, 0); 154 | assertEquals(0, b.length); 155 | } 156 | { 157 | int[] b = ArrayHelper.takeRight(new int[] { 1, 2, 3 }, 8); 158 | assertEquals(3, b.length); 159 | assertEquals(1, b[0]); 160 | assertEquals(2, b[1]); 161 | assertEquals(3, b[2]); 162 | } 163 | { 164 | int[] b = ArrayHelper.takeRight(new int[] 
package edu.berkeley.cs.nlp.ocular.util;

import static org.junit.Assert.assertEquals;

import java.util.ArrayList;
import java.util.List;

import org.junit.Test;

/**
 * Unit tests for the path utilities in {@link FileUtil}.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class FileUtilTests {

	@Test
	public void test_lowestCommonPath() {
		{
			// Three paths diverging below /well/this/and.
			List<String> paths = new ArrayList<String>();
			paths.add("/well/this/and/that/");
			paths.add("/well/this/and/the/other.txt");
			paths.add("/well/this/and/thus.txt");
			assertEquals("/well/this/and", FileUtil.lowestCommonPath(paths));
		}
		{
			// A single file path is its own lowest common path.
			List<String> paths = new ArrayList<String>();
			paths.add("/well/this/and/thus.txt");
			assertEquals("/well/this/and/thus.txt", FileUtil.lowestCommonPath(paths));
		}
		{
			// Identical directory paths: the trailing slash is dropped.
			List<String> paths = new ArrayList<String>();
			paths.add("/well/this/and/");
			paths.add("/well/this/and/");
			assertEquals("/well/this/and", FileUtil.lowestCommonPath(paths));
		}
		{
			// A single directory path, trailing slash dropped.
			List<String> paths = new ArrayList<String>();
			paths.add("/well/this/and/");
			assertEquals("/well/this/and", FileUtil.lowestCommonPath(paths));
		}
	}

	@Test
	public void test_pathRelativeTo() {
		String base = "/well/this/and/";
		String plainFile = "/well/this/and/that.txt";
		String extensionless = "/well/this/and/that";
		String nestedFile = "/well/this/and/that/or.txt";
		String trailingSlashDir = "/well/this/and/that/else/";

		assertEquals("that.txt", FileUtil.pathRelativeTo(plainFile, base));
		assertEquals("that", FileUtil.pathRelativeTo(extensionless, base));
		assertEquals("that/or.txt", FileUtil.pathRelativeTo(nestedFile, base));
		// Trailing slash on the relativized directory is dropped.
		assertEquals("that/else", FileUtil.pathRelativeTo(trailingSlashDir, base));
	}

}
package edu.berkeley.cs.nlp.ocular.util;

import static edu.berkeley.cs.nlp.ocular.util.StringHelper.drop;
import static edu.berkeley.cs.nlp.ocular.util.StringHelper.join;
import static edu.berkeley.cs.nlp.ocular.util.StringHelper.last;
import static edu.berkeley.cs.nlp.ocular.util.StringHelper.longestCommonPrefix;
import static edu.berkeley.cs.nlp.ocular.util.StringHelper.take;
import static edu.berkeley.cs.nlp.ocular.util.StringHelper.toUnicode;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.util.Arrays;

import org.junit.Test;

/**
 * Unit tests for the string utilities in {@link StringHelper}.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class StringHelperTests {

	@Test
	public void testToUnicode_string() {
		// A one-character string is rendered as its escaped code point.
		assertEquals("\\u0061", toUnicode("a"));
	}

	@Test
	public void testToUnicode_char() {
		assertEquals("\\u0061", toUnicode('a'));
	}

	@Test
	public void testTake() {
		// Empty source: always empty, whatever the count.
		assertEquals("", take("", 0));
		assertEquals("", take("", -2));
		assertEquals("", take("", 2));
		// Non-positive counts yield the empty string.
		assertEquals("", take("abc", 0));
		assertEquals("", take("abc", -2));
		// Counts at or beyond the length are clamped.
		assertEquals("a", take("a", 1));
		assertEquals("a", take("a", 2));
		assertEquals("ab", take("abc", 2));
	}

	@Test
	public void testDrop() {
		// Empty source: always empty, whatever the count.
		assertEquals("", drop("", 0));
		assertEquals("", drop("", -2));
		assertEquals("", drop("", 2));
		// Non-positive counts leave the string unchanged.
		assertEquals("abc", drop("abc", 0));
		assertEquals("abc", drop("abc", -2));
		// Counts at or beyond the length drop everything.
		assertEquals("", drop("a", 1));
		assertEquals("", drop("a", 2));
		assertEquals("c", drop("abc", 2));
		assertEquals("bc", drop("abc", 1));
	}

	@Test
	public void testLast() {
		assertEquals("a", last("a"));
		assertEquals("c", last("abc"));
		// An empty string has no last character.
		try {
			assertEquals("a", last(""));
			fail();
		} catch (IllegalArgumentException e) {
			// expected
		}
	}

	@Test
	public void testJoin_varargs() {
		// Empty elements vanish in the joined result.
		assertEquals("abc", join("a", "", "b", "c"));
	}

	@Test
	public void testJoin_list() {
		assertEquals("abc", join(Arrays.asList("a", "", "b", "c")));
	}

	@Test
	public void testJoin_list_sep() {
		// The separator still appears around the empty element.
		assertEquals("a;;b;c", join(Arrays.asList("a", "", "b", "c"), ";"));
	}

	@Test
	public void testEquals() {
		// Null-safe equality: null equals only null.
		assertTrue(StringHelper.equals("", ""));
		assertFalse(StringHelper.equals("a", ""));
		assertFalse(StringHelper.equals("", "a"));
		assertFalse(StringHelper.equals(null, ""));
		assertFalse(StringHelper.equals("", null));
		assertFalse(StringHelper.equals(null, "a"));
		assertFalse(StringHelper.equals("a", null));
		assertTrue(StringHelper.equals(null, null));
	}

	@Test
	public void testLongestCommonPrefix() {
		// Returns the length of the shared prefix, expressed via .length()
		// on the expected prefix for readability.
		assertEquals("".length(), longestCommonPrefix("", ""));
		assertEquals("".length(), longestCommonPrefix("abc", ""));
		assertEquals("".length(), longestCommonPrefix("", "abc"));
		assertEquals("ab".length(), longestCommonPrefix("abc", "ab"));
		assertEquals("ab".length(), longestCommonPrefix("ab", "abc"));
		assertEquals("abc".length(), longestCommonPrefix("abc", "abc"));
	}

}
-------------------------------------------------------------------------------- 1 | *.lmser 2 | *.fontser 3 | *.gsmser 4 | *train_output/ 5 | *test_output/ 6 | extracted_lines/ 7 | 8 | -------------------------------------------------------------------------------- /src/test/resources/doc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/src/test/resources/doc.jpg -------------------------------------------------------------------------------- /src/test/resources/doc.txt: -------------------------------------------------------------------------------- 1 | aabc abc bc, bc abc aa bc aa aa bc aa aabc 2 | aabc abc bc bc abc, aa bc aa aa bc aa aabc 3 | aabc abc, bc bc abc aa bc aa, aa bc aa aabc 4 | -------------------------------------------------------------------------------- /src/test/resources/doc_normalized.txt: -------------------------------------------------------------------------------- 1 | aabc abc bc bc abc aa bc aa aa bc aa aabc 2 | aabc abc bc bc abc aa bc aa aa bc aa aabc 3 | aabc abc bc bc abc aa bc aa aa bc aa aabc 4 | -------------------------------------------------------------------------------- /src/test/resources/extracted_lines/doc-line_extract_jpg/line00.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/src/test/resources/extracted_lines/doc-line_extract_jpg/line00.jpg -------------------------------------------------------------------------------- /src/test/resources/extracted_lines/doc-line_extract_jpg/line01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/src/test/resources/extracted_lines/doc-line_extract_jpg/line01.jpg 
-------------------------------------------------------------------------------- /src/test/resources/extracted_lines/doc-line_extract_jpg/line02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tberg12/ocular/3470d41a38a16a2c8b5e77f382c82e51d886cdd6/src/test/resources/extracted_lines/doc-line_extract_jpg/line02.jpg -------------------------------------------------------------------------------- /src/test/resources/initialize_font.sh: -------------------------------------------------------------------------------- 1 | target/start edu.berkeley.cs.nlp.ocular.main.InitializeFont \ 2 | -inputLmPath src/test/resources/doc.lmser \ 3 | -outputFontPath src/test/resources/doc-init.fontser 4 | 5 | target/start edu.berkeley.cs.nlp.ocular.main.InitializeFont \ 6 | -inputLmPath src/test/resources/multiling.lmser \ 7 | -outputFontPath src/test/resources/multiling-init.fontser 8 | -------------------------------------------------------------------------------- /src/test/resources/initialize_lm.sh: -------------------------------------------------------------------------------- 1 | target/start edu.berkeley.cs.nlp.ocular.main.InitializeLanguageModel \ 2 | -inputTextPath src/test/resources/doc.txt \ 3 | -outputLmPath src/test/resources/doc.lmser \ 4 | -minCharCount 0 5 | 6 | target/start edu.berkeley.cs.nlp.ocular.main.InitializeLanguageModel \ 7 | -inputTextPath "Lang1->src/test/resources/doc.txt,Lang2->src/test/resources/doc.txt" \ 8 | -outputLmPath src/test/resources/multiling.lmser \ 9 | -charNgramLength "Lang1->6,Lang2->4" \ 10 | -minCharCount 0 11 | 12 | -------------------------------------------------------------------------------- /src/test/resources/train_font.sh: -------------------------------------------------------------------------------- 1 | target/start edu.berkeley.cs.nlp.ocular.main.TrainFont \ 2 | -inputFontPath src/test/resources/doc-init.fontser \ 3 | -inputLmPath 
src/test/resources/doc.lmser \ 4 | -inputDocPath src/test/resources/doc.jpg \ 5 | -extractedLinesPath src/test/resources/extracted_lines \ 6 | -outputFontPath src/test/resources/doc-trained.fontser \ 7 | -outputPath src/test/resources/train_output \ 8 | -numEmIters 1 9 | # -allowGlyphSubstitution true \ 10 | # -updateGsm true \ 11 | # -outputGsmPath src/test/resources/doc.gsmser \ 12 | 13 | target/start edu.berkeley.cs.nlp.ocular.main.TrainFont \ 14 | -inputFontPath src/test/resources/multiling-init.fontser \ 15 | -inputLmPath src/test/resources/multiling.lmser \ 16 | -inputDocPath src/test/resources/doc.jpg \ 17 | -extractedLinesPath src/test/resources/extracted_lines \ 18 | -outputFontPath src/test/resources/multiling-trained.fontser \ 19 | -outputPath src/test/resources/multiling_train_output \ 20 | -numEmIters 1 \ 21 | -allowGlyphSubstitution true \ 22 | -updateGsm true \ 23 | -outputGsmPath src/test/resources/multiling.gsmser \ 24 | --------------------------------------------------------------------------------