├── .gitignore
├── CIAWFBFixer.scala
├── LICENSE.txt
├── README.md
├── bin
    ├── cwarxml2txt.sh
    ├── cwarxml2txttgn.sh
    ├── download-geonames.sh
    ├── download-wiki-data.sh
    ├── fieldspring
    ├── fix-trconll.sh
    ├── prepare-cwar.sh
    ├── runexps.sh
    └── sbt-launch-0.12.0.jar
├── build
├── build.sbt
├── data
    ├── lists
    │   └── stopwords.english
    └── models
    │   └── getOpenNLPModels.sh
├── lib
    ├── argot_2.9.1-0.3.5-benwing.jar
    ├── codeanticode-GLGraphics-0.9.4.jar
    ├── controlP5-1.5.2.jar
    ├── fhpotsdam-unfolding-0.9.1.jar
    ├── lift-json_2.9.1-2.4.jar
    ├── opengl-core-20120724.jar
    ├── processing-opengl-20120724.jar
    ├── scoobi_2.9.2-0.6.0-cdh3-SNAPSHOT-benwing.jar
    ├── trove-scala_2.9.1-0.0.2-SNAPSHOT.jar
    └── upenn-junto-1.1-assembly.jar
├── project
    └── plugins.sbt
└── src
    ├── main
        ├── java
        │   ├── ags
        │   │   └── utils
        │   │   │   └── KdTree.java
        │   └── opennlp
        │   │   └── fieldspring
        │   │       └── tr
        │   │           ├── app
        │   │               ├── BaseApp.java
        │   │               ├── EvaluateCorpus.java
        │   │               ├── ImportCorpus.java
        │   │               ├── ImportGazetteer.java
        │   │               ├── LabelPropPreproc.java
        │   │               ├── LabelPropPreprocOld.java
        │   │               ├── RunResolver.java
        │   │               └── WriteCorpusToKML.java
        │   │           ├── eval
        │   │               ├── AccuracyEvaluator.java
        │   │               ├── DistanceReport.java
        │   │               ├── DocDistanceEvaluator.java
        │   │               ├── EDEvaluator.java
        │   │               ├── Evaluator.java
        │   │               ├── Report.java
        │   │               ├── SharedNEEvaluator.java
        │   │               └── SignatureEvaluator.java
        │   │           ├── resolver
        │   │               ├── BasicMinDistResolver.java
        │   │               ├── LabelPropComplexResolver.java
        │   │               ├── LabelPropContextSensitiveResolver.java
        │   │               ├── LabelPropDefaultRuleResolver.java
        │   │               ├── RandomResolver.java
        │   │               ├── Resolver.java
        │   │               ├── SimpleDocumentResolver.java
        │   │               └── WeightedMinDistResolver.java
        │   │           ├── text
        │   │               ├── CompactCorpus.java
        │   │               ├── Corpus.java
        │   │               ├── Document.java
        │   │               ├── DocumentSource.java
        │   │               ├── DocumentSourceWrapper.java
        │   │               ├── GeoTextDocument.java
        │   │               ├── Sentence.java
        │   │               ├── SimpleSentence.java
        │   │               ├── SimpleToken.java
        │   │               ├── SimpleToponym.java
        │   │               ├── StoredCorpus.java
        │   │               ├── StoredToken.java
        │   │               ├── StoredToponym.java
        │   │               ├── StreamCorpus.java
        │   │               ├── Token.java
        │   │               ├── Toponym.java
        │   │               ├── io
        │   │               │   ├── CorpusKMLWriter.java
        │   │               │   ├── CorpusXMLSource.java
        │   │               │   ├── CorpusXMLWriter.java
        │   │               │   ├── GeoTextCorpusKMLWriter.java
        │   │               │   ├── GeoTextSource.java
        │   │               │   ├── PlainTextDirSource.java
        │   │               │   ├── PlainTextSource.java
        │   │               │   ├── TextSource.java
        │   │               │   ├── TrXMLDirSource.java
        │   │               │   └── TrXMLSource.java
        │   │               └── prep
        │   │               │   ├── CandidateAnnotator.java
        │   │               │   ├── CandidateRepopulator.java
        │   │               │   ├── HighRecallToponymRecognizer.java
        │   │               │   ├── JythonNER.java
        │   │               │   ├── NamedEntityRecognizer.java
        │   │               │   ├── NamedEntityType.java
        │   │               │   ├── OpenNLPRecognizer.java
        │   │               │   ├── OpenNLPSentenceDivider.java
        │   │               │   ├── OpenNLPTokenizer.java
        │   │               │   ├── ScriptNER.java
        │   │               │   ├── SentenceDivider.java
        │   │               │   ├── Tokenizer.java
        │   │               │   ├── ToponymAnnotator.java
        │   │               │   └── ToponymRemover.java
        │   │           ├── topo
        │   │               ├── Coordinate.java
        │   │               ├── Location.java
        │   │               ├── PointRegion.java
        │   │               ├── PointSetRegion.java
        │   │               ├── RectRegion.java
        │   │               ├── Region.java
        │   │               └── gaz
        │   │               │   ├── CandidateList.java
        │   │               │   ├── FilteredGeoNamesReader.java
        │   │               │   ├── Gazetteer.java
        │   │               │   ├── GazetteerFileReader.java
        │   │               │   ├── GazetteerLineReader.java
        │   │               │   ├── GazetteerReader.java
        │   │               │   ├── GeoNamesGazetteer.java
        │   │               │   ├── GeoNamesGazetteerWithList.java
        │   │               │   ├── GeoNamesReader.java
        │   │               │   ├── InMemoryGazetteer.java
        │   │               │   ├── LoadableGazetteer.java
        │   │               │   ├── MultiGazetteer.java
        │   │               │   └── WorldReader.java
        │   │           └── util
        │   │               ├── Constants.java
        │   │               ├── CountingLexicon.java
        │   │               ├── DoubleStringPair.java
        │   │               ├── EditMapper.java
        │   │               ├── FastTrig.java
        │   │               ├── FastTrig.java~
        │   │               ├── IOUtil.java
        │   │               ├── KMLUtil.java
        │   │               ├── KMLUtil.java~
        │   │               ├── Lexicon.java
        │   │               ├── MemoryUtil.java
        │   │               ├── SimpleCountingLexicon.java
        │   │               ├── SimpleLexicon.java
        │   │               ├── Span.java
        │   │               ├── StringDoublePair.java
        │   │               ├── StringEditMapper.java
        │   │               ├── StringUtil.java
        │   │               ├── TopoUtil.java
        │   │               ├── ToponymFinder.java
        │   │               └── XMLUtil.java
        ├── python
        │   ├── article_statistics.py
        │   ├── convert-infochimps.py
        │   ├── convert_to_new_article_format.py
        │   ├── find-first-tweet-time.py
        │   ├── fix_redirects.py
        │   ├── format-thresh-grid.py
        │   ├── generate-numbers.py
        │   ├── generate_combined.py
        │   ├── ner
        │   │   ├── DummyNER.py
        │   │   └── stanford2places.py
        │   ├── nlputil.py
        │   ├── parse-wex.py
        │   ├── permute_wiki.py
        │   ├── process_article_data.py
        │   ├── processwiki.py
        │   ├── run-geolocate-exper.py
        │   ├── split_bzip.py
        │   ├── splitdevtest.py
        │   ├── tei2txt.py
        │   ├── tei_entities.py
        │   ├── trrraw2plain.py
        │   ├── twitter-graphs
        │   │   ├── twitter.py
        │   │   └── twitterRelationGraphs.py
        │   ├── twitter_geotext_process.py
        │   ├── twitter_to_lda.py
        │   └── unescape_entities.py
        ├── resources
        │   └── data
        │   │   ├── deu
        │   │       └── stopwords.txt
        │   │   ├── eng
        │   │       ├── stopwords.txt
        │   │       └── stopwords.txt.old
        │   │   ├── geo
        │   │       └── country-codes.txt
        │   │   └── por
        │   │       └── stopwords.txt
        └── scala
        │   └── opennlp
        │       └── fieldspring
        │           ├── geolocate
        │               ├── CombinedModelCell.scala
        │               ├── GenerateKML.scala
        │               ├── Geolocate.scala
        │               ├── Hadoop.scala
        │               ├── KDTreeCell.scala
        │               ├── MultiRegularCell.scala
        │               ├── SphereCell.scala
        │               ├── SphereCellDist.scala
        │               ├── SphereDocument.scala
        │               ├── SphereEvaluation.scala
        │               ├── TwitterDocument.scala
        │               ├── WikipediaDocument.scala
        │               └── toponym
        │               │   └── Toponym.scala
        │           ├── gridlocate
        │               ├── Cell.scala
        │               ├── CellDist.scala
        │               ├── DistDocument.scala
        │               ├── Evaluation.scala
        │               ├── GridLocate.scala
        │               ├── Reranker.scala
        │               └── TextGrounderInfo.scala
        │           ├── perceptron
        │               ├── Memoizer.scala
        │               ├── Perceptron.scala
        │               └── package.scala
        │           ├── poligrounder
        │               ├── Poligrounder.scala
        │               ├── TimeCell.scala
        │               └── TimeDocument.scala
        │           ├── postprocess
        │               ├── DocumentPinKMLGenerator.scala
        │               ├── DocumentRankerByError.scala
        │               ├── ErrorKMLGenerator.scala
        │               ├── KNNKMLGenerator.scala
        │               ├── WordRankerByAvgError.scala
        │               └── WordRankerByAvgErrorUT.scala
        │           ├── preprocess
        │               ├── ConvertTwitterInfochimps.scala
        │               ├── ExtractGeotaggedListFromWikiDump.scala
        │               ├── ExtractLinksFromWikiDump.scala
        │               ├── FindPolitical.scala
        │               ├── FrobTextDB.scala
        │               ├── MergeMetadataAndOldCounts.scala
        │               ├── OldGroupCorpus.scala
        │               ├── ParseTweets.scala
        │               ├── Permute.scala
        │               ├── ProcessFiles.scala
        │               ├── ScoobiConvertTwitterInfochimps.scala
        │               ├── ScoobiProcessFilesApp.scala
        │               ├── ScoobiWordCount.scala
        │               └── TwitterPullLocationVariance.scala
        │           ├── tr
        │               ├── app
        │               │   ├── ConvertCorpusToPlaintext.scala
        │               │   ├── ConvertCorpusToToponymAsDoc.scala
        │               │   ├── ConvertCorpusToUnigramCounts.scala
        │               │   ├── ConvertCwarToGoldCorpus.scala
        │               │   ├── ConvertGeoTextToJSON.scala
        │               │   ├── CorpusErrorAnalyzer.scala
        │               │   ├── CorpusInfo.scala
        │               │   ├── FilterGeotaggedWiki.scala
        │               │   ├── GazEntryKMLPlotter.scala
        │               │   ├── GeoTextLabelProp.scala
        │               │   ├── GeoTextLabelPropDecoder.scala
        │               │   ├── GeoTextLabelPropPreproc.scala
        │               │   ├── Preprocess.scala
        │               │   ├── ReprocessTrApp.scala
        │               │   ├── SplitDevTest.scala
        │               │   ├── SupervisedTRMaxentModelTrainer.scala
        │               │   ├── TrainingDirectoriesCombiner.scala
        │               │   └── VisualizeCorpus.scala
        │               ├── model
        │               │   └── AltBasicMinDistModel.scala
        │               ├── resolver
        │               │   ├── BayesRuleResolver.scala
        │               │   ├── DocDistResolver.scala
        │               │   ├── GaussianTPPResolver.scala
        │               │   ├── HeuristicTPPResolver.scala
        │               │   ├── LabelPropResolver.scala
        │               │   ├── MaxentResolver.scala
        │               │   ├── PopulationResolver.scala
        │               │   ├── ProbabilisticResolver.scala
        │               │   ├── TPPResolver.scala
        │               │   └── ToponymAsDocDistResolver.scala
        │               ├── text
        │               │   └── io
        │               │   │   ├── DynamicKMLWriter.scala
        │               │   │   ├── GigawordSource.scala
        │               │   │   └── WikiTextSource.scala
        │               ├── topo
        │               │   ├── SphericalGeometry.scala
        │               │   ├── gaz
        │               │   │   ├── CorpusGazetteerReader.scala
        │               │   │   └── geonames
        │               │   │   │   └── GeoNamesParser.scala
        │               │   └── util
        │               │   │   └── CodeConverter.scala
        │               ├── tpp
        │               │   ├── ACOTPPSolver.scala
        │               │   ├── ClusterMarketCreator.scala
        │               │   ├── ConstructionTPPSolver.scala
        │               │   ├── FileTravelCoster.scala
        │               │   ├── GaussianPurchaseCoster.scala
        │               │   ├── GaussianTravelCoster.scala
        │               │   ├── GaussianUtil.scala
        │               │   ├── GridMarketCreator.scala
        │               │   ├── LinkTravelCoster.scala
        │               │   ├── LinkTravelWriter.scala
        │               │   ├── MarketCreator.scala
        │               │   ├── MaxentPurchaseCoster.scala
        │               │   ├── MultiPurchaseCoster.scala
        │               │   ├── PurchaseCoster.scala
        │               │   ├── SimpleContainmentPurchaseCoster.scala
        │               │   ├── SimpleDistanceTravelCoster.scala
        │               │   ├── TPPInstance.scala
        │               │   ├── TPPSolver.scala
        │               │   └── TravelCoster.scala
        │               └── util
        │               │   ├── Average.scala
        │               │   ├── DistanceTable.scala
        │               │   ├── LogUtil.scala
        │               │   ├── StopwordUtil.scala
        │               │   ├── TextUtil.scala
        │               │   ├── cluster
        │               │       └── KMeans.scala
        │               │   └── sanity
        │               │       └── CandidateCheck.scala
        │           ├── util
        │               ├── MeteredTask.scala
        │               ├── Serializer.scala
        │               ├── WikiRelFreqs.scala
        │               ├── argparser.scala
        │               ├── collectionutil.scala
        │               ├── distances.scala
        │               ├── experiment.scala
        │               ├── hadoop.scala
        │               ├── ioutil.scala
        │               ├── mathutil.scala
        │               ├── osutil.scala
        │               ├── printutil.scala
        │               ├── textdbutil.scala
        │               ├── textutil.scala
        │               ├── timeutil.scala
        │               └── twokenize.scala
        │           └── worddist
        │               ├── BigramWordDist.scala.bitrotted
        │               ├── DirichletUnigramWordDist.scala
        │               ├── DiscountedUnigramWordDist.scala
        │               ├── FastDiscountedUnigramWordDist.scala
        │               ├── JelinekMercerUnigramWordDist.scala
        │               ├── Memoizer.scala
        │               ├── NgramWordDist.scala
        │               ├── PseudoGoodTuringBigramWordDist.scala.bitrotted
        │               ├── PseudoGoodTuringUnigramWordDist.scala
        │               ├── UnigramWordDist.scala
        │               ├── UnsmoothedNgramWordDist.scala
        │               └── WordDist.scala
    └── test
        └── scala
            ├── opennlp
                └── fieldspring
                │   └── topo
                │       └── Coordinate.scala
            └── testparse.scala


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.class
 2 | *.log
 3 | 
 4 | *.kml
 5 | *.gz
 6 | *.class
 7 | *.orig
 8 | target/
 9 | tmp/
10 | *~
11 | project/boot
12 | lib_managed/
13 | data/models/*.bin
14 | data/gazetteers/*.zip
15 | 
16 | 
17 | # sbt specific
18 | dist/*
19 | target/
20 | lib_managed/
21 | src_managed/
22 | project/boot/
23 | project/plugins/project/
24 | 
25 | # Scala-IDE specific
26 | .scala_dependencies


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | fieldspring
2 | ===========
3 | 
4 | A system for disambiguating toponyms (placenames) given textual context and creating visualizations of the locations referenced in a given text or corpus.


--------------------------------------------------------------------------------
/bin/cwarxml2txt.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | indir=${1%/}
 4 | outdir=${2%/}
 5 | 
 6 | for f in $indir/*.xml
 7 | do
 8 |   filename=$(basename $f)
 9 |   filename=${filename%.*}
10 |   grep '<milestone unit="sentence"' $f | sed 's/<[^<>]*>//g' > $outdir/$filename.txt
11 | done
12 | 


--------------------------------------------------------------------------------
/bin/cwarxml2txttgn.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | indir=${1%/}
 4 | outdir=${2%/}
 5 | 
 6 | if [ ! -e $outdir ]; then
 7 |     mkdir $outdir
 8 | fi
 9 | 
10 | for f in $indir/*.xml
11 | do
12 |   filename=$(basename $f)
13 |   filename=${filename%.*}
14 |   grep '<milestone unit="sentence"' $f | sed -re 's/tgn,([^"]+)">([^<]+)/>tgn,\1-\2-]]/' | sed -re 's/tgn,([^"]+)-(\w+) (\w+)-]]/tgn,\1-\2-\3-]]/' | sed 's/<[^<>]*>//g' > $outdir/$filename.txt
15 | done
16 | 


--------------------------------------------------------------------------------
/bin/download-geonames.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ -z $FIELDSPRING_DIR ]; then
 4 |     echo "You must set the environment variable FIELDSPRING_DIR to point to Fieldspring's installation directory."
 5 |     exit
 6 | fi
 7 | 
 8 | origwd=`pwd`
 9 | 
10 | if [ ! -e $FIELDSPRING_DIR/data/gazetteers/allCountries.zip ]; then
11 |     cd $FIELDSPRING_DIR/data/gazetteers
12 |     wget http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/allCountries.zip
13 | fi
14 | 
15 | 
16 | 
17 | cd $origwd
18 | 


--------------------------------------------------------------------------------
/bin/download-wiki-data.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | wget -r http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wistr-models-cwardev-gt
 4 | wget -r http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wistr-models-cwartest-gt
 5 | wget -r http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wistr-models-trdev-gt
 6 | wget -r http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wistr-models-trtest-gt
 7 | wget http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wiki/enwiki-cwardev-20spd-100.log
 8 | wget http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wiki/enwiki-cwartest-20spd-100.log
 9 | wget http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wiki/enwiki-trconlldev-100.log
10 | wget http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wiki/enwiki-trconlltest-100.log
11 | 


--------------------------------------------------------------------------------
/bin/fix-trconll.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | indir=${1%%/}
 4 | outdir=${2%%/}
 5 | 
 6 | # Fixes countries from CIA World Factbook that had -0 as longitude:
 7 | fieldspring run CIAWFBFixer $3 $indir $outdir
 8 | 
 9 | # Fixes states with swapped coordinates:
10 | for fullpath in $outdir/*.xml
11 | do
12 |   filename=${fullpath##*/}
13 |   sed -i -e's/\(^.*US_STATE.*lat=\"\)\([^"]*\)\(\".*long=\"\)\([^"]*\)\(\".*$\)/\1\4\3\2\5/' $outdir/$filename
14 | done
15 | 


--------------------------------------------------------------------------------
/bin/prepare-cwar.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ -z $FIELDSPRING_DIR ]; then
 4 |     echo "You must set the environment variable FIELDSPRING_DIR to point to Fieldspring's installation directory."
 5 |     exit
 6 | fi
 7 | 
 8 | origcwarxmldir=${1%/}
 9 | pathtokml=$2
10 | pathtogaz=$3
11 | cwarxmloutdir=${4%/}
12 | 
13 | echo "Converting original Cwar corpus to plain format..."
14 | cwarxml2txttgn.sh $origcwarxmldir cwarplaintgn
15 | echo "Splitting corpus into dev and test sets..."
16 | fieldspring --memory 2g run opennlp.fieldspring.tr.app.SplitDevTest cwarplaintgn
17 | if [ ! -e $cwarxmloutdir ]; then
18 |     mkdir $cwarxmloutdir
19 | fi
20 | if [ ! -e $cwarxmloutdir/dev ]; then
21 |     mkdir $cwarxmloutdir/dev
22 | fi
23 | if [ ! -e $cwarxmloutdir/test ]; then
24 |     mkdir $cwarxmloutdir/test
25 | fi
26 | 
27 | echo "Converting dev corpus to Fieldspring format..."
28 | fieldspring --memory 8g run opennlp.fieldspring.tr.app.ConvertCwarToGoldCorpus cwarplaintgndev $pathtokml $pathtogaz > $cwarxmloutdir/dev/cwar-dev.xml
29 | echo "Converting test corpus to Fieldspring format..."
30 | fieldspring --memory 8g run opennlp.fieldspring.tr.app.ConvertCwarToGoldCorpus cwarplaintgntest $pathtokml $pathtogaz > $cwarxmloutdir/test/cwar-test.xml
31 | 
32 | echo "Deleting temporary files..."
33 | rm -rf cwarplaintgn
34 | rm -rf cwarplaintgndev
35 | rm -rf cwarplaintgntest
36 | echo "Done."
37 | 
38 | 


--------------------------------------------------------------------------------
/bin/sbt-launch-0.12.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/bin/sbt-launch-0.12.0.jar


--------------------------------------------------------------------------------
/build:
--------------------------------------------------------------------------------
1 | java -Dfile.encoding=UTF8 -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=256m -Xmx1024M -Xss2M -jar bin/sbt-launch-*.jar "$@"
2 | 


--------------------------------------------------------------------------------
/data/models/getOpenNLPModels.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | wget http://opennlp.sourceforge.net/models-1.5/en-ner-location.bin
4 | wget http://opennlp.sourceforge.net/models-1.5/en-token.bin
5 | wget http://opennlp.sourceforge.net/models-1.5/en-sent.bin


--------------------------------------------------------------------------------
/lib/argot_2.9.1-0.3.5-benwing.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/argot_2.9.1-0.3.5-benwing.jar


--------------------------------------------------------------------------------
/lib/codeanticode-GLGraphics-0.9.4.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/codeanticode-GLGraphics-0.9.4.jar


--------------------------------------------------------------------------------
/lib/controlP5-1.5.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/controlP5-1.5.2.jar


--------------------------------------------------------------------------------
/lib/fhpotsdam-unfolding-0.9.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/fhpotsdam-unfolding-0.9.1.jar


--------------------------------------------------------------------------------
/lib/lift-json_2.9.1-2.4.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/lift-json_2.9.1-2.4.jar


--------------------------------------------------------------------------------
/lib/opengl-core-20120724.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/opengl-core-20120724.jar


--------------------------------------------------------------------------------
/lib/processing-opengl-20120724.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/processing-opengl-20120724.jar


--------------------------------------------------------------------------------
/lib/scoobi_2.9.2-0.6.0-cdh3-SNAPSHOT-benwing.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/scoobi_2.9.2-0.6.0-cdh3-SNAPSHOT-benwing.jar


--------------------------------------------------------------------------------
/lib/trove-scala_2.9.1-0.0.2-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/trove-scala_2.9.1-0.0.2-SNAPSHOT.jar


--------------------------------------------------------------------------------
/lib/upenn-junto-1.1-assembly.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/upenn-junto-1.1-assembly.jar


--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.8.3")
2 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/app/ImportGazetteer.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This class imports a gazetteer from a text file and serializes it, to be read quickly by RunResolver quickly.
 3 |  */
 4 | 
 5 | package opennlp.fieldspring.tr.app;
 6 | 
 7 | import opennlp.fieldspring.tr.topo.gaz.*;
 8 | import opennlp.fieldspring.tr.util.*;
 9 | import java.io.*;
10 | import java.util.zip.*;
11 | 
12 | public class ImportGazetteer extends BaseApp {
13 | 
14 |     public static void main(String[] args) throws Exception {
15 |         ImportGazetteer currentRun = new ImportGazetteer();
16 |         currentRun.initializeOptionsFromCommandLine(args);
17 |         currentRun.serialize(currentRun.doImport(currentRun.getInputPath(), currentRun.isDoingKMeans()), currentRun.getOutputPath());
18 |     }
19 | 
20 |     public GeoNamesGazetteer doImport(String gazInputPath, boolean runKMeans) throws Exception {
21 |         System.out.println("Reading GeoNames gazetteer from " + gazInputPath + " ...");
22 | 
23 |         checkExists(gazInputPath);
24 |         
25 |         GeoNamesGazetteer gnGaz = null;
26 |         if(gazInputPath.toLowerCase().endsWith(".zip")) {
27 |             ZipFile zf = new ZipFile(gazInputPath);
28 |             ZipInputStream zis = new ZipInputStream(new FileInputStream(gazInputPath));
29 |             ZipEntry ze = zis.getNextEntry();
30 |             gnGaz = new GeoNamesGazetteer(new BufferedReader(new InputStreamReader(zf.getInputStream(ze))), runKMeans);
31 |             zis.close();
32 |         }
33 |         else {
34 |             gnGaz = new GeoNamesGazetteer(new BufferedReader(new FileReader(gazInputPath)), runKMeans);
35 |         }
36 | 
37 |         System.out.println("Done.");
38 | 
39 |         return gnGaz;
40 |     }
41 | 
42 |     public void serialize(GeoNamesGazetteer gnGaz, String serializedGazOutputPath) throws Exception {
43 |         System.out.print("Serializing GeoNames gazetteer to " + serializedGazOutputPath + " ...");
44 | 
45 |         ObjectOutputStream oos = null;
46 |         if(serializedGazOutputPath.toLowerCase().endsWith(".gz")) {
47 |             GZIPOutputStream gos = new GZIPOutputStream(new FileOutputStream(serializedGazOutputPath));
48 |             oos = new ObjectOutputStream(gos);
49 |         }
50 |         else {
51 |             FileOutputStream fos = new FileOutputStream(serializedGazOutputPath);
52 |             oos = new ObjectOutputStream(fos);
53 |         }
54 |         oos.writeObject(gnGaz);
55 |         oos.close();
56 | 
57 |         System.out.println("done.");
58 |     }
59 | }
60 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/app/WriteCorpusToKML.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This class takes a corpus with system resolved toponyms and generates a KML file visualizable in Google Earth.
 3 |  */
 4 | 
 5 | package opennlp.fieldspring.tr.app;
 6 | 
 7 | import opennlp.fieldspring.tr.text.*;
 8 | import opennlp.fieldspring.tr.text.io.*;
 9 | import opennlp.fieldspring.tr.util.*;
10 | import java.io.*;
11 | 
12 | public class WriteCorpusToKML extends BaseApp {
13 | 
14 |     public static void main(String[] args) throws Exception {
15 | 
16 |         WriteCorpusToKML currentRun = new WriteCorpusToKML();
17 |         currentRun.initializeOptionsFromCommandLine(args);
18 | 
19 |         if(currentRun.getSerializedCorpusInputPath() == null) {
20 |             System.out.println("Please specify an input corpus in serialized format via the -sci flag.");
21 |             System.exit(0);
22 |         }
23 | 
24 |         if(currentRun.getKMLOutputPath() == null) {
25 |             System.out.println("Please specify a KML output path via the -ok flag.");
26 |             System.exit(0);
27 |         }
28 | 
29 |         System.out.print("Reading serialized corpus from " + currentRun.getSerializedCorpusInputPath() + " ...");
30 |         Corpus corpus = TopoUtil.readCorpusFromSerialized(currentRun.getSerializedCorpusInputPath());
31 |         System.out.println("done.");
32 | 
33 |         currentRun.writeToKML(corpus, currentRun.getKMLOutputPath(), currentRun.getOutputGoldLocations(), currentRun.getOutputUserKML(), currentRun.getCorpusFormat());
34 |     }
35 | 
36 |     public void writeToKML(Corpus corpus, String kmlOutputPath, boolean outputGoldLocations, boolean outputUserKML, Enum<BaseApp.CORPUS_FORMAT> corpusFormat) throws Exception {
37 |         System.out.print("Writing visualizable corpus in KML format to " + kmlOutputPath + " ...");
38 |         CorpusKMLWriter kw;
39 |         if(corpusFormat == CORPUS_FORMAT.GEOTEXT && outputUserKML)
40 |             kw = new GeoTextCorpusKMLWriter(corpus, outputGoldLocations);
41 |         else
42 |             kw = new CorpusKMLWriter(corpus, outputGoldLocations);
43 |         kw.write(new File(kmlOutputPath));
44 |         System.out.println("done.");
45 |     }
46 | }
47 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/eval/AccuracyEvaluator.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This is a simple Evaluator that assumes gold named entities were used in preprocessing. For each gold disambiguated toponym, the model
 3 |  * either got that Location right or wrong, and a Report containing the accuracy figure on this task is returned.
 4 |  */
 5 | 
 6 | package opennlp.fieldspring.tr.eval;
 7 | 
 8 | import opennlp.fieldspring.tr.text.*;
 9 | 
10 | public class AccuracyEvaluator extends Evaluator {
11 | 
12 |     public AccuracyEvaluator(Corpus corpus) {
13 |         super(corpus);
14 |     }
15 | 
16 |     @Override
17 |     public Report evaluate() {
18 | 
19 |         Report report = new Report();
20 | 
21 |         for(Document<Token> doc : corpus) {
22 |             for(Sentence<Token> sent : doc) {
23 |                 for(Toponym toponym : sent.getToponyms()) {
24 |                     if(toponym.hasGold()) {
25 |                         if(toponym.getGoldIdx() == toponym.getSelectedIdx()) {
26 |                             report.incrementTP();
27 |                         }
28 |                         else {
29 |                             report.incrementInstanceCount();
30 |                         }
31 |                     }
32 |                 }
33 |             }
34 |         }
35 | 
36 |         return report;
37 |     }
38 | 
39 |     @Override
40 |     public Report evaluate(Corpus<Token> pred, boolean useSelected) {
41 |         return null;
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/eval/DistanceReport.java:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.eval;
 2 | 
 3 | import java.util.*;
 4 | 
 5 | public class DistanceReport {
 6 | 
 7 |     private List<Double> distances = new ArrayList<Double>();
 8 |     private boolean isSorted = true;
 9 | 
10 |     public void addDistance(double distance) {
11 |         distances.add(distance);
12 |         isSorted = false;
13 |     }
14 | 
15 |     public double getMeanDistance() {
16 |         if(distances.size() == 0) return -1;
17 | 
18 |         double total = 0.0;
19 |         for(double distance : distances) {
20 |             total += distance;
21 |         }
22 |         return total / distances.size();
23 |     }
24 | 
25 |     public double getMedianDistance() {
26 |         if(distances.size() == 0) return -1;
27 |         sort();
28 |         return distances.get(distances.size() / 2);
29 |     }
30 | 
31 |     public int getNumDistances() {
32 |         return distances.size();
33 |     }
34 | 
35 |     public double getFractionDistancesWithinThreshold(double threshold) {
36 |         int count = 0;
37 |         for(double distance : distances)
38 |             if(distance <= threshold)
39 |                 count++;
40 |         return ((double)count) / distances.size();
41 |     }
42 | 
43 |     public double getMinDistance() {
44 |         if(distances.size() == 0) return -1;
45 |         sort();
46 |         return distances.get(0);
47 |     }
48 | 
49 |     public double getMaxDistance() {
50 |         if(distances.size() == 0) return -1;
51 |         sort();
52 |         return distances.get(distances.size()-1);
53 |     }
54 | 
55 |     private void sort() {
56 |         if(isSorted)
57 |             return;
58 |         Collections.sort(distances);
59 |         isSorted = true;
60 |     }
61 | }
62 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/eval/DocDistanceEvaluator.java:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.eval;
 2 | 
 3 | import opennlp.fieldspring.tr.text.*;
 4 | import opennlp.fieldspring.tr.topo.*;
 5 | 
 6 | public class DocDistanceEvaluator {
 7 | 
 8 |     protected final Corpus<Token> corpus;
 9 | 
10 |     public DocDistanceEvaluator(Corpus<? extends Token> corpus) {
11 |         this.corpus = (Corpus<Token>)corpus;
12 |     }
13 | 
14 |   /* Evaluate the "selected" candidates in the corpus using its "gold"
15 |    * candidates. */
16 |     public DistanceReport evaluate() {
17 |         DistanceReport dreport = new DistanceReport();
18 | 
19 |         for(Document<Token> doc : corpus) {
20 | 
21 |             if(!doc.isTrain()) {
22 | 
23 |                 Coordinate systemCoord = doc.getSystemCoord();
24 |                 Coordinate goldCoord = doc.getGoldCoord();
25 | 
26 |                 if(systemCoord != null && goldCoord != null) {
27 |                     dreport.addDistance(systemCoord.distanceInKm(goldCoord));
28 |                 }
29 |             }
30 |         }
31 | 
32 |         return dreport;
33 |     }
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/eval/EDEvaluator.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.eval;
17 | 
18 | import java.util.Iterator;
19 | 
20 | import opennlp.fieldspring.tr.text.Document;
21 | import opennlp.fieldspring.tr.text.Corpus;
22 | import opennlp.fieldspring.tr.text.Sentence;
23 | import opennlp.fieldspring.tr.text.Token;
24 | 
25 | public class EDEvaluator extends Evaluator {
26 |   public EDEvaluator(Corpus<Token> corpus) {
27 |     super(corpus);
28 |   }
29 | 
30 |   public Report evaluate() {
31 |     return null;
32 |   }
33 | 
34 |   public Report evaluate(Corpus<Token> pred, boolean useSelected) {
35 |     Iterator<Document<Token>> goldDocs = this.corpus.iterator();
36 |     Iterator<Document<Token>> predDocs = pred.iterator();
37 | 
38 |     while (goldDocs.hasNext() && predDocs.hasNext()) {
39 |       Iterator<Sentence<Token>> goldSents = goldDocs.next().iterator();
40 |       Iterator<Sentence<Token>> predSents = predDocs.next().iterator();
41 | 
42 |       while (goldSents.hasNext() && predSents.hasNext()) {
43 |       }
44 | 
45 |       assert !goldSents.hasNext() && !predSents.hasNext() : "Documents have different numbers of sentences.";
46 |     }
47 | 
48 |     assert !goldDocs.hasNext() && !predDocs.hasNext() : "Corpora have different numbers of documents.";
49 |     return null;
50 |   }
51 | }
52 | 
53 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/eval/Evaluator.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.eval;
17 | 
18 | import opennlp.fieldspring.tr.text.Corpus;
19 | import opennlp.fieldspring.tr.text.Token;
20 | 
21 | public abstract class Evaluator {
22 |   protected final Corpus<Token> corpus;
23 | 
24 |   /* The given corpus should include either gold or selected candidates or
25 |    * both. */
26 |   public Evaluator(Corpus<? extends Token> corpus) {
27 |     this.corpus = (Corpus<Token>) corpus;
28 |   }
29 | 
30 |   /* Evaluate the "selected" candidates in the corpus using its "gold"
31 |    * candidates. */
32 |   public abstract Report evaluate();
33 | 
34 |   /* Evaluate the given corpus using either the gold or selected candidates in
35 |    * the current corpus. */
36 |   public abstract Report evaluate(Corpus<Token> pred, boolean useSelected);
37 | 
38 |   /* A convenience method providing a default for evaluate. */
39 |   public Report evaluate(Corpus<Token> pred) {
40 |     return this.evaluate(pred, false);
41 |   }
42 | }
43 | 
44 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/eval/Report.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.eval;
17 | 
18 | public class Report {
19 | 
20 |     private int tp;
21 |     private int fp;
22 |     private int fn;
23 |     private int totalInstances;
24 | 
25 |     public int getFN() {
26 |         return fn;
27 |     }
28 | 
29 |     public int getFP() {
30 |         return fp;
31 |     }
32 | 
33 |     public int getTP() {
34 |         return tp;
35 |     }
36 | 
37 |     public int getInstanceCount() {
38 |         return totalInstances;
39 |     }
40 | 
41 |     public void incrementTP() {
42 |         tp++;
43 |         totalInstances++;
44 |     }
45 | 
46 |     public void incrementFP() {
47 |         fp++;
48 |         totalInstances++;
49 |     }
50 | 
51 |     public void incrementFN() {
52 |         fn++;
53 |         totalInstances++;
54 |     }
55 | 
56 |     public void incrementFPandFN() {
57 |         fp++;
58 |         fn++;
59 |         totalInstances++;
60 |     }
61 | 
62 |     public void incrementInstanceCount() {
63 |         totalInstances++;
64 |     }
65 | 
66 |     public double getAccuracy() {
67 |         return (double) tp / totalInstances;
68 |     }
69 | 
70 |     public double getPrecision() {
71 |         return (double) tp / (tp + fp);
72 |     }
73 | 
74 |     public double getRecall() {
75 |         return (double) tp / (tp + fn);
76 |     }
77 | 
78 |     public double getFScore() {
79 |         double p = getPrecision();
80 |         double r = getRecall();
81 |         return (2 * p * r) / (p + r);
82 |     }
83 | }
84 | 
85 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/resolver/RandomResolver.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Random baseline resolver. Selects a random location for each toponym.
 3 |  */
 4 | 
 5 | package opennlp.fieldspring.tr.resolver;
 6 | 
 7 | import opennlp.fieldspring.tr.text.*;
 8 | import java.util.*;
 9 | 
10 | public class RandomResolver extends Resolver {
11 | 
12 |     private Random rand = new Random();
13 | 
14 |     @Override
15 |     public StoredCorpus disambiguate(StoredCorpus corpus) {
16 | 
17 |         for(Document<StoredToken> doc : corpus) {
18 |             for(Sentence<StoredToken> sent : doc) {
19 |                 for(Toponym toponym : sent.getToponyms()) {
20 |                     int ambiguity = toponym.getAmbiguity();
21 |                     if (ambiguity > 0 && (overwriteSelecteds || !toponym.hasSelected())) {
22 |                         toponym.setSelectedIdx(rand.nextInt(ambiguity));
23 |                     }
24 |                 }
25 |             }
26 |         }
27 |         
28 |         return corpus;
29 |     }    
30 | }
31 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/resolver/Resolver.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This version of Resolver (started 9/22/10) is just an abstract class with the disambiguate(Corpus) method.
 3 |  */
 4 | 
 5 | package opennlp.fieldspring.tr.resolver;
 6 | 
 7 | import opennlp.fieldspring.tr.text.*;
 8 | 
 9 | /**
10 |  * @param corpus
11 |  *        a corpus without any selected candidates for each toponym (or ignores the selections if they are present)
12 |  * @return
13 |  *        a corpus with selected candidates, ready for evaluation
14 |  */
15 | public abstract class Resolver {
16 | 
17 |     // Make this false to have a resolver only resolve toponyms that don't already have a selected candidate
18 |     // (not implemented in all resolvers yet)
19 |     public boolean overwriteSelecteds = true;
20 | 
21 |     public void train(StoredCorpus corpus) {
22 |         throw new UnsupportedOperationException("This type of resolver cannot be trained.");
23 |     }
24 | 
25 |     public abstract StoredCorpus disambiguate(StoredCorpus corpus);
26 |     
27 | }
28 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/Corpus.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text;
17 | 
18 | import java.util.Iterator;
19 | 
20 | import opennlp.fieldspring.tr.util.Lexicon;
21 | import opennlp.fieldspring.tr.app.*;
22 | import java.io.*;
23 | 
24 | public abstract class Corpus<A extends Token> implements Iterable<Document<A>>, Serializable {
25 | 
26 |     private static Enum<BaseApp.CORPUS_FORMAT> corpusFormat = null;//BaseApp.CORPUS_FORMAT.PLAIN;
27 | 
28 |   public abstract void addSource(DocumentSource source);
29 |   public abstract void close();
30 | 
31 |   public static Corpus<Token> createStreamCorpus() {
32 |     return new StreamCorpus();
33 |   }
34 | 
35 |   public static StoredCorpus createStoredCorpus() {
36 |     return new CompactCorpus(Corpus.createStreamCorpus());
37 |   }
38 | 
39 |   public DocumentSource asSource() {
40 |     final Iterator<Document<A>> iterator = this.iterator();
41 | 
42 |     return new DocumentSource() {
43 |       public boolean hasNext() {
44 |         return iterator.hasNext();
45 |       }
46 | 
47 |       public Document<Token> next() {
48 |         return (Document<Token>) iterator.next();
49 |       }
50 |     };
51 |   }
52 | 
53 |   public Enum<BaseApp.CORPUS_FORMAT> getFormat() {
54 |       return corpusFormat;
55 |   }
56 | 
57 |   public void setFormat(Enum<BaseApp.CORPUS_FORMAT> corpusFormat) {
58 |       this.corpusFormat = corpusFormat;
59 |   }
60 | }
61 | 
62 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/DocumentSource.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text;
17 | 
18 | import java.util.Iterator;
19 | 
20 | import opennlp.fieldspring.tr.text.Document;
21 | import opennlp.fieldspring.tr.text.Sentence;
22 | import opennlp.fieldspring.tr.text.Token;
23 | 
24 | public abstract class DocumentSource implements Iterator<Document<Token>> {
25 |   public void close() {
26 |   }
27 | 
28 |   public void remove() {
29 |     throw new UnsupportedOperationException("Cannot remove a document from a source.");
30 |   }
31 | 
32 |   protected abstract class SentenceIterator implements Iterator<Sentence<Token>> {
33 |     public void remove() {
34 |       throw new UnsupportedOperationException("Cannot remove a sentence from a source.");
35 |     }    
36 |   }
37 | }
38 | 
39 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/DocumentSourceWrapper.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text;
17 | 
18 | import java.util.Iterator;
19 | 
20 | import opennlp.fieldspring.tr.text.Document;
21 | import opennlp.fieldspring.tr.text.Sentence;
22 | import opennlp.fieldspring.tr.text.Token;
23 | 
24 | /**
25 |  * Wraps a document source in order to perform some operation on it.
26 |  *
27 |  * @author Travis Brown <travis.brown@mail.utexas.edu>
28 |  * @version 0.1.0
29 |  */
30 | public abstract class DocumentSourceWrapper extends DocumentSource {
31 |   private final DocumentSource source;
32 | 
33 |   public DocumentSourceWrapper(DocumentSource source) {
34 |     this.source = source;
35 |   }
36 | 
37 |   /**
38 |    * Closes the underlying source.
39 |    */
40 |   public void close() {
41 |     this.source.close();
42 |   }
43 | 
44 |   /**
45 |    * Indicates whether the underlying source has more documents.
46 |    */
47 |   public boolean hasNext() {
48 |     return this.source.hasNext();
49 |   }
50 | 
51 |   /**
52 |    * Returns the underlying source (for use in subclasses).
53 |    */
54 |   protected DocumentSource getSource() {
55 |     return this.source;
56 |   }
57 | }
58 | 
59 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/GeoTextDocument.java:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.text;
 2 | 
 3 | import java.io.*;
 4 | import java.util.*;
 5 | 
 6 | import opennlp.fieldspring.tr.topo.*;
 7 | 
 8 | public class GeoTextDocument extends Document<Token> {
 9 | 
10 |     private static final long serialVersionUID = 42L;
11 | 
12 |     private List<Sentence<Token>> sentences;
13 | 
14 |     public GeoTextDocument(String id, String timestamp, double goldLat, double goldLon) {
15 | 	super(id);
16 | 	this.timestamp = timestamp;
17 | 	this.goldCoord = Coordinate.fromDegrees(goldLat, goldLon);
18 |         this.sentences = new ArrayList<Sentence<Token>>();
19 |         this.systemCoord = null;
20 |         this.timestamp = null;
21 |     }
22 | 
23 |     public GeoTextDocument(String id, String timestamp, double goldLat, double goldLon, Enum<Document.SECTION> section) {
24 |         this(id, timestamp, goldLat, goldLon);
25 |         this.section = section;
26 |     }
27 | 
28 |     public GeoTextDocument(String id, String timestamp, double goldLat, double goldLon, long fold) {
29 |         this(id, timestamp, goldLat, goldLon);
30 |         if(fold >= 1 && fold <= 3)
31 |             this.section = Document.SECTION.TRAIN;
32 |         else if(fold == 4)
33 |             this.section = Document.SECTION.DEV;
34 |         else if(fold == 5)
35 |             this.section = Document.SECTION.TEST;
36 |         else
37 |             this.section = Document.SECTION.ANY;
38 |     }
39 | 
40 |     public void addSentence(Sentence<Token> sentence) {
41 |         sentences.add(sentence);
42 |     }
43 | 
44 |     public Iterator<Sentence<Token>> iterator() {
45 |         return sentences.iterator();
46 |     }
47 | }
48 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/SimpleSentence.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text;
17 | 
18 | import java.util.ArrayList;
19 | import java.util.Iterator;
20 | import java.util.List;
21 | import java.util.NoSuchElementException;
22 | 
23 | import opennlp.fieldspring.tr.util.Span;
24 | import java.io.*;
25 | 
26 | public class SimpleSentence<A extends Token> extends Sentence<A> implements Serializable {
27 | 
28 |   private static final long serialVersionUID = 42L;
29 | 
30 |   private final List<A> tokens;
31 |   private final List<Span<A>> toponymSpans;
32 | 
33 |   public SimpleSentence(String id, List<A> tokens) {
34 |     this(id, tokens, new ArrayList<Span<A>>());
35 |   }
36 | 
37 |   public SimpleSentence(String id, List<A> tokens, List<Span<A>> toponymSpans) {
38 |     super(id);
39 |     this.tokens = tokens;
40 |     this.toponymSpans = toponymSpans;
41 |   }
42 | 
43 |   public Iterator<A> tokens() {
44 |     return this.tokens.iterator();
45 |   }
46 | 
47 |   public Iterator<Span<A>> toponymSpans() {
48 |     return this.toponymSpans.iterator();
49 |   }
50 | }
51 | 
52 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/SimpleToken.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text;
17 | 
18 | import java.io.*;
19 | 
20 | public class SimpleToken implements Token, Serializable {
21 | 
22 |   private static final long serialVersionUID = 42L;
23 | 
24 |   private final String form;
25 | 
26 |   public SimpleToken(String form) {
27 |     this.form = form;
28 |   }
29 | 
30 |   public String getForm() {
31 |     return this.form.toLowerCase();
32 |   }
33 | 
34 |   public String getOrigForm() {
35 |     return this.form;
36 |   }
37 | 
38 |   public boolean isToponym() {
39 |     return false;
40 |   }
41 | }
42 | 
43 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/StoredCorpus.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text;
17 | 
18 | import opennlp.fieldspring.tr.topo.Location;
19 | import opennlp.fieldspring.tr.util.CountingLexicon;
20 | import opennlp.fieldspring.tr.util.SimpleCountingLexicon;
21 | import opennlp.fieldspring.tr.util.Span;
22 | import java.io.*;
23 | 
24 | public abstract class StoredCorpus extends Corpus<StoredToken> implements Serializable {
25 |   public abstract int getDocumentCount();
26 |   public abstract int getTokenTypeCount();
27 |   public abstract int getTokenOrigTypeCount();
28 |   public abstract int getToponymTypeCount();
29 |   public abstract int getToponymOrigTypeCount();
30 |   public abstract int getMaxToponymAmbiguity();
31 |   public abstract double getAvgToponymAmbiguity();
32 |   public abstract int getTokenCount();
33 |   public abstract int getToponymTokenCount();
34 |   public abstract void load();
35 | }
36 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/StoredToken.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text;
17 | import java.io.*;
18 | 
19 | public interface StoredToken extends Token, Serializable {
20 |   public int getIdx();
21 |   public int getOrigIdx();
22 |   public int getTypeCount();
23 |   public int getOrigTypeCount();
24 | }
25 | 
26 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/StoredToponym.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text;
17 | import java.io.*;
18 | 
19 | public interface StoredToponym extends StoredToken, Toponym, Serializable {
20 | }
21 | 
22 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/StreamCorpus.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text;
17 | 
18 | import java.io.IOException;
19 | import java.util.ArrayList;
20 | import java.util.Iterator;
21 | import java.util.List;
22 | 
23 | import com.google.common.collect.Iterators;
24 | 
25 | public class StreamCorpus extends Corpus<Token> {
26 | 
27 |   private static final long serialVersionUID = 42L;
28 | 
29 |   private final List<DocumentSource> sources;
30 |   private boolean read;
31 | 
32 |   StreamCorpus() {
33 |     this.sources = new ArrayList<DocumentSource>();
34 |     this.read = false;
35 |   }
36 | 
37 |   public Iterator<Document<Token>> iterator() {
38 |     if (this.read) {
39 |       throw new UnsupportedOperationException("Cannot read a stream corpus more than once.");
40 |     } else {
41 |       this.read = true;
42 |       return Iterators.concat(this.sources.iterator());
43 |     }
44 |   }
45 | 
46 |   public void addSource(DocumentSource source) {
47 |     this.sources.add(source);
48 |   }
49 | 
50 |   public void close() {
51 |     for (DocumentSource source : this.sources) {
52 |       source.close();
53 |     }
54 |   }
55 | }
56 | 
57 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/Token.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text;
17 | 
18 | import java.io.*;
19 | 
20 | public interface Token extends Serializable {
21 |   public String getForm();
22 |   public String getOrigForm();
23 |   public boolean isToponym();
24 | }
25 | 
26 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/Toponym.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text;
17 | 
18 | import java.util.List;
19 | import java.io.*;
20 | 
21 | import opennlp.fieldspring.tr.topo.Location;
22 | 
23 | public interface Toponym extends Token, Iterable<Location>, Serializable {
24 |   public boolean hasGold();
25 |   public Location getGold();
26 |   public int getGoldIdx();
27 |   public void setGoldIdx(int idx);
28 | 
29 |   public boolean hasSelected();
30 |   public Location getSelected();
31 |   public int getSelectedIdx();
32 |   public void setSelectedIdx(int idx);
33 | 
34 |   public int getAmbiguity();
35 |   public List<Location> getCandidates();
36 |   public void setCandidates(List<Location> candidates);
37 | 
38 |   public List<Token> getTokens();
39 | }
40 | 
41 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/io/GeoTextCorpusKMLWriter.java:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.text.io;
 2 | 
 3 | import opennlp.fieldspring.tr.text.*;
 4 | import javax.xml.stream.*;
 5 | import opennlp.fieldspring.tr.util.*;
 6 | import opennlp.fieldspring.tr.topo.*;
 7 | 
 8 | public class GeoTextCorpusKMLWriter extends CorpusKMLWriter {
 9 |     public GeoTextCorpusKMLWriter(Corpus<? extends Token> corpus, boolean outputGoldLocations) {
10 |         super(corpus, outputGoldLocations);
11 |       }
12 | 
13 |     public GeoTextCorpusKMLWriter(Corpus<? extends Token> corpus) {
14 |         this(corpus, false);
15 |     }
16 | 
17 |       protected void writeDocument(XMLStreamWriter out, Document<? extends Token> document) throws XMLStreamException {
18 |           Coordinate coord = outputGoldLocations ? document.getGoldCoord() : document.getSystemCoord();
19 | 
20 |           KMLUtil.writePlacemark(out, document.getId(), coord, KMLUtil.RADIUS);
21 |           int sentIndex = 0;
22 |           for(Sentence<? extends Token> sent : document) {
23 |               StringBuffer curTweetSB = new StringBuffer();
24 |               for(Token token : sent) {
25 |                   if(isSanitary(token.getOrigForm()))
26 |                      curTweetSB.append(token.getOrigForm()).append(" ");
27 |               }
28 |               String curTweet = curTweetSB.toString().trim();
29 | 
30 |               KMLUtil.writeSpiralPoint(out, document.getId(),
31 |                                        sentIndex, curTweet,
32 |                                        coord.getNthSpiralPoint(sentIndex, KMLUtil.SPIRAL_RADIUS), KMLUtil.RADIUS);
33 |               sentIndex++;
34 |           }
35 |       }
36 | 
37 |     private String okChars = "!?:;,'\"|+=-_*^%$#@`~(){}[]\\/";
38 | 
39 |     private boolean isSanitary(String s) {
40 |         for(int i = 0; i < s.length(); i++) {
41 |             char curChar = s.charAt(i);
42 |             if(!Character.isLetterOrDigit(curChar) && !okChars.contains(curChar + "")) {
43 |                 return false;
44 |             }
45 |         }
46 |         return true;
47 |     }
48 | 
49 |       protected void write(XMLStreamWriter out) throws Exception {
50 | 
51 |           KMLUtil.writeHeader(out, "corpus");
52 |           
53 |           for(Document<? extends Token> doc : corpus) {
54 |               writeDocument(out, doc);
55 |           }
56 | 
57 |           KMLUtil.writeFooter(out);
58 | 
59 |           out.close();
60 |       }
61 | }
62 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/io/TextSource.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text.io;
17 | 
18 | import java.io.BufferedReader;
19 | import java.io.Closeable;
20 | import java.io.IOException;
21 | import java.util.Iterator;
22 | 
23 | import opennlp.fieldspring.tr.text.Document;
24 | import opennlp.fieldspring.tr.text.DocumentSource;
25 | import opennlp.fieldspring.tr.text.Token;
26 | 
27 | public abstract class TextSource extends DocumentSource {
28 |   protected final BufferedReader reader;
29 | 
30 |   public TextSource(BufferedReader reader) throws IOException {
31 |     this.reader = reader;
32 |   }
33 | 
34 |   protected String readLine() {
35 |     String line = null;
36 |     try {
37 |       line = this.reader.readLine();
38 |     } catch (IOException e) {
39 |       System.err.println("Error while reading document source.");
40 |     }
41 |     return line;
42 |   }
43 | 
44 |   public void close() {
45 |     try {
46 |       this.reader.close();
47 |     } catch (IOException e) {
48 |       System.err.println("Error while closing document source.");
49 |     }
50 |   }
51 | }
52 | 
53 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/prep/CandidateRepopulator.java:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.text.prep;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.Iterator;
 5 | import java.util.List;
 6 | 
 7 | import opennlp.fieldspring.tr.text.Corpus;
 8 | import opennlp.fieldspring.tr.text.Document;
 9 | import opennlp.fieldspring.tr.text.DocumentSource;
10 | import opennlp.fieldspring.tr.text.DocumentSourceWrapper;
11 | import opennlp.fieldspring.tr.text.Sentence;
12 | import opennlp.fieldspring.tr.text.SimpleSentence;
13 | import opennlp.fieldspring.tr.text.SimpleToponym;
14 | import opennlp.fieldspring.tr.text.Token;
15 | import opennlp.fieldspring.tr.text.Toponym;
16 | import opennlp.fieldspring.tr.topo.gaz.Gazetteer;
17 | import opennlp.fieldspring.tr.topo.Location;
18 | import opennlp.fieldspring.tr.util.Span;
19 | 
20 | 
21 | public class CandidateRepopulator extends DocumentSourceWrapper {
22 | 
23 |   private final Gazetteer gazetteer;
24 | 
25 |     public CandidateRepopulator(DocumentSource source, Gazetteer gazetteer) {
26 |     super(source);
27 |     this.gazetteer = gazetteer;
28 |   }
29 | 
30 |   public Document<Token> next() {
31 |     final Document<Token> document = this.getSource().next();
32 |     final Iterator<Sentence<Token>> sentences = document.iterator();
33 | 
34 |     return new Document<Token>(document.getId()) {
35 |       private static final long serialVersionUID = 42L;
36 |       public Iterator<Sentence<Token>> iterator() {
37 |         return new SentenceIterator() {
38 |           public boolean hasNext() {
39 |             return sentences.hasNext();
40 |           }
41 | 
42 |           public Sentence<Token> next() {
43 |             Sentence<Token> sentence = sentences.next();
44 |             for(Token token : sentence) {
45 |                 if(token.isToponym()) {
46 |                     Toponym toponym = (Toponym) token;
47 |                     List<Location> candidates = gazetteer.lookup(toponym.getForm());
48 |                     if(candidates == null) candidates = new ArrayList<Location>();
49 |                     toponym.setCandidates(candidates);
50 |                     toponym.setGoldIdx(-1);
51 |                 }
52 |             }
53 |             return sentence;
54 |             //return new SimpleSentence(sentence.getId(), sentence.getTokens());
55 |           }
56 |         };
57 |       }
58 |     };
59 |   }
60 | }
61 | 
62 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/prep/JythonNER.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text.prep;
17 | 
18 | import javax.script.ScriptEngine;
19 | import javax.script.ScriptException;
20 | import java.util.ArrayList;
21 | import java.util.List;
22 | 
23 | import opennlp.fieldspring.tr.util.Span;
24 | 
25 | public class JythonNER extends ScriptNER {
26 |   public JythonNER(String name, NamedEntityType type) {
27 |     super("python", name, type);
28 |   }
29 | 
30 |   public JythonNER(String name) {
31 |     this(name, NamedEntityType.LOCATION);
32 |   }
33 | 
34 |   public List<Span<NamedEntityType>> recognize(List<String> tokens) {
35 |     ScriptEngine engine = this.getEngine();
36 |     engine.put("tokens", tokens);
37 | 
38 |     try {
39 |       engine.eval("spans = recognize(tokens)");
40 |     } catch (ScriptException e) {
41 |       return null;
42 |     }
43 | 
44 |     List<List<Integer>> tuples = (List<List<Integer>>) engine.get("spans");
45 |     List<Span<NamedEntityType>> spans =
46 |       new ArrayList<Span<NamedEntityType>>(tuples.size());
47 | 
48 |     for (List<Integer> tuple : tuples) {
49 |       spans.add(new Span<NamedEntityType>(tuple.get(0), tuple.get(1), this.getType()));
50 |     }
51 | 
52 |     return spans;
53 |   }
54 | }
55 | 
56 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/prep/NamedEntityRecognizer.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text.prep;
17 | 
18 | import java.util.List;
19 | 
20 | import opennlp.fieldspring.tr.util.Span;
21 | 
22 | public interface NamedEntityRecognizer {
23 |   public List<Span<NamedEntityType>> recognize(List<String> tokens);
24 | }
25 | 
26 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/prep/NamedEntityType.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text.prep;
17 | 
18 | public enum NamedEntityType {
19 |   DATE,
20 |   LOCATION,
21 |   MONEY,
22 |   ORGANIZATION,
23 |   PERCENTAGE,
24 |   PERSON,
25 |   TIME;
26 | }
27 | 
28 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/prep/OpenNLPRecognizer.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text.prep;
17 | 
18 | import java.io.File;
19 | import java.io.FileInputStream;
20 | import java.io.IOException;
21 | import java.io.InputStream;
22 | import java.util.ArrayList;
23 | import java.util.Iterator;
24 | import java.util.List;
25 | 
26 | import opennlp.tools.namefind.NameFinderME;
27 | import opennlp.tools.namefind.TokenNameFinder;
28 | import opennlp.tools.namefind.TokenNameFinderModel;
29 | import opennlp.tools.util.InvalidFormatException;
30 | 
31 | import opennlp.fieldspring.tr.util.Constants;
32 | import opennlp.fieldspring.tr.util.Span;
33 | 
34 | public class OpenNLPRecognizer implements NamedEntityRecognizer {
35 |   protected final TokenNameFinder finder;
36 |   protected final NamedEntityType type;
37 | 
38 |   public OpenNLPRecognizer() throws IOException, InvalidFormatException {
39 |     this(new FileInputStream(
40 |       Constants.getOpenNLPModelsDir() + File.separator + "en-ner-location.bin"),
41 |       NamedEntityType.LOCATION);
42 |   }
43 | 
44 |   public OpenNLPRecognizer(InputStream in, NamedEntityType type)
45 |     throws IOException, InvalidFormatException {
46 |     this.finder = new NameFinderME(new TokenNameFinderModel(in));
47 |     this.type = type;
48 |   }
49 | 
50 |   public List<Span<NamedEntityType>> recognize(List<String> tokens) {
51 |     List<Span<NamedEntityType>> spans = new ArrayList<Span<NamedEntityType>>();
52 |     for (opennlp.tools.util.Span span : this.finder.find(tokens.toArray(new String[0]))) {
53 |       spans.add(new Span<NamedEntityType>(span.getStart(), span.getEnd(), this.type));
54 |     }
55 |     return spans;
56 |   }
57 | }
58 | 
59 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/prep/OpenNLPSentenceDivider.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text.prep;
17 | 
18 | import java.io.File;
19 | import java.io.FileInputStream;
20 | import java.io.IOException;
21 | import java.io.InputStream;
22 | import java.util.Arrays;
23 | import java.util.List;
24 | 
25 | import opennlp.tools.sentdetect.SentenceDetector;
26 | import opennlp.tools.sentdetect.SentenceDetectorME;
27 | import opennlp.tools.sentdetect.SentenceModel;
28 | import opennlp.tools.util.InvalidFormatException;
29 | 
30 | import opennlp.fieldspring.tr.util.Constants;
31 | 
32 | public class OpenNLPSentenceDivider implements SentenceDivider {
33 |   private final SentenceDetector detector;
34 | 
35 |   public OpenNLPSentenceDivider() throws IOException, InvalidFormatException {
36 |     this(new FileInputStream(Constants.getOpenNLPModelsDir() + File.separator + "en-sent.bin"));
37 |   }
38 | 
39 |   public OpenNLPSentenceDivider(InputStream in) throws IOException, InvalidFormatException {
40 |     this.detector = new SentenceDetectorME(new SentenceModel(in));
41 |   }
42 | 
43 |   public List<String> divide(String text) {
44 |     return Arrays.asList(this.detector.sentDetect(text));
45 |   }
46 | }
47 | 
48 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/prep/OpenNLPTokenizer.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text.prep;
17 | 
18 | import java.io.File;
19 | import java.io.FileInputStream;
20 | import java.io.IOException;
21 | import java.io.InputStream;
22 | import java.util.Arrays;
23 | import java.util.List;
24 | 
25 | import opennlp.tools.tokenize.TokenizerME;
26 | import opennlp.tools.tokenize.TokenizerModel;
27 | import opennlp.tools.util.InvalidFormatException;
28 | 
29 | import opennlp.fieldspring.tr.util.Constants;
30 | 
31 | public class OpenNLPTokenizer implements Tokenizer {
32 |   private final opennlp.tools.tokenize.Tokenizer tokenizer;
33 | 
34 |   public OpenNLPTokenizer() throws IOException, InvalidFormatException {
35 |     this(new FileInputStream(Constants.getOpenNLPModelsDir() + File.separator + "en-token.bin"));
36 |   }
37 | 
38 |   public OpenNLPTokenizer(InputStream in) throws IOException, InvalidFormatException {
39 |     this.tokenizer = new TokenizerME(new TokenizerModel(in));
40 |   }
41 | 
42 |   public List<String> tokenize(String text) {
43 |     return Arrays.asList(this.tokenizer.tokenize(text));
44 |   }
45 | }
46 | 
47 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/prep/ScriptNER.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text.prep;
17 | 
18 | import java.io.InputStream;
19 | import java.io.InputStreamReader;
20 | import java.io.IOException;
21 | import javax.script.ScriptEngine;
22 | import javax.script.ScriptEngineManager;
23 | import javax.script.ScriptException;
24 | 
25 | public abstract class ScriptNER implements NamedEntityRecognizer {
26 |   private final String language;
27 |   private final String name;
28 |   private final NamedEntityType type;
29 |   private final ScriptEngine engine;
30 | 
31 |   /** 
32 |     * Constructor for classes that use the JSR-223 scripting engine to perform
33 |     * named entity recognition.
34 |     *
35 |     * @param language The JSR-223 name of the scripting language
36 |     * @param name     The path to the resource containing the script
37 |     * @param type     The kind of named entity that is recognized
38 |     */
39 |   public ScriptNER(String language, String name, NamedEntityType type) {
40 |     this.language = language;
41 |     this.name = name;
42 |     this.type = type;
43 | 
44 |     ScriptEngineManager manager = new ScriptEngineManager();
45 |     this.engine = manager.getEngineByName(this.language);
46 | 
47 |     try {
48 |       InputStream stream = ScriptNER.class.getResourceAsStream(this.name);
49 |       InputStreamReader reader = new InputStreamReader(stream);
50 |       this.engine.eval(reader);
51 |       stream.close();
52 |     } catch (ScriptException e) {
53 |       System.err.println(e);
54 |       System.exit(1);
55 |     } catch (IOException e) {
56 |       System.err.println(e);
57 |       System.exit(1);
58 |     }
59 |   }
60 | 
61 |   public ScriptNER(String language, String name) {
62 |     this(language, name, NamedEntityType.LOCATION);
63 |   }
64 | 
65 |   protected ScriptEngine getEngine() {
66 |     return this.engine;
67 |   }
68 | 
69 |   protected NamedEntityType getType() {
70 |     return this.type;
71 |   }
72 | }
73 | 
74 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/prep/SentenceDivider.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text.prep;
17 | 
18 | import java.util.List;
19 | 
20 | public interface SentenceDivider {
21 |   public List<String> divide(String text);
22 | }
23 | 
24 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/prep/Tokenizer.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text.prep;
17 | 
18 | import java.util.List;
19 | 
20 | public interface Tokenizer {
21 |   public List<String> tokenize(String text);
22 | }
23 | 
24 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/text/prep/ToponymRemover.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text.prep;
17 | 
18 | import java.util.ArrayList;
19 | import java.util.Iterator;
20 | import java.util.List;
21 | 
22 | import opennlp.fieldspring.tr.text.Corpus;
23 | import opennlp.fieldspring.tr.text.Document;
24 | import opennlp.fieldspring.tr.text.DocumentSource;
25 | import opennlp.fieldspring.tr.text.DocumentSourceWrapper;
26 | import opennlp.fieldspring.tr.text.Sentence;
27 | import opennlp.fieldspring.tr.text.SimpleSentence;
28 | import opennlp.fieldspring.tr.text.SimpleToponym;
29 | import opennlp.fieldspring.tr.text.Token;
30 | import opennlp.fieldspring.tr.text.Toponym;
31 | import opennlp.fieldspring.tr.topo.gaz.Gazetteer;
32 | import opennlp.fieldspring.tr.util.Span;
33 | 
34 | /**
35 |  * Wraps a document source and removes any toponyms spans that it contains,
36 |  * returning only the tokens.
37 |  *
38 |  * @author Travis Brown <travis.brown@mail.utexas.edu>
39 |  * @version 0.1.0
40 |  */
41 | public class ToponymRemover extends DocumentSourceWrapper {
42 |   public ToponymRemover(DocumentSource source) {
43 |     super(source);
44 |   }
45 | 
46 |   public Document<Token> next() {
47 |     final Document<Token> document = this.getSource().next();
48 |     final Iterator<Sentence<Token>> sentences = document.iterator();
49 | 
50 |     return new Document<Token>(document.getId()) {
51 |       private static final long serialVersionUID = 42L;
52 |       public Iterator<Sentence<Token>> iterator() {
53 |         return new SentenceIterator() {
54 |           public boolean hasNext() {
55 |             return sentences.hasNext();
56 |           }
57 | 
58 |           public Sentence<Token> next() {
59 |             Sentence<Token> sentence = sentences.next();
60 |             return new SimpleSentence<Token>(sentence.getId(), sentence.getTokens());
61 |           }
62 |         };
63 |       }
64 |     };
65 |   }
66 | }
67 | 
68 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/topo/PointRegion.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.topo;
17 | 
18 | import java.util.ArrayList;
19 | import java.util.List;
20 | 
21 | public class PointRegion extends Region {
22 | 
23 |   private static final long serialVersionUID = 42L;
24 | 
25 |   private Coordinate coordinate;
26 | 
27 |   public PointRegion(Coordinate coordinate) {
28 |     this.coordinate = coordinate;
29 |   }
30 | 
31 |   public Coordinate getCenter() {
32 |     return this.coordinate;
33 |   }
34 | 
35 |     public void setCenter(Coordinate coord) {
36 |         this.coordinate = coord;
37 |     }
38 | 
39 |   public boolean contains(double lat, double lng) {
40 |     return lat == this.coordinate.getLat() && lng == this.coordinate.getLng();
41 |   }
42 | 
43 |   public double getMinLat() {
44 |     return this.coordinate.getLat();
45 |   }
46 | 
47 |   public double getMaxLat() {
48 |     return this.coordinate.getLat();
49 |   }
50 | 
51 |   public double getMinLng() {
52 |     return this.coordinate.getLng();
53 |   }
54 | 
55 |   public double getMaxLng() {
56 |     return this.coordinate.getLng();
57 |   }
58 | 
59 |   public List<Coordinate> getRepresentatives() {
60 |     List<Coordinate> representatives = new ArrayList<Coordinate>(1);
61 |     representatives.add(this.coordinate);
62 |     return representatives;
63 |   }
64 | 
65 |     public void setRepresentatives(List<Coordinate> coordinates) {
66 |         this.coordinate = coordinates.get(0);
67 |     }
68 | }
69 | 
70 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/topo/gaz/FilteredGeoNamesReader.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.topo.gaz;
17 | 
18 | import java.io.File;
19 | import java.io.FileInputStream;
20 | import java.io.FileNotFoundException;
21 | import java.io.InputStreamReader;
22 | import java.io.IOException;
23 | import java.io.BufferedReader;
24 | import java.util.zip.GZIPInputStream;
25 | 
26 | import opennlp.fieldspring.tr.topo.Location;
27 | 
28 | public class FilteredGeoNamesReader extends GeoNamesReader {
29 |   public FilteredGeoNamesReader(File file) throws FileNotFoundException, IOException {
30 |     this(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)))));
31 |   }
32 | 
33 |   public FilteredGeoNamesReader(BufferedReader reader)
34 |     throws FileNotFoundException, IOException {
35 |     super(reader);
36 |   }
37 | 
38 |   protected Location parseLine(String line, int currentId) {
39 |     Location location = super.parseLine(line, currentId);
40 |     if (location != null) {
41 |       Location.Type type = location.getType();
42 |       if (type != Location.Type.STATE && type != Location.Type.CITY) {
43 |         location = null;
44 |       }
45 |     }
46 |     return location;  
47 |   }
48 | }
49 | 
50 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/topo/gaz/Gazetteer.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.topo.gaz;
17 | 
18 | import java.util.List;
19 | 
20 | import opennlp.fieldspring.tr.topo.Location;
21 | 
22 | /**
23 |  * Represents a mapping from toponym strings to lists of location candidates.
24 |  *
25 |  * @author Travis Brown <travis.brown@mail.utexas.edu>
26 |  */
27 | public interface Gazetteer {
28 |   /**
29 |    * Lookup a toponym in the gazetteer, returning null if no candidate list is
30 |    * found.
31 |    */
32 |   public List<Location> lookup(String query);
33 | }
34 | 
35 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/topo/gaz/GazetteerFileReader.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.topo.gaz;
17 | 
18 | import java.io.BufferedReader;
19 | import java.io.FileInputStream;
20 | import java.io.FileNotFoundException;
21 | import java.io.InputStreamReader;
22 | import java.io.IOException;
23 | import java.util.Iterator;
24 | import java.util.zip.GZIPInputStream;
25 | 
26 | public abstract class GazetteerFileReader extends GazetteerReader {
27 |   private final BufferedReader reader;
28 | 
29 |   protected GazetteerFileReader(BufferedReader reader)
30 |     throws FileNotFoundException, IOException {
31 |     this.reader = reader;
32 |   }
33 | 
34 |   protected String readLine() {
35 |     String line = null;
36 |     try {
37 |       line = this.reader.readLine();
38 |     } catch (IOException e) {   
39 |       System.err.format("Error while reading gazetteer file: %s\n", e);
40 |       e.printStackTrace();
41 |     }
42 |     return line;
43 |   }
44 | 
45 |   public void close() {
46 |     try {
47 |       this.reader.close();
48 |     } catch (IOException e) {      
49 |       System.err.format("Error closing gazetteer file: %s\n", e);
50 |       e.printStackTrace();
51 |       System.exit(1); 
52 |     }
53 |   }
54 | }
55 | 
56 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/topo/gaz/GazetteerLineReader.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.topo.gaz;
17 | 
18 | import java.io.BufferedReader;
19 | import java.io.FileNotFoundException;
20 | import java.io.IOException;
21 | import java.util.Iterator;
22 | 
23 | import opennlp.fieldspring.tr.topo.Location;
24 | 
25 | public abstract class GazetteerLineReader extends GazetteerFileReader {
26 |   private Location current;
27 |   private int currentId;
28 | 
29 |   protected GazetteerLineReader(BufferedReader reader)
30 |     throws FileNotFoundException, IOException {
31 |     super(reader);
32 |     this.current = this.nextLocation();
33 |     this.currentId = 1;
34 |   }
35 | 
36 |   protected abstract Location parseLine(String line, int currentId);
37 | 
38 |   private Location nextLocation() {
39 |     Location location = null;
40 |     for (String line = this.readLine(); line != null; line = this.readLine()) {
41 |       location = this.parseLine(line, this.currentId);
42 |       if (location != null) break;
43 |     }
44 |     this.currentId++;
45 |     //if (this.currentId % 50000 == 0) { System.out.format("At location id: %d.\n", this.currentId); }
46 |     return location;
47 |   }
48 | 
49 |   public boolean hasNext() {
50 |     return this.current != null;
51 |   }
52 | 
53 |   public Location next() {
54 |     Location location = this.current;
55 |     this.current = this.nextLocation();
56 |     return location;
57 |   }
58 | }
59 | 
60 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/topo/gaz/GazetteerReader.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.topo.gaz;
17 | 
18 | import java.util.Iterator;
19 | import opennlp.fieldspring.tr.topo.Location;
20 | 
21 | public abstract class GazetteerReader implements Iterable<Location>,
22 |                                                  Iterator<Location> {
23 |   public abstract void close();
24 | 
25 |   protected Location.Type getLocationType(String code) {
26 |     return Location.Type.UNKNOWN;
27 |   }
28 | 
29 |   protected Location.Type getLocationType(String code, String fine) {
30 |     return this.getLocationType(code);
31 |   }
32 | 
33 |   public Iterator<Location> iterator() {
34 |     return this;
35 |   }
36 | 
37 |   public void remove() {
38 |     throw new UnsupportedOperationException("Cannot remove location from gazetteer.");
39 |   }
40 | }
41 | 
42 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/topo/gaz/InMemoryGazetteer.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.topo.gaz;
17 | 
18 | import java.util.ArrayList;
19 | import java.util.HashMap;
20 | import java.util.List;
21 | import java.util.Map;
22 | 
23 | import opennlp.fieldspring.tr.topo.Location;
24 | 
25 | public class InMemoryGazetteer extends LoadableGazetteer {
26 |   private final Map<String, List<Location>> map;
27 | 
28 |   public InMemoryGazetteer() {
29 |     this.map = new HashMap<String, List<Location>>();
30 |   }
31 | 
32 |   public void add(String name, Location location) {
33 |     name = name.toLowerCase();
34 |     List<Location> locations = this.map.get(name);
35 |     if (locations == null) {
36 |       locations = new ArrayList<Location>();
37 |     }
38 |     locations.add(location);
39 |     this.map.put(name, locations);
40 |   }
41 | 
42 |   public List<Location> lookup(String query) {
43 |     return this.map.get(query.toLowerCase());
44 |   }
45 | }
46 | 
47 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/topo/gaz/LoadableGazetteer.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.topo.gaz;
17 | 
18 | import java.util.List;
19 | import opennlp.fieldspring.tr.topo.Location;
20 | 
21 | public abstract class LoadableGazetteer implements Gazetteer {
22 |   public abstract void add(String name, Location location);
23 | 
24 |   public int load(GazetteerReader reader) {
25 |     int count = 0;
26 |     for (Location location : reader) {
27 |       count++;
28 |       this.add(location.getName(), location);
29 |     }
30 |     reader.close();
31 |     this.finishLoading();
32 |     return count;
33 |   }
34 | 
35 |   public void finishLoading() {}
36 |   public void close() {}
37 | }
38 | 
39 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/topo/gaz/MultiGazetteer.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.topo.gaz;
17 | 
18 | import java.util.ArrayList;
19 | import java.util.List;
20 | import opennlp.fieldspring.tr.topo.Location;
21 | 
22 | public class MultiGazetteer implements Gazetteer {
23 |   private final List<Gazetteer> gazetteers;
24 | 
25 |   public MultiGazetteer(List<Gazetteer> gazetteers) {
26 |     this.gazetteers = gazetteers;
27 |   }
28 | 
29 |   public List<Location> lookup(String query) {
30 |     for (Gazetteer gazetteer : this.gazetteers) {
31 |       List<Location> candidates = gazetteer.lookup(query);
32 |       if (candidates != null) {
33 |         return candidates;
34 |       }
35 |     }
36 |     return null;
37 |   }
38 | }
39 | 
40 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/util/CountingLexicon.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.util;
17 | 
18 | import java.io.Serializable;
19 | import java.util.List;
20 | 
21 | public interface CountingLexicon<A extends Serializable> extends Lexicon<A>, Serializable {
22 |   public int count(A entry);
23 |   public int countAtIndex(int index);
24 | }
25 | 
26 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/util/DoubleStringPair.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Taesun Moon, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.util;
17 | 
18 | /**
19 |  * A pair of an double and String
20 |  *
21 |  * @author  Taesun Moon
22 |  */
23 | public final class DoubleStringPair implements Comparable<DoubleStringPair> {
24 |     /**
25 |      *
26 |      */
27 |     public double doubleValue;
28 |     /**
29 |      * 
30 |      */
31 |     public String stringValue;
32 | 
33 |     /**
34 |      * 
35 |      *
36 |      * @param d
37 |      * @param s
38 |      */
39 |     public DoubleStringPair (double d, String s) {
40 | 	doubleValue = d;
41 | 	stringValue = s;
42 |     }
43 | 
44 |     /**
45 |      * sorting order is reversed -- higher (int) values come first
46 |      * 
47 |      * @param p
48 |      * @return
49 |      */
50 |     public int compareTo (DoubleStringPair p) {
51 | 	if (doubleValue < p.doubleValue)
52 | 	    return 1;
53 | 	else if (doubleValue > p.doubleValue)
54 | 	    return -1;
55 | 	else 
56 | 	    return 0;
57 |     }
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/util/Lexicon.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.util;
17 | 
18 | import java.io.Serializable;
19 | import java.util.List;
20 | 
21 | public interface Lexicon<A extends Serializable>
22 |   extends Serializable, Iterable<A> {
23 |   public boolean contains(A entry);
24 |   public int get(A entry);
25 |   public int getOrAdd(A entry);
26 |   public A atIndex(int index);
27 |   public int size();
28 |   public boolean isGrowing();
29 |   public void stopGrowing();
30 |   public void startGrowing();
31 |   public List<Integer> concatenate(Lexicon<A> other);
32 | }
33 | 
34 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/util/MemoryUtil.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This class contains tools for checking memory usage during runtime.
 3 |  */
 4 | 
 5 | package opennlp.fieldspring.tr.util;
 6 | 
 7 | public class MemoryUtil {
 8 |   public static long getMemoryUsage(){
 9 |     takeOutGarbage();
10 |     long totalMemory = Runtime.getRuntime().totalMemory();
11 | 
12 |     takeOutGarbage();
13 |     long freeMemory = Runtime.getRuntime().freeMemory();
14 | 
15 |     return (totalMemory - freeMemory);
16 |   }
17 | 
18 |   private static void takeOutGarbage() {
19 |     collectGarbage();
20 |     collectGarbage();
21 |   }
22 | 
23 |   private static void collectGarbage() {
24 |     try {
25 |       System.gc();
26 |       Thread.currentThread().sleep(100);
27 |       System.runFinalization();
28 |       Thread.currentThread().sleep(100);
29 |     }
30 |     catch (Exception ex){
31 |       ex.printStackTrace();
32 |     }
33 |   }
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/util/Span.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.util;
17 | 
18 | import java.util.ArrayList;
19 | import java.util.List;
20 | import java.io.*;
21 | 
22 | public class Span<A> implements Serializable {
23 | 
24 |   private static final long serialVersionUID = 42L;
25 | 
26 |   private final int start;
27 |   private final int end;
28 |   private final A item;
29 | 
30 |   public Span(int start, int end, A item) {
31 |     this.start = start;
32 |     this.end = end;
33 |     this.item = item;
34 |   }
35 | 
36 |   public int getStart() {
37 |     return this.start;
38 |   }
39 | 
40 |   public int getEnd() {
41 |     return this.end;
42 |   }
43 | 
44 |   public A getItem() {
45 |     return this.item;
46 |   }
47 | }
48 | 
49 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/util/StringDoublePair.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Taesun Moon, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | 
17 | package opennlp.fieldspring.tr.util;
18 | 
19 | /**
20 |  *
21 |  * @author tsmoon
22 |  */
23 | public class StringDoublePair implements Comparable<StringDoublePair> {
24 | 
25 |     public String stringValue;
26 |     public double doubleValue;
27 | 
28 |     public StringDoublePair(String s, double d) {
29 |         stringValue = s;
30 |         doubleValue = d;
31 |     }
32 | 
33 |     public int compareTo(StringDoublePair p) {
34 |         return stringValue.compareTo(p.stringValue);
35 |     }
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/util/StringEditMapper.java:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.util;
17 | 
18 | import java.util.ArrayList;
19 | import java.util.Collections;
20 | import java.util.Iterator;
21 | import java.util.List;
22 | 
23 | import opennlp.fieldspring.tr.util.Span;
24 | 
25 | public class StringEditMapper extends EditMapper<String> {
26 |   public StringEditMapper(List<String> s, List<String> t) {
27 |     super(s, t);
28 |   }
29 | 
30 |   @Override
31 |   protected int delCost(String x) {
32 |     return x.length();
33 |   }
34 | 
35 |   @Override
36 |   protected int insCost(String x) {
37 |     return x.length();
38 |   }
39 | 
40 |   @Override
41 |   protected int subCost(String x, String y) {
42 |     int[][] ds = new int[x.length() + 1][y.length() + 1];
43 |     for (int i = 0; i <= x.length(); i++) { ds[i][0] = i; }
44 |     for (int j = 0; j <= y.length(); j++) { ds[0][j] = j; }
45 | 
46 |     for (int i = 1; i <= x.length(); i++) {
47 |       for (int j = 1; j <= x.length(); j++) {
48 |         int del = ds[i - 1][j] + 1;
49 |         int ins = ds[1][j - 1] + 1;
50 |         int sub = ds[i - 1][j - 1] + (x.charAt(i - 1) == y.charAt(j - 1) ? 0 : 1);
51 |         ds[i][j] = StringEditMapper.minimum(del, ins, sub);
52 |       }
53 |     }
54 | 
55 |     return ds[x.length()][y.length()];
56 |   }
57 | 
58 |   private static int minimum(int a, int b, int c) {
59 |     return Math.min(Math.min(a, b), c);
60 |   }
61 | }
62 | 
63 | 


--------------------------------------------------------------------------------
/src/main/java/opennlp/fieldspring/tr/util/ToponymFinder.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * 
 3 |  */
 4 | package opennlp.fieldspring.tr.util;
 5 | 
 6 | import java.io.BufferedReader;
 7 | import java.io.FileInputStream;
 8 | import java.io.FileReader;
 9 | import java.io.IOException;
10 | import java.io.InputStreamReader;
11 | import java.util.ArrayList;
12 | import java.util.HashSet;
13 | import java.util.List;
14 | import java.util.zip.ZipEntry;
15 | import java.util.zip.ZipFile;
16 | import java.util.zip.ZipInputStream;
17 | 
18 | import opennlp.fieldspring.tr.text.prep.HighRecallToponymRecognizer;
19 | import opennlp.fieldspring.tr.text.prep.NamedEntityRecognizer;
20 | import opennlp.fieldspring.tr.text.prep.NamedEntityType;
21 | import opennlp.fieldspring.tr.text.prep.OpenNLPRecognizer;
22 | import opennlp.fieldspring.tr.text.prep.OpenNLPSentenceDivider;
23 | import opennlp.fieldspring.tr.text.prep.OpenNLPTokenizer;
24 | import opennlp.fieldspring.tr.text.prep.SentenceDivider;
25 | import opennlp.fieldspring.tr.text.prep.Tokenizer;
26 | import opennlp.fieldspring.tr.topo.gaz.GeoNamesGazetteer;
27 | import opennlp.fieldspring.tr.util.Span;
28 | import opennlp.tools.util.InvalidFormatException;
29 | 
30 | /**
31 |  * @author abhimanu kumar
32 |  *
33 |  */
34 | public class ToponymFinder {
35 | 
36 | 	/**
37 | 	 * @param args
38 | 	 */
39 | 	private final SentenceDivider sentDivider;
40 | 	private final Tokenizer tokenizer;
41 | 	private final NamedEntityRecognizer recognizer;
42 | 	private BufferedReader input;
43 | 	
44 | 	public ToponymFinder(BufferedReader reader, String gazPath) throws Exception{
45 | 		sentDivider = new OpenNLPSentenceDivider();
46 | 		tokenizer = new OpenNLPTokenizer();
47 | 		recognizer = new HighRecallToponymRecognizer(gazPath);
48 | 		this.input = reader;
49 | 	}
50 | 
51 | 
52 | 	public static void main(String[] args) throws Exception {
53 | 		ToponymFinder finder = new ToponymFinder(new BufferedReader(new FileReader(args[0]/*"TheStoryTemp.txt"*/)),args[1]/*"data/gazetteers/US.ser.gz"*/);
54 | //		long startTime = System.currentTimeMillis();
55 | 		finder.find();
56 | //		long stopTime = System.currentTimeMillis();
57 | //		System.out.println((stopTime-startTime)/1000 + "secs");
58 | 	}
59 | 
60 | 
61 | 	private HashSet<String> find() throws IOException {
62 | 		String line; 
63 | 		HashSet<String> resultSet = new HashSet<String>();
64 | 		while((line=input.readLine())!=null){
65 | 			List<String> sentencesString = sentDivider.divide(line);
66 | 			for (String sentence : sentencesString){
67 | 				List<String> tokens = new ArrayList<String>();
68 | 				for(String token : tokenizer.tokenize(sentence)){
69 | 					tokens.add(token);
70 | 				}
71 | 				List<Span<NamedEntityType>> spans =recognizer.recognize(tokens);
72 | 				for(Span<NamedEntityType> span:spans){
73 | 					StringBuilder resultToken= new StringBuilder();
74 | 					for(int i=span.getStart();i<span.getEnd();i++){
75 | 						resultToken = resultToken.append(" ").append(tokens.get(i));
76 | 					}
77 | 					resultSet.add(resultToken.toString());
78 | 				}
79 | 				
80 | 			}
81 | 		}
82 | //		System.out.println(resultSet);
83 | 		return resultSet;
84 | 	}
85 | 
86 | }
87 | 


--------------------------------------------------------------------------------
/src/main/python/convert-infochimps.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | 
 5 | Steps for converting Infochimps to our format:
 6 | 
 7 | 1) Input is a series of files, e.g. part-00000.gz, each about 180 MB.
 8 | 2) Each line looks like this:
 9 | 
10 | 
11 | 100000018081132545      20110807002716  25430513        GTheHardWay                                     Niggas Lost in the Sauce ..smh better slow yo roll and tell them hoes to get a job nigga #MRIloveRatsIcanchange&amp;amp;saveherassNIGGA &lt;a href=&quot;http://twitter.com/download/android&quot; rel=&quot;nofollow&quot;&gt;Twitter for Android&lt;/a&gt;    en      42.330165       -83.045913                                      
12 | The fields are:
13 | 
14 | 1) Tweet ID
15 | 2) Time
16 | 3) User ID
17 | 4) User name
18 | 5) Empty?
19 | 6) User name being replied to (FIXME: which JSON field is this?)
20 | 7) User ID for replied-to user name (but sometimes different ID's for same user name)
21 | 8) Empty?
22 | 9) Tweet text -- double HTML-encoded (e.g. & becomes &amp;amp;)
23 | 10) HTML anchor text indicating a link of some sort, HTML-encoded (FIXME: which JSON field is this?)
24 | 11) Language, as a two-letter code
25 | 12) Latitude
26 | 13) Longitude
27 | 14) Empty?
28 | 15) Empty?
29 | 16) Empty?
30 | 17) Empty?
31 | 
32 | 
33 | 3) We want to convert each to two files: (1) containing the article-data 
34 | 
35 | """
36 | 
37 | 
38 | 


--------------------------------------------------------------------------------
/src/main/python/convert_to_new_article_format.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | #######
 4 | ####### convert_to_new_article_format.py
 5 | #######
 6 | ####### Copyright (c) 2011 Ben Wing.
 7 | #######
 8 | 
 9 | # A one-off program to convert article-data files to the new order, which
10 | # puts the most important article fields (id, name, split, coords) before
11 | # other fields that may be specific to the type of article (e.g. Wikipedia
12 | # article, Twitter feed, tweet, etc.).
13 | 
14 | import sys
15 | from nlputil import *
16 | from process_article_data import *
17 | 
18 | def output_combined_article_data(filename):
19 |   arts_seen = []
20 |   def note_article(art):
21 |     arts_seen.append(art)
22 |   # Note that the article data file indicates the field names at the
23 |   # beginning.
24 |   read_article_data_file(filename, note_article)
25 |   errprint("Writing combined data to stdout ...")
26 |   write_article_data_file(sys.stdout,
27 |     outfields = combined_article_data_outfields,
28 |     articles = arts_seen)
29 |   errprint("Done.")
30 | 
31 | for filename in sys.argv[1:]:
32 |   output_combined_article_data(filename)
33 | 


--------------------------------------------------------------------------------
/src/main/python/fix_redirects.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | #######
 4 | ####### fix_redirects.py
 5 | #######
 6 | ####### Copyright (c) 2010 Ben Wing.
 7 | #######
 8 | 
 9 | # This is a one-off program to fix the redirect fields so that they
10 | # always begin with a capital letter, as is required of article titles in
11 | # Wikipedia.
12 | 
13 | import sys
14 | from nlputil import *
15 | from process_article_data import *
16 | 
17 | def fix_redirects(filename):
18 |   articles_seen = []
19 |   def process(art):
20 |     if art.redir:
21 |       art.redir = capfirst(art.redir)
22 |     articles_seen.append(art)
23 |   errprint("Reading from %s..." % filename)
24 |   fields = read_article_data_file(filename, process,
25 |                                   maxtime=Opts.max_time_per_stage)
26 |   errprint("Writing to stdout...")
27 |   write_article_data_file(sys.stdout, outfields=fields,
28 |                           articles=articles_seen)
29 |   errprint("Done.")
30 | 
31 | ############################################################################
32 | #                                  Main code                               #
33 | ############################################################################
34 | 
35 | class FixRedirectsProgram(NLPProgram):
36 |   def argument_usage(self):
37 |     return "article-data-file"
38 | 
39 |   def handle_arguments(self, opts, op, args):
40 |     global Opts
41 |     Opts = opts
42 |     if len(args) != 1:
43 |       op.error("Must specify exactly one article-data file as an argument")
44 | 
45 |   def implement_main(self, opts, params, args):
46 |     fix_redirects(args[0])
47 |     
48 | FixRedirectsProgram()
49 | 


--------------------------------------------------------------------------------
/src/main/python/format-thresh-grid.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from nlputil import *
 4 | import fileinput
 5 | import re
 6 | 
 7 | switch_thresh_and_grid = True
 8 | 
 9 | errdists = dictdict()
10 | 
11 | for line in fileinput.input():
12 |   line = line.strip()
13 |   m = re.match(r'.*thresh: ([0-9.]*), grid: ([0-9.]*),.*true error distance.*\(([0-9.]*) km.*', line)
14 |   if not m:
15 |     errprint("Can't parse line: %s", line)
16 |   else:
17 |     thresh = float(m.group(1))
18 |     grid = float(m.group(2))
19 |     dist = float(m.group(3))
20 |     if switch_thresh_and_grid:
21 |       errdists[grid][thresh] = dist
22 |     else:
23 |       errdists[thresh][grid] = dist
24 | 
25 | first = True
26 | for (thresh, dic) in key_sorted_items(errdists):
27 |   if first:
28 |     first = False
29 |     errprint(r"   & %s \\" % (
30 |         ' & '.join(["%g" % grid for grid in sorted(dic.keys())])))
31 |     errprint(r"\hline")
32 |   errprint(r"%g & %s \\" % (thresh,
33 |       ' & '.join(["%g" % dist for (grid, dist) in key_sorted_items(dic)])))
34 | 
35 | 


--------------------------------------------------------------------------------
/src/main/python/generate-numbers.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from nlputil import *
 4 | import fileinput
 5 | import re
 6 | 
 7 | def median(values):
 8 |   values = sorted(values)
 9 |   vallen = len(values)
10 | 
11 |   if vallen % 2 == 1:
12 |     return values[(vallen+1)/2-1]
13 |   else:
14 |     lower = values[vallen/2-1]
15 |     upper = values[vallen/2]
16 | 
17 |     return (float(lower + upper)) / 2  
18 | 
19 | def mean(values):
20 |   return float(sum(values))/len(values)
21 | 
22 | def tokm(val):
23 |   return 1.609*val
24 | 
25 | vals = []
26 | for line in fileinput.input():
27 |   line = line.strip()
28 |   m = re.match(r'.*Distance (.*?) miles to predicted region center', line)
29 |   if not m:
30 |     errprint("Can't parse line: %s", line)
31 |   else:
32 |     vals += [float(m.group(1))]
33 | 
34 | med = median(vals)
35 | mn = mean(vals)
36 | uniprint("Median: %g miles (%g km)" % (med, tokm(med)))
37 | uniprint("Mean: %g miles (%g km)" % (mn, tokm(mn)))
38 | 


--------------------------------------------------------------------------------
/src/main/python/ner/DummyNER.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Any Python NER component should provide a function named "recognize" that
 4 | # takes a list of string tokens and returns a list of tuples indicating the
 5 | # beginning and ending of named entity ranges in the token list. This simple
 6 | # example considers all capitalized words to be single-token named entities.
 7 | def recognize(tokens):
 8 |   spans = []
 9 |   for i in range(0, len(tokens)):
10 |     if tokens[i].isalnum() and tokens[i] == tokens[i].capitalize():
11 |       spans.append((i, i + 1))
12 |   return spans
13 | 
14 | #text = ["They", "had", "sailed", "from", "Deptford",
15 | #        ",", "from", "Greenwich", ",", "from", "Erith", "..."]
16 | 
17 | #print(recognize(text))
18 | 
19 | 


--------------------------------------------------------------------------------
/src/main/python/ner/stanford2places.py:
--------------------------------------------------------------------------------
 1 | import sys, re
 2 | 
 3 | locationString = re.compile(r'(\w*/LOCATION(\s*\w*/LOCATION)*)')
 4 | 
 5 | def processFile(filename):
 6 |     inFile = open(filename,'r')
 7 |     curLine = inFile.readline()
 8 |     while(curLine != ""):
 9 |         for ls in locationString.findall(curLine):
10 |             lineToPrint = ls[0].replace("/LOCATION", "").strip()
11 |             if(len(lineToPrint) > 0):
12 |                 print lineToPrint
13 |         curLine = inFile.readline()
14 | 
15 | def processDirectory(dirname):
16 |   fileList = os.listdir(dirname)
17 |   if(not dirname[-1] == "/"):
18 |     dirname += "/"
19 |   for filename in fileList:
20 |     if(os.path.isdir(dirname + filename)):
21 |       processDirectory(dirname + filename)
22 |     elif(os.path.isfile(dirname + filename)):
23 |       processFile(dirname + filename)
24 | 
25 | for filename in sys.argv[1:]:
26 |     processFile(filename)
27 | 


--------------------------------------------------------------------------------
/src/main/python/splitdevtest.py:
--------------------------------------------------------------------------------
 1 | import sys, shutil, os
 2 | 
 3 | def processDirectory(dirname):
 4 |   fileList = os.listdir(dirname)
 5 |   if(not dirname[-1] == "/"):
 6 |     dirname += "/"
 7 |   count = 0
 8 |   for filename in fileList:
 9 |     if(count % 3 == 2):
10 |       shutil.copy(dirname + filename, sys.argv[3])
11 |       print (dirname + filename) + " --> " + sys.argv[3]
12 |     else:
13 |       shutil.copy(dirname + filename, sys.argv[2])
14 |       print (dirname + filename) + " --> " + sys.argv[2]
15 |     count += 1
16 | 
17 | processDirectory(sys.argv[1])
18 | 


--------------------------------------------------------------------------------
/src/main/python/tei2txt.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | import sys
 4 | import os
 5 | import re
 6 | import gzip
 7 | import fnmatch
 8 | 
 9 | from codecs import latin_1_decode
10 | from unicodedata import normalize
11 | from tei_entities import pcl_tei_entities
12 | 
13 | commaRE = re.compile(",")
14 | nonAlpha = re.compile("[^A-Za-z]")
15 | 
16 | pte = pcl_tei_entities()
17 | 
18 | def cleanWord(word):
19 |     word = word.lower()
20 |     if len(word) < 2:
21 |         word = ""
22 |     return word
23 | 
24 | def strip_text (text):
25 |     text = latin_1_decode(text)[0]
26 |     text = normalize('NFD',text).encode('ascii','ignore')
27 | 
28 |     text = re.sub('&mdash+;', ' ', text)   # convert mdash to " "
29 | #    text = re.sub('&amp;', ' and ', text)   # convert mdash to " "
30 |     text = pte.replace_entities(text)
31 | #    text = re.sub('&[A-Za-z]+;', '', text)   # convert ampersand stuff to ""
32 |     text = re.sub('<[^>]*>', ' ', text)   # strip HTML markup
33 |     text = re.sub('\s+', ' ', text)      # strip whitespace
34 | 
35 |     return text
36 | 
37 | directory_name = sys.argv[1]
38 | output_raw_dir = sys.argv[2]
39 | 
40 | if not os.path.exists(output_raw_dir):
41 |     os.makedirs(output_raw_dir)
42 | 
43 | files = os.listdir(directory_name)
44 | for file in files:
45 |     add_line = False
46 |     write_text = False
47 |     if fnmatch.fnmatch(file,"*.xml"):
48 |         print "******",file
49 |         newname = file[:-4]+".txt"
50 |         raw_writer = open(output_raw_dir+"/"+newname,"w")
51 |         file_reader = open(directory_name+"/"+file)
52 |         text = ""
53 | 
54 |         header_end = False
55 |         while not header_end:
56 |             line = file_reader.readline()
57 |             m = re.search('\s*\]>', line)
58 |             if m:
59 |                 header_end = True
60 | 
61 |         for line in file_reader.readlines():
62 |             text = line.strip()
63 |             text = strip_text(text).strip()
64 |             if text != "":
65 |                 raw_writer.write(text)
66 |                 raw_writer.write("\n")
67 | 
68 |         raw_writer.close()
69 |                 
70 | 


--------------------------------------------------------------------------------
/src/main/python/trrraw2plain.py:
--------------------------------------------------------------------------------
 1 | import sys, os
 2 | 
 3 | outDir = sys.argv[2]
 4 | if(not outDir[-1] == "/"):
 5 |     outDir += "/"
 6 | 
 7 | def processFile(filename):
 8 |     global outDir
 9 |     inFile = open(filename,'r')
10 |     newFilename = filename[filename.rfind("/")+1:-3] + ".txt"
11 |     outFile = open(outDir + newFilename, 'w')
12 |     wroteSomething = False
13 |     while(True):
14 |         curLine = inFile.readline()
15 |         if(curLine == ""): break
16 |         if(curLine.startswith(" ") or curLine.startswith("\t")): continue
17 |         nextToken = curLine.split()[0]
18 |         processedToken = nextToken.replace("&equo;", "'").replace("&dquo;", '"').replace("&dollar;", "$").replace("&dash;", "-").replace("&amp;", "&").replace("&times;", "*")
19 |         if(processedToken[0].isalnum() and wroteSomething):
20 |             outFile.write(" ") 
21 |         outFile.write(processedToken)
22 |         wroteSomething = True
23 |     inFile.close()
24 | 
25 | def processDirectory(dirname):
26 |   fileList = os.listdir(dirname)
27 |   if(not dirname[-1] == "/"):
28 |     dirname += "/"
29 |   for filename in fileList:
30 |     if(os.path.isdir(dirname + filename)):
31 |       processDirectory(dirname + filename)
32 |     elif(os.path.isfile(dirname + filename)):
33 |       processFile(dirname + filename)
34 | 
35 | processDirectory(sys.argv[1])
36 | 


--------------------------------------------------------------------------------
/src/main/python/unescape_entities.py:
--------------------------------------------------------------------------------
 1 | import re, htmlentitydefs
 2 | 
 3 | # NOTE: Courtesy of Frederik Lundh.
 4 | #
 5 | # http://effbot.org/zone/re-sub.htm#unescape-html
 6 | 
 7 | ##
 8 | # Removes HTML or XML character references and entities from a text string.
 9 | #
10 | # @param text The HTML (or XML) source text.
11 | # @return The plain text, as a Unicode string, if necessary.
12 | 
13 | def unescape(text):
14 |     def fixup(m):
15 |         text = m.group(0)
16 |         if text[:2] == "&#":
17 |             # character reference
18 |             try:
19 |                 if text[:3] == "&#x":
20 |                     return unichr(int(text[3:-1], 16))
21 |                 else:
22 |                     return unichr(int(text[2:-1]))
23 |             except ValueError:
24 |                 pass
25 |         else:
26 |             # named entity
27 |             try:
28 |                 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
29 |             except KeyError:
30 |                 pass
31 |         return text # leave as is
32 |     return re.sub("&#?\w+;", fixup, text)
33 | 


--------------------------------------------------------------------------------
/src/main/resources/data/deu/stopwords.txt:
--------------------------------------------------------------------------------
  1 | aber
  2 | alle
  3 | allem
  4 | allen
  5 | aller
  6 | alles
  7 | als
  8 | also
  9 | am
 10 | an
 11 | ander
 12 | andere
 13 | anderem
 14 | anderen
 15 | anderer
 16 | anderes
 17 | anderm
 18 | andern
 19 | anderr
 20 | anders
 21 | auch
 22 | auf
 23 | aus
 24 | bei
 25 | bin
 26 | bis
 27 | bist
 28 | da
 29 | damit
 30 | dann
 31 | der
 32 | den
 33 | des
 34 | dem
 35 | die
 36 | das
 37 | daß
 38 | derselbe
 39 | derselben
 40 | denselben
 41 | desselben
 42 | demselben
 43 | dieselbe
 44 | dieselben
 45 | dasselbe
 46 | dazu
 47 | dein
 48 | deine
 49 | deinem
 50 | deinen
 51 | deiner
 52 | deines
 53 | denn
 54 | derer
 55 | dessen
 56 | dich
 57 | dir
 58 | du
 59 | dies
 60 | diese
 61 | diesem
 62 | diesen
 63 | dieser
 64 | dieses
 65 | doch
 66 | dort
 67 | durch
 68 | ein
 69 | eine
 70 | einem
 71 | einen
 72 | einer
 73 | eines
 74 | einig
 75 | einige
 76 | einigem
 77 | einigen
 78 | einiger
 79 | einiges
 80 | einmal
 81 | er
 82 | ihn
 83 | ihm
 84 | es
 85 | etwas
 86 | euer
 87 | eure
 88 | eurem
 89 | euren
 90 | eurer
 91 | eures
 92 | für
 93 | gegen
 94 | gewesen
 95 | hab
 96 | habe
 97 | haben
 98 | hat
 99 | hatte
100 | hatten
101 | hier
102 | hin
103 | hinter
104 | ich
105 | mich
106 | mir
107 | ihr
108 | ihre
109 | ihrem
110 | ihren
111 | ihrer
112 | ihres
113 | euch
114 | im
115 | in
116 | indem
117 | ins
118 | ist
119 | jede
120 | jedem
121 | jeden
122 | jeder
123 | jedes
124 | jene
125 | jenem
126 | jenen
127 | jener
128 | jenes
129 | jetzt
130 | kann
131 | kein
132 | keine
133 | keinem
134 | keinen
135 | keiner
136 | keines
137 | können
138 | könnte
139 | machen
140 | man
141 | manche
142 | manchem
143 | manchen
144 | mancher
145 | manches
146 | mein
147 | meine
148 | meinem
149 | meinen
150 | meiner
151 | meines
152 | mit
153 | muss
154 | musste
155 | nach
156 | nicht
157 | nichts
158 | noch
159 | nun
160 | nur
161 | ob
162 | oder
163 | ohne
164 | sehr
165 | sein
166 | seine
167 | seinem
168 | seinen
169 | seiner
170 | seines
171 | selbst
172 | sich
173 | sie
174 | ihnen
175 | sind
176 | so
177 | solche
178 | solchem
179 | solchen
180 | solcher
181 | solches
182 | soll
183 | sollte
184 | sondern
185 | sonst
186 | über
187 | um
188 | und
189 | uns
190 | unse
191 | unsem
192 | unsen
193 | unser
194 | unses
195 | unter
196 | viel
197 | vom
198 | von
199 | vor
200 | während
201 | war
202 | waren
203 | warst
204 | was
205 | weg
206 | weil
207 | weiter
208 | welche
209 | welchem
210 | welchen
211 | welcher
212 | welches
213 | wenn
214 | werde
215 | werden
216 | wie
217 | wieder
218 | will
219 | wir
220 | wird
221 | wirst
222 | wo
223 | wollen
224 | wollte
225 | würde
226 | würden
227 | zu
228 | zum
229 | zur
230 | zwar
231 | zwischen
232 | 


--------------------------------------------------------------------------------
/src/main/resources/data/por/stopwords.txt:
--------------------------------------------------------------------------------
  1 | de
  2 | a
  3 | o
  4 | que
  5 | e
  6 | do
  7 | da
  8 | em
  9 | um
 10 | para
 11 | com
 12 | não
 13 | uma
 14 | os
 15 | no
 16 | se
 17 | na
 18 | por
 19 | mais
 20 | as
 21 | dos
 22 | como
 23 | mas
 24 | ao
 25 | ele
 26 | das
 27 | à
 28 | seu
 29 | sua
 30 | ou
 31 | quando
 32 | muito
 33 | nos
 34 | já
 35 | eu
 36 | também
 37 | só
 38 | pelo
 39 | pela
 40 | até
 41 | isso
 42 | ela
 43 | entre
 44 | depois
 45 | sem
 46 | mesmo
 47 | aos
 48 | seus
 49 | quem
 50 | nas
 51 | me
 52 | esse
 53 | eles
 54 | você
 55 | essa
 56 | num
 57 | nem
 58 | suas
 59 | meu
 60 | às
 61 | minha
 62 | numa
 63 | pelos
 64 | elas
 65 | qual
 66 | nós
 67 | lhe
 68 | deles
 69 | essas
 70 | esses
 71 | pelas
 72 | este
 73 | dele
 74 | tu
 75 | te
 76 | vocês
 77 | vos
 78 | lhes
 79 | meus
 80 | minhas
 81 | teu
 82 | tua
 83 | teus
 84 | tuas
 85 | nosso
 86 | nossa
 87 | nossos
 88 | nossas
 89 | dela
 90 | delas
 91 | esta
 92 | estes
 93 | estas
 94 | aquele
 95 | aquela
 96 | aqueles
 97 | aquelas
 98 | isto
 99 | aquilo
100 | estou
101 | está
102 | estamos
103 | estão
104 | estive
105 | esteve
106 | estivemos
107 | estiveram
108 | estava
109 | estávamos
110 | estavam
111 | estivera
112 | estivéramos
113 | esteja
114 | estejamos
115 | estejam
116 | estivesse
117 | estivéssemos
118 | estivessem
119 | estiver
120 | estivermos
121 | estiverem
122 | hei
123 | há
124 | havemos
125 | hão
126 | houve
127 | houvemos
128 | houveram
129 | houvera
130 | houvéramos
131 | haja
132 | hajamos
133 | hajam
134 | houvesse
135 | houvéssemos
136 | houvessem
137 | houver
138 | houvermos
139 | houverem
140 | houverei
141 | houverá
142 | houveremos
143 | houverão
144 | houveria
145 | houveríamos
146 | houveriam
147 | sou
148 | somos
149 | são
150 | era
151 | éramos
152 | eram
153 | fui
154 | foi
155 | fomos
156 | foram
157 | fora
158 | fôramos
159 | seja
160 | sejamos
161 | sejam
162 | fosse
163 | fôssemos
164 | fossem
165 | for
166 | formos
167 | forem
168 | serei
169 | será
170 | seremos
171 | serão
172 | seria
173 | seríamos
174 | seriam
175 | tenho
176 | tem
177 | temos
178 | tém
179 | tinha
180 | tínhamos
181 | tinham
182 | tive
183 | teve
184 | tivemos
185 | tiveram
186 | tivera
187 | tivéramos
188 | tenha
189 | tenhamos
190 | tenham
191 | tivesse
192 | tivéssemos
193 | tivessem
194 | tiver
195 | tivermos
196 | tiverem
197 | terei
198 | terá
199 | teremos
200 | terão
201 | teria
202 | teríamos
203 | teriam
204 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/geolocate/CombinedModelCell.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  CombinedModelCellGrid.scala
 3 | //
 4 | //  Copyright (C) 2012 Stephen Roller, The University of Texas at Austin
 5 | //
 6 | //  Licensed under the Apache License, Version 2.0 (the "License");
 7 | //  you may not use this file except in compliance with the License.
 8 | //  You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | //  Unless required by applicable law or agreed to in writing, software
13 | //  distributed under the License is distributed on an "AS IS" BASIS,
14 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | //  See the License for the specific language governing permissions and
16 | //  limitations under the License.
17 | ///////////////////////////////////////////////////////////////////////////////
18 | 
19 | package opennlp.fieldspring.geolocate
20 | 
21 | import opennlp.fieldspring.util.distances.spheredist
22 | import opennlp.fieldspring.util.distances.SphereCoord
23 | import opennlp.fieldspring.util.experiment._
24 | import opennlp.fieldspring.util.printutil.{errprint, warning}
25 | 
26 | class CombinedModelCellGrid(table: SphereDocumentTable,
27 |                             models: Seq[SphereCellGrid])
28 |     extends SphereCellGrid(table) {
29 | 
30 |   override var total_num_cells: Int = models.map(_.total_num_cells).sum
31 |   override val num_training_passes: Int = models.map(_.num_training_passes).max
32 | 
33 |   var current_training_pass: Int = 0
34 | 
35 |   override def begin_training_pass(pass: Int) = {
36 |     current_training_pass = pass
37 |     for (model <- models) {
38 |       if (pass <= model.num_training_passes) {
39 |         model.begin_training_pass(pass)
40 |       }
41 |     }
42 |   }
43 | 
44 |   def find_best_cell_for_document(doc: SphereDocument,
45 |                                   create_non_recorded: Boolean) = {
46 |       val candidates =
47 |         models.map(_.find_best_cell_for_document(doc, create_non_recorded))
48 |               .filter(_ != null)
49 |       candidates.minBy((cell: SphereCell) =>
50 |                          spheredist(cell.get_center_coord, doc.coord))
51 |   }
52 | 
53 |   def add_document_to_cell(document: SphereDocument) {
54 |     for (model <- models) {
55 |       if (current_training_pass <= model.num_training_passes) {
56 |         model.add_document_to_cell(document)
57 |       }
58 |     }
59 |   }
60 | 
61 |   def initialize_cells() {
62 |   }
63 | 
64 |   override def finish() {
65 |     for (model <- models) {
66 |       model.finish()
67 |     }
68 |     num_non_empty_cells = models.map(_.num_non_empty_cells).sum
69 |   }
70 | 
71 |   def iter_nonempty_cells(nonempty_word_dist: Boolean = false): Iterable[SphereCell] = {
72 |     models.map(_.iter_nonempty_cells(nonempty_word_dist))
73 |           .reduce(_ ++ _)
74 |   }
75 | }
76 | 
77 | 
78 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/geolocate/TwitterDocument.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  TwitterDocument.scala
 3 | //
 4 | //  Copyright (C) 2011 Ben Wing, The University of Texas at Austin
 5 | //
 6 | //  Licensed under the Apache License, Version 2.0 (the "License");
 7 | //  you may not use this file except in compliance with the License.
 8 | //  You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | //  Unless required by applicable law or agreed to in writing, software
13 | //  distributed under the License is distributed on an "AS IS" BASIS,
14 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | //  See the License for the specific language governing permissions and
16 | //  limitations under the License.
17 | ///////////////////////////////////////////////////////////////////////////////
18 | 
19 | package opennlp.fieldspring.geolocate
20 | 
21 | import opennlp.fieldspring.util.textdbutil.Schema
22 | 
23 | import opennlp.fieldspring.worddist.WordDist.memoizer._
24 | 
25 | class TwitterTweetDocument(
26 |   schema: Schema,
27 |   subtable: TwitterTweetDocumentSubtable
28 | ) extends SphereDocument(schema, subtable.table) {
29 |   var id = 0L
30 |   def title = id.toString
31 | 
32 |   override def set_field(field: String, value: String) {
33 |     field match {
34 |       case "title" => id = value.toLong
35 |       case _ => super.set_field(field, value)
36 |     }
37 |   }
38 | 
39 |   def struct =
40 |     <TwitterTweetDocument>
41 |       <id>{ id }</id>
42 |       {
43 |         if (has_coord)
44 |           <location>{ coord }</location>
45 |       }
46 |     </TwitterTweetDocument>
47 | }
48 | 
49 | class TwitterTweetDocumentSubtable(
50 |   table: SphereDocumentTable
51 | ) extends SphereDocumentSubtable[TwitterTweetDocument](table) {
52 |   def create_document(schema: Schema) = new TwitterTweetDocument(schema, this)
53 | }
54 | 
55 | class TwitterUserDocument(
56 |   schema: Schema,
57 |   subtable: TwitterUserDocumentSubtable
58 | ) extends SphereDocument(schema, subtable.table) {
59 |   var userind = blank_memoized_string
60 |   def title = unmemoize_string(userind)
61 | 
62 |   override def set_field(field: String, value: String) {
63 |     field match {
64 |       case "user" => userind = memoize_string(value)
65 |       case _ => super.set_field(field, value)
66 |     }
67 |   }
68 | 
69 |   def struct =
70 |     <TwitterUserDocument>
71 |       <user>{ unmemoize_string(userind) }</user>
72 |       {
73 |         if (has_coord)
74 |           <location>{ coord }</location>
75 |       }
76 |     </TwitterUserDocument>
77 | }
78 | 
79 | class TwitterUserDocumentSubtable(
80 |   table: SphereDocumentTable
81 | ) extends SphereDocumentSubtable[TwitterUserDocument](table) {
82 |   def create_document(schema: Schema) = new TwitterUserDocument(schema, this)
83 | }
84 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/gridlocate/TextGrounderInfo.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  FieldspringInfo.scala
 3 | //
 4 | //  Copyright (C) 2011 Ben Wing, The University of Texas at Austin
 5 | //
 6 | //  Licensed under the Apache License, Version 2.0 (the "License");
 7 | //  you may not use this file except in compliance with the License.
 8 | //  You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | //  Unless required by applicable law or agreed to in writing, software
13 | //  distributed under the License is distributed on an "AS IS" BASIS,
14 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | //  See the License for the specific language governing permissions and
16 | //  limitations under the License.
17 | ///////////////////////////////////////////////////////////////////////////////
18 | 
19 | package opennlp.fieldspring.gridlocate
20 | 
21 | import opennlp.fieldspring.util.printutil.errprint
22 | 
23 | /**
24 |  Fieldspring-specific information (e.g. env vars).
25 |  */
26 | 
27 | object FieldspringInfo {
28 |   var fieldspring_dir: String = null
29 | 
30 |   def set_fieldspring_dir(dir: String) {
31 |     fieldspring_dir = dir
32 |   }
33 | 
34 |   def get_fieldspring_dir() = {
35 |     if (fieldspring_dir == null)
36 |       fieldspring_dir = System.getenv("FIELDSPRING_DIR")
37 |     if (fieldspring_dir == null) {
38 |       errprint("""FIELDSPRING_DIR must be set to the top-level directory where
39 | Fieldspring is installed.""")
40 |       require(fieldspring_dir != null)
41 |     }
42 |     fieldspring_dir
43 |   }
44 | }
45 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/perceptron/package.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  package.scala
 3 | //
 4 | //  Copyright (C) 2012 Ben Wing, The University of Texas at Austin
 5 | //
 6 | //  Licensed under the Apache License, Version 2.0 (the "License");
 7 | //  you may not use this file except in compliance with the License.
 8 | //  You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | //  Unless required by applicable law or agreed to in writing, software
13 | //  distributed under the License is distributed on an "AS IS" BASIS,
14 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | //  See the License for the specific language governing permissions and
16 | //  limitations under the License.
17 | ///////////////////////////////////////////////////////////////////////////////
18 | 
19 | package opennlp.fieldspring
20 | 
21 | package object perceptron {
22 |   type WeightVector = Array[Double]
23 | }
24 | 
25 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/poligrounder/TimeDocument.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  TimeDocument.scala
 3 | //
 4 | //  Copyright (C) 2011, 2012 Ben Wing, The University of Texas at Austin
 5 | //
 6 | //  Licensed under the Apache License, Version 2.0 (the "License");
 7 | //  you may not use this file except in compliance with the License.
 8 | //  You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | //  Unless required by applicable law or agreed to in writing, software
13 | //  distributed under the License is distributed on an "AS IS" BASIS,
14 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | //  See the License for the specific language governing permissions and
16 | //  limitations under the License.
17 | ///////////////////////////////////////////////////////////////////////////////
18 | 
19 | package opennlp.fieldspring.poligrounder
20 | 
21 | import collection.mutable
22 | 
23 | import opennlp.fieldspring.util.distances._
24 | import opennlp.fieldspring.util.textdbutil.Schema
25 | import opennlp.fieldspring.util.printutil._
26 | 
27 | import opennlp.fieldspring.gridlocate.{DistDocument,DistDocumentTable,CellGrid}
28 | import opennlp.fieldspring.gridlocate.DistDocumentConverters._
29 | 
30 | import opennlp.fieldspring.worddist.WordDistFactory
31 | 
32 | class TimeDocument(
33 |   schema: Schema,
34 |   table: TimeDocumentTable
35 | ) extends DistDocument[TimeCoord](schema, table) {
36 |   var coord: TimeCoord = _
37 |   var user: String = _
38 |   def has_coord = coord != null
39 |   def title = if (coord != null) coord.toString else "unknown time"
40 | 
41 |   def struct =
42 |     <TimeDocument>
43 |       {
44 |         if (has_coord)
45 |           <timestamp>{ coord }</timestamp>
46 |       }
47 |     </TimeDocument>
48 | 
49 |   override def set_field(name: String, value: String) {
50 |     name match {
51 |       case "min-timestamp" => coord = get_x_or_null[TimeCoord](value)
52 |       case "user" => user = value
53 |       case _ => super.set_field(name, value)
54 |     }
55 |   }
56 | 
57 |   def coord_as_double(coor: TimeCoord) = coor match {
58 |     case null => Double.NaN
59 |     case TimeCoord(x) => x.toDouble / 1000
60 |   }
61 | 
62 |   def distance_to_coord(coord2: TimeCoord) = {
63 |     (coord_as_double(coord2) - coord_as_double(coord)).abs
64 |   }
65 |   def output_distance(dist: Double) = "%s seconds" format dist
66 | }
67 | 
68 | /**
69 |  * A DistDocumentTable specifically for documents with coordinates described
70 |  * by a TimeCoord.
71 |  * We delegate the actual document creation to a subtable specific to the
72 |  * type of corpus (e.g. Wikipedia or Twitter).
73 |  */
74 | class TimeDocumentTable(
75 |   override val driver: PoligrounderDriver,
76 |   word_dist_factory: WordDistFactory
77 | ) extends DistDocumentTable[TimeCoord, TimeDocument, TimeCellGrid](
78 |   driver, word_dist_factory
79 | ) {
80 |   def create_document(schema: Schema) = new TimeDocument(schema, this)
81 | }
82 | 
83 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/postprocess/DocumentRankerByError.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  DocumentRankerByError.scala
 3 | //
 4 | //  Copyright (C) 2012 Mike Speriosu, The University of Texas at Austin
 5 | //
 6 | //  Licensed under the Apache License, Version 2.0 (the "License");
 7 | //  you may not use this file except in compliance with the License.
 8 | //  You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | //  Unless required by applicable law or agreed to in writing, software
13 | //  distributed under the License is distributed on an "AS IS" BASIS,
14 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | //  See the License for the specific language governing permissions and
16 | //  limitations under the License.
17 | ///////////////////////////////////////////////////////////////////////////////
18 | 
19 | package opennlp.fieldspring.postprocess
20 | 
21 | // This program takes a log file and outputs the document names to standard out, ranked by prediction error.
22 | 
23 | import org.clapper.argot._
24 | import opennlp.fieldspring.tr.topo._
25 | import opennlp.fieldspring.tr.util.LogUtil
26 | 
27 | object DocumentRankerByError {
28 | 
29 |   import ArgotConverters._
30 | 
31 |   val parser = new ArgotParser("fieldspring run opennlp.fieldspring.postprocess.DocumentRankerByError", preUsage = Some("Fieldspring"))
32 |   val logFile = parser.option[String](List("l", "log"), "log", "log input file")
33 |   
34 |   def main(args: Array[String]) {
35 |     try {
36 |       parser.parse(args)
37 |     }
38 |     catch {
39 |       case e: ArgotUsageException => println(e.message); sys.exit(0)
40 |     }
41 | 
42 |     if(logFile.value == None) {
43 |       println("You must specify a log input file via -l.")
44 |       sys.exit(0)
45 |     }
46 | 
47 |     val docsAndErrors:List[(String, Double, Coordinate, Coordinate)] =
48 |       (for(pe <- LogUtil.parseLogFile(logFile.value.get)) yield {
49 |         val dist = pe.trueCoord.distanceInKm(pe.predCoord)
50 | 
51 |         (pe.docName, dist, pe.trueCoord, pe.predCoord)
52 |       }).sortWith((x, y) => x._2 < y._2)
53 | 
54 |     for((docName, dist, trueCoord, predCoord) <- docsAndErrors) {
55 |       println(docName+"\t"+dist+"\t"+trueCoord+"\t"+predCoord)
56 |     }
57 |   }
58 | }
59 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/preprocess/Permute.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Permute.scala
 3 | //
 4 | //  Copyright (C) 2012 Stephen Roller, The University of Texas at Austin
 5 | //
 6 | //  Licensed under the Apache License, Version 2.0 (the "License");
 7 | //  you may not use this file except in compliance with the License.
 8 | //  You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | //  Unless required by applicable law or agreed to in writing, software
13 | //  distributed under the License is distributed on an "AS IS" BASIS,
14 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | //  See the License for the specific language governing permissions and
16 | //  limitations under the License.
17 | ///////////////////////////////////////////////////////////////////////////////
18 | 
19 | package opennlp.fieldspring.preprocess
20 | 
21 | import util.Random
22 | import com.nicta.scoobi.Scoobi._
23 | import java.io._
24 | 
25 | /*
26 |  * This program randomly permutes all the lines in a text file, using Hadoop
27 |  * and Scoobi.
28 |  */
29 | 
30 | object Permute extends ScoobiApp {
31 |   val rnd = new Random
32 | 
33 |   def generate_key(line: String): (Double, String) = {
34 |     (rnd.nextDouble, line)
35 |   }
36 | 
37 |   def remove_key(kvs: (Double, Iterable[String])): Iterable[String] = {
38 |     val (key, values) = kvs
39 |     for (v <- values)
40 |       yield v
41 |   }
42 | 
43 |   def run() {
44 |     // make sure we get all the input
45 |     val (inputPath, outputPath) =
46 |       if (args.length == 2) {
47 |         (args(0), args(1))
48 |       } else {
49 |         sys.error("Expecting input and output path.")
50 |       }
51 | 
52 |     // Firstly we load up all the (new-line-seperated) json lines
53 |     val lines: DList[String] = TextInput.fromTextFile(inputPath)
54 | 
55 |     // randomly generate keys
56 |     val with_keys = lines.map(generate_key)
57 | 
58 |     // sort by keys
59 |     val keys_sorted = with_keys.groupByKey
60 | 
61 |     // remove keys
62 |     val keys_removed = keys_sorted.flatMap(remove_key)
63 | 
64 |     // save to disk
65 |     persist(TextOutput.toTextFile(keys_removed, outputPath))
66 | 
67 |   }
68 | }
69 | 
70 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/preprocess/ProcessFiles.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  ProcessFiles.scala
 3 | //
 4 | //  Copyright (C) 2011 Ben Wing, The University of Texas at Austin
 5 | //
 6 | //  Licensed under the Apache License, Version 2.0 (the "License");
 7 | //  you may not use this file except in compliance with the License.
 8 | //  You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | //  Unless required by applicable law or agreed to in writing, software
13 | //  distributed under the License is distributed on an "AS IS" BASIS,
14 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | //  See the License for the specific language governing permissions and
16 | //  limitations under the License.
17 | ///////////////////////////////////////////////////////////////////////////////
18 | 
19 | package opennlp.fieldspring.preprocess
20 | 
21 | import opennlp.fieldspring.util.argparser._
22 | import opennlp.fieldspring.util.experiment._
23 | import opennlp.fieldspring.util.ioutil._
24 | 
25 | /*
26 |    Common code for doing basic file-processing operations.
27 | 
28 |    FIXME: It's unclear there's enough code to justify factoring it out
29 |    like this.
30 | */
31 | 
32 | /////////////////////////////////////////////////////////////////////////////
33 | //                                  Main code                              //
34 | /////////////////////////////////////////////////////////////////////////////
35 | 
36 | /**
37 |  * Class for defining and retrieving command-line arguments.  Consistent
38 |  * with "field-style" access to an ArgParser, this class needs to be
39 |  * instantiated twice with the same ArgParser object, before and after parsing
40 |  * the command line.  The first instance defines the allowed arguments in the
41 |  * ArgParser, while the second one retrieves the values stored into the
42 |  * ArgParser as a result of parsing.
43 |  *
44 |  * @param ap ArgParser object.
45 |  */
46 | class ProcessFilesParameters(ap: ArgParser) extends
47 |     ArgParserParameters(ap) {
48 |   val output_dir =
49 |     ap.option[String]("o", "output-dir",
50 |       metavar = "DIR",
51 |       help = """Directory to store output files in.  It must not already
52 | exist, and will be created (including any parent directories).""")
53 | }
54 | 
55 | abstract class ProcessFilesDriver extends HadoopableArgParserExperimentDriver {
56 |   override type TParam <: ProcessFilesParameters
57 |   type TRunRes = Unit
58 | 
59 |   def handle_parameters() {
60 |     need(params.output_dir, "output-dir")
61 |   }
62 | 
63 |   def setup_for_run() { }
64 | 
65 |   def run_after_setup() {
66 |     if (!get_file_handler.make_directories(params.output_dir))
67 |       param_error("Output dir %s must not already exist" format
68 |         params.output_dir)
69 |   }
70 | }
71 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/preprocess/ScoobiWordCount.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.preprocess
 2 | 
 3 | import com.nicta.scoobi.Scoobi._
 4 | // import com.nicta.scoobi.testing.HadoopLogFactory
 5 | import com.nicta.scoobi.application.HadoopLogFactory
 6 | import org.apache.commons.logging.LogFactory
 7 | import org.apache.hadoop.fs.FileSystem
 8 | import java.io._
 9 | 
10 | object ScoobiWordCount extends ScoobiApp {
11 |   def run() {
12 |     // There's some magic here in the source code to make the get() call
13 |     // work -- there's an implicit conversion in object ScoobiConfiguration
14 |     // from a ScoobiConfiguration to a Hadoop Configuration, which has get()
15 |     // defined on it.  Evidently implicit conversions in the companion object
16 |     // get made available automatically for classes or something?
17 |     System.err.println("mapred.job.tracker " +
18 |       configuration.get("mapred.job.tracker", "value not found"))
19 |     // System.err.println("job tracker " + jobTracker)
20 |     // System.err.println("file system " + fs)
21 |     System.err.println("configure file system " + configuration.fs)
22 |     System.err.println("file system key " +
23 |       configuration.get(FileSystem.FS_DEFAULT_NAME_KEY, "value not found"))
24 | 
25 |     val lines =
26 |       // Test fromTextFileWithPath, but currently appears to trigger an
27 |       // infinite loop.
28 |       // TextInput.fromTextFileWithPath(args(0))
29 |       TextInput.fromTextFile(args(0)).map(x => (args(0), x))
30 | 
31 |     def splitit(x: String) = {
32 |       HadoopLogFactory.setQuiet(false)
33 |       // val logger = LogFactory.getLog("foo.bar")
34 |       // logger.info("Processing " + x)
35 |       // System.err.println("Processing", x)
36 |       x.split(" ")
37 |     }
38 |     //val counts = lines.flatMap(_.split(" "))
39 |     val counts = lines.map(_._2).flatMap(splitit)
40 |                           .map(word => (word, 1))
41 |                           .groupByKey
42 |                           .filter { case (word, lens) => word.length < 8 }
43 |                           .filter { case (word, lens) => lens.exists(x => true) }
44 |                           .combine((a: Int, b: Int) => a + b)
45 |     persist(toTextFile(counts, args(1)))
46 |   }
47 | }
48 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/app/ConvertCorpusToPlaintext.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.app
 2 | 
 3 | import java.io._
 4 | 
 5 | import opennlp.fieldspring.tr.topo._
 6 | import opennlp.fieldspring.tr.text._
 7 | import opennlp.fieldspring.tr.text.prep._
 8 | import opennlp.fieldspring.tr.text.io._
 9 | 
10 | import scala.collection.JavaConversions._
11 | 
12 | object ConvertCorpusToPlaintext extends App {
13 | 
14 |   val outDirName = if(args(1).endsWith("/")) args(1) else args(1)+"/"
15 |   val outDir = new File(outDirName)
16 |   if(!outDir.exists)
17 |     outDir.mkdir
18 | 
19 |   val tokenizer = new OpenNLPTokenizer
20 | 
21 |   val corpus = Corpus.createStoredCorpus
22 |   corpus.addSource(new TrXMLDirSource(new File(args(0)), tokenizer))
23 |   corpus.setFormat(BaseApp.CORPUS_FORMAT.TRCONLL)
24 |   corpus.load
25 | 
26 |   for(doc <- corpus) {
27 |     val out = new BufferedWriter(new FileWriter(outDirName+doc.getId+".txt"))
28 |     for(sent <- doc) {
29 |       for(token <- sent) {
30 |         out.write(token.getForm+" ")
31 |       }
32 |       out.write("\n")
33 |     }
34 |     out.close
35 |   }
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/app/ConvertCorpusToToponymAsDoc.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.app
 2 | 
 3 | import java.io._
 4 | 
 5 | import opennlp.fieldspring.tr.topo._
 6 | import opennlp.fieldspring.tr.text._
 7 | import opennlp.fieldspring.tr.text.prep._
 8 | import opennlp.fieldspring.tr.text.io._
 9 | import opennlp.fieldspring.tr.util._
10 | 
11 | import scala.collection.JavaConversions._
12 | 
13 | object ConvertCorpusToToponymAsDoc extends App {
14 | 
15 |   val windowSize = if(args.length >= 2) args(1).toInt else 0
16 | 
17 |   val alphanumRE = """^[a-zA-Z0-9]+$""".r
18 | 
19 |   val tokenizer = new OpenNLPTokenizer
20 | 
21 |   val corpus = Corpus.createStoredCorpus
22 |   corpus.addSource(new TrXMLDirSource(new File(args(0)), tokenizer))
23 |   corpus.setFormat(BaseApp.CORPUS_FORMAT.TRCONLL)
24 |   corpus.load
25 | 
26 |   for(doc <- corpus) {
27 |     val docAsArray = TextUtil.getDocAsArray(doc)
28 |     var tokIndex = 0
29 |     for(token <- docAsArray) {
30 |       if(token.isToponym && token.asInstanceOf[Toponym].hasGold) {
31 |         val goldCoord = token.asInstanceOf[Toponym].getGold.getRegion.getCenter
32 | 
33 |         val unigramCounts = getUnigramCounts(docAsArray, tokIndex, windowSize)
34 |         
35 |         print(doc.getId.drop(1)+"_"+tokIndex+"\t")
36 |         print(doc.getId+"_"+tokIndex+"\t")
37 |         print(goldCoord.getLatDegrees+","+goldCoord.getLngDegrees+"\t")
38 |         print("1\t\tMain\tno\tno\tno\t")
39 |         //print(token.getForm+":"+1+" ")\
40 |         for((word, count) <- unigramCounts) {
41 |           print(word+":"+count+" ")
42 |         }
43 |         println
44 |       }
45 |       tokIndex += 1
46 |     }
47 |   }
48 | 
49 |   def getUnigramCounts(docAsArray:Array[StoredToken], tokIndex:Int, windowSize:Int): Map[String, Int] = {
50 | 
51 |     val startIndex = math.max(0, tokIndex - windowSize)
52 |     val endIndex = math.min(docAsArray.length, tokIndex + windowSize + 1)
53 | 
54 |     val unigramCounts = new collection.mutable.HashMap[String, Int]
55 | 
56 |     for(rawToken <- docAsArray.slice(startIndex, endIndex)) {
57 |       for(token <- rawToken.getForm.split(" ")) {
58 |         val prevCount = unigramCounts.getOrElse(token, 0)
59 |         unigramCounts.put(token, prevCount + 1)
60 |       }
61 |     }
62 | 
63 |     unigramCounts.toMap
64 |   }
65 | }
66 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/app/ConvertCorpusToUnigramCounts.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.app
 2 | 
 3 | import java.io._
 4 | import java.util.zip._
 5 | 
 6 | import opennlp.fieldspring.tr.util._
 7 | import opennlp.fieldspring.tr.topo._
 8 | import opennlp.fieldspring.tr.topo.gaz._
 9 | import opennlp.fieldspring.tr.text._
10 | import opennlp.fieldspring.tr.text.prep._
11 | import opennlp.fieldspring.tr.text.io._
12 | 
13 | import scala.collection.JavaConversions._
14 | 
15 | object ConvertCorpusToUnigramCounts extends BaseApp {
16 | 
17 |   val alphanumRE = """^[a-z0-9]+$""".r
18 | 
19 |   //val tokenizer = new OpenNLPTokenizer
20 | 
21 |   def main(args:Array[String]) { 
22 | 
23 |     initializeOptionsFromCommandLine(args);
24 | 
25 |     /*var corpus = Corpus.createStoredCorpus
26 | 
27 |     if(getCorpusFormat == BaseApp.CORPUS_FORMAT.PLAIN/**/) {
28 |       /*
29 |       val tokenizer = new OpenNLPTokenizer
30 |       //val recognizer = new OpenNLPRecognizer
31 |       //val gis = new GZIPInputStream(new FileInputStream(args(1)))
32 |       //val ois = new ObjectInputStream(gis)
33 |       //val gnGaz = ois.readObject.asInstanceOf[GeoNamesGazetteer]
34 |       //gis.close
35 |       corpus.addSource(new PlainTextSource(
36 |         new BufferedReader(new FileReader(args(0))), new OpenNLPSentenceDivider(), tokenizer))
37 |       //corpus.addSource(new ToponymAnnotator(new PlainTextSource(
38 |       //		new BufferedReader(new FileReader(args(0))), new OpenNLPSentenceDivider(), tokenizer),
39 |       //          recognizer, gnGaz, null))
40 |       corpus.setFormat(BaseApp.CORPUS_FORMAT.PLAIN)
41 |       */
42 |       val importCorpus = new ImportCorpus
43 |       //if(args(0).endsWith("txt"))
44 |       corpus = importCorpus.doImport(getCorpusInputPath, , getCorpusFormat, false)
45 |       //else
46 |       //  corpus = importCorpus
47 |     }
48 |     else if(getCorpusFormat == BaseApp.CORPUS_FORMAT.TRCONLL) {
49 |       corpus.addSource(new TrXMLDirSource(new File(args(0)), tokenizer))
50 |       corpus.setFormat(BaseApp.CORPUS_FORMAT.TRCONLL)
51 |       corpus.load
52 |     }
53 |     //corpus.load*/
54 | 
55 |     val corpus = TopoUtil.readStoredCorpusFromSerialized(getSerializedCorpusInputPath)
56 |     
57 |     var i = 0
58 |     for(doc <- corpus) {
59 |       val unigramCounts = new collection.mutable.HashMap[String, Int]
60 |       for(sent <- doc) {
61 |         for(rawToken <- sent) {
62 |           for(token <- rawToken.getForm.split(" ")) {
63 |             val ltoken = token.toLowerCase
64 |             if(alphanumRE.findFirstIn(ltoken) != None) {
65 |               val prevCount = unigramCounts.getOrElse(ltoken, 0)
66 |               unigramCounts.put(ltoken, prevCount + 1)
67 |             }
68 |           }
69 |         }
70 |       }
71 |     
72 |       print(i/*doc.getId.drop(1)*/ +"\t")
73 |       print(doc.getId+"\t")
74 |       print("0,0\t")
75 |       print("1\t\tMain\tno\tno\tno\t")
76 |       for((word, count) <- unigramCounts) {
77 |         print(word+":"+count+" ")
78 |       }
79 |       println
80 |       i += 1
81 |     }
82 | 
83 |   }
84 | }
85 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/app/ConvertGeoTextToJSON.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.app
 2 | 
 3 | import com.codahale.jerkson.Json._
 4 | 
 5 | object ConvertGeoTextToJSON extends App {
 6 |   for(line <- scala.io.Source.fromFile(args(0), "ISO-8859-1").getLines) {
 7 |     val tokens = line.split("\t")
 8 |     println(generate(new tweet(tokens(3).toDouble, tokens(4).toDouble, tokens(5))))
 9 |   }
10 | }
11 | 
12 | case class tweet(val lat:Double, val lon:Double, val text:String)
13 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/app/CorpusErrorAnalyzer.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.app
 2 | 
 3 | import java.io._
 4 | import java.util.zip._
 5 | 
 6 | import opennlp.fieldspring.tr.util._
 7 | import opennlp.fieldspring.tr.topo._
 8 | import opennlp.fieldspring.tr.text._
 9 | import opennlp.fieldspring.tr.text.prep._
10 | import opennlp.fieldspring.tr.topo.gaz._
11 | import opennlp.fieldspring.tr.text.io._
12 | 
13 | import scala.collection.JavaConversions._
14 | 
15 | object CorpusErrorAnalyzer extends BaseApp {
16 |   
17 |   def main(args:Array[String]) {
18 |     initializeOptionsFromCommandLine(args)
19 | 
20 |     val corpus = TopoUtil.readStoredCorpusFromSerialized(getSerializedCorpusInputPath)
21 | 
22 |     for(doc <- corpus) {
23 |       for(sent <- doc) {
24 |         for(toponym <- sent.getToponyms.filter(_.getAmbiguity > 0)) {
25 |           
26 |         }
27 |       }
28 |     }
29 |   }
30 | }
31 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/app/FilterGeotaggedWiki.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.app
 2 | 
 3 | import java.io._
 4 | import java.util.zip._
 5 | 
 6 | import opennlp.fieldspring.tr.util._
 7 | import opennlp.fieldspring.tr.topo._
 8 | import opennlp.fieldspring.tr.topo.gaz._
 9 | import opennlp.fieldspring.tr.text._
10 | import opennlp.fieldspring.tr.text.prep._
11 | import opennlp.fieldspring.tr.text.io._
12 | 
13 | import scala.collection.JavaConversions._
14 | 
15 | import org.apache.commons.compress.compressors.bzip2._
16 | import org.clapper.argot._
17 | import ArgotConverters._
18 | 
19 | object FilterGeotaggedWiki extends App {
20 |   val parser = new ArgotParser("fieldspring run opennlp.fieldspring.tr.app.FilterGeotaggedWiki", preUsage = Some("Fieldspring"))
21 | 
22 |   val wikiTextInputFile = parser.option[String](List("w", "wiki"), "wiki", "wiki text input file")
23 |   val wikiCorpusInputFile = parser.option[String](List("c", "corpus"), "corpus", "wiki corpus input file")
24 | 
25 |   try {
26 |     parser.parse(args)
27 |   }
28 |   catch {
29 |     case e: ArgotUsageException => println(e.message); sys.exit(0)
30 |   }
31 | 
32 |   val ids = new collection.mutable.HashSet[String]
33 | 
34 |   val fis = new FileInputStream(wikiCorpusInputFile.value.get)
35 |   fis.read; fis.read
36 |   val cbzis = new BZip2CompressorInputStream(fis)
37 |   val in = new BufferedReader(new InputStreamReader(cbzis))
38 |   var curLine = in.readLine
39 |   while(curLine != null) {
40 |     ids += curLine.split("\t")(0)
41 |     curLine = in.readLine
42 |   }
43 |   in.close
44 | 
45 |   val wikiTextCorpus = Corpus.createStreamCorpus
46 | 
47 |   wikiTextCorpus.addSource(new WikiTextSource(new BufferedReader(new FileReader(wikiTextInputFile.value.get))))
48 |   wikiTextCorpus.setFormat(BaseApp.CORPUS_FORMAT.WIKITEXT)
49 | 
50 |   for(doc <- wikiTextCorpus) {
51 |     if(ids contains doc.getId) {
52 |       println("Article title: " + doc.title)
53 |       println("Article ID: " + doc.getId)
54 |       for(sent <- doc) {
55 |         for(token <- sent) {
56 |           println(token.getOrigForm)
57 |         }
58 |       }
59 |     }
60 |     else {
61 |       for(sent <- doc) { for(token <- sent) {} }
62 |     }
63 |   }
64 | }
65 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/app/GazEntryKMLPlotter.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.app
 2 | 
 3 | import java.io._
 4 | import java.util.zip._
 5 | 
 6 | import opennlp.fieldspring.tr.util._
 7 | import opennlp.fieldspring.tr.topo._
 8 | import opennlp.fieldspring.tr.topo.gaz._
 9 | import opennlp.fieldspring.tr.text._
10 | import opennlp.fieldspring.tr.text.prep._
11 | import opennlp.fieldspring.tr.text.io._
12 | 
13 | import scala.collection.JavaConversions._
14 | 
15 | object GazEntryKMLPlotter /*extends BaseApp*/ {
16 |   
17 |   def main(args:Array[String]) {
18 | 
19 |     val toponym = args(0).replaceAll("_", " ")
20 |     //val gaz = println("Reading serialized gazetteer from " + args(1) + " ...")
21 |     val gis = new GZIPInputStream(new FileInputStream(args(1)))
22 |     val ois = new ObjectInputStream(gis)
23 |     val gnGaz = ois.readObject.asInstanceOf[GeoNamesGazetteer]
24 |     gis.close
25 | 
26 |     val entries = gnGaz.lookup(toponym)
27 |     if(entries != null) {
28 |       var loc = entries(0)
29 |       for(entry <- entries)
30 |         if(entry.getRegion.getRepresentatives.size > 1)
31 |           loc = entry
32 |       if(loc != null)
33 |       for(coord <- loc.getRegion.getRepresentatives) {
34 |         println("<Placemark>")
35 |         println("<styleUrl>#My_Style</styleUrl>")
36 |         println("<Point>")
37 |         println("<coordinates>"+coord.getLngDegrees+","+coord.getLatDegrees+",0</coordinates>")
38 |         println("</Point>")
39 |         println("</Placemark>")
40 |       }
41 |     }
42 | 
43 |     /*initializeOptionsFromCommandLine(args)
44 | 
45 |     val corpus = TopoUtil.readStoredCorpusFromSerialized(getSerializedCorpusInputPath)
46 | 
47 |     for(doc <- corpus) {
48 |       for(sent <- doc) {
49 |         for(toponym <- sent.getToponyms.filter(_.getAmbiguity > 0)) {
50 |           
51 |         }
52 |       }
53 |     }*/
54 |   }
55 | }
56 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/app/GeoTextLabelPropDecoder.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.app
 2 | 
 3 | import java.io._
 4 | import java.util._
 5 | 
 6 | import opennlp.fieldspring.tr.text._
 7 | import opennlp.fieldspring.tr.text.io._
 8 | import opennlp.fieldspring.tr.text.prep._
 9 | import opennlp.fieldspring.tr.topo._
10 | import opennlp.fieldspring.tr.app._
11 | import opennlp.fieldspring.tr.util.TopoUtil
12 | 
13 | import scala.collection.JavaConversions._
14 | 
15 | object GeoTextLabelPropDecoder extends BaseApp {
16 | 
17 |   import BaseApp._
18 | 
19 |   def DPC = 1.0
20 | 
21 |   def CELL_ = "cell_"
22 |   def CELL_LABEL_ = "cell_label_"
23 |   //def DOC_ = "doc_"
24 |   def USER_ = "USER_"
25 |   def UNI_ = "uni_"
26 |   def BI_ = "bi_"
27 | 
28 |   def main(args: Array[String]) = {
29 | 
30 |     this.initializeOptionsFromCommandLine(args)
31 |     this.doDecode
32 | 
33 |   }
34 | 
35 |   def doDecode() = {
36 |     checkExists(getSerializedCorpusInputPath)
37 |     checkExists(getGraphInputPath)
38 | 
39 |     val corpus = TopoUtil.readStoredCorpusFromSerialized(getSerializedCorpusInputPath)
40 | 
41 |     val docIdsToCells = new collection.mutable.HashMap[String, Int]
42 | 
43 |     val lines = scala.io.Source.fromFile(getGraphInputPath).getLines
44 | 
45 |     for(line <- lines) {
46 |       val tokens = line.split("\t")
47 | 
48 |       if(tokens.length >= 4 && tokens(0).startsWith(USER_)) {
49 |         val docId = tokens(0)
50 | 
51 |         val innertokens = tokens(3).split(" ")
52 | 
53 |         docIdsToCells.put(docId, findGreatestCell(innertokens))
54 |       }
55 |     }
56 | 
57 |     for(document <- corpus) {
58 |       if(document.isDev || document.isTest) {
59 |         if(docIdsToCells.containsKey(document.getId)) {
60 |           val cellNumber = docIdsToCells(document.getId)
61 |           if(cellNumber != -1) {
62 |             val lat = ((cellNumber / 1000) * DPC) + DPC/2.0
63 |             val lon = ((cellNumber % 1000) * DPC) + DPC/2.0
64 |             document.setSystemCoord(Coordinate.fromDegrees(lat, lon))
65 |           }
66 |         }
67 |       }
68 |     }
69 | 
70 |     val eval = new EvaluateCorpus
71 |     eval.doEval(corpus, corpus, CORPUS_FORMAT.GEOTEXT, true)
72 |   }
73 | 
74 |   def findGreatestCell(innertokens: Array[String]): Int = {
75 |     
76 |     for(innertoken <- innertokens) {
77 |       if(innertoken.startsWith(CELL_LABEL_)) {
78 |         return innertoken.substring(CELL_LABEL_.length).toInt
79 |       }
80 |     }
81 |     
82 |     return -1
83 |   }
84 | 
85 | }
86 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/app/Preprocess.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.app
17 | 
18 | import java.io._
19 | 
20 | import opennlp.fieldspring.tr.topo.gaz._
21 | import opennlp.fieldspring.tr.text._
22 | import opennlp.fieldspring.tr.text.io._
23 | import opennlp.fieldspring.tr.text.prep._
24 | import opennlp.fieldspring.tr.util.Constants
25 | 
26 | object Preprocess extends App {
27 |   override def main(args: Array[String]) {
28 |     val divider = new OpenNLPSentenceDivider
29 |     val tokenizer = new OpenNLPTokenizer
30 |     val recognizer = new OpenNLPRecognizer
31 |     val gazetteer = new InMemoryGazetteer
32 | 
33 |     gazetteer.load(new WorldReader(new File(
34 |       Constants.getGazetteersDir() + File.separator + "dataen-fixed.txt.gz"
35 |     )))
36 | 
37 |     val corpus = Corpus.createStreamCorpus
38 | 
39 |     val in = new BufferedReader(new FileReader(args(0)))
40 |     corpus.addSource(
41 |      new ToponymAnnotator(new PlainTextSource(in, divider, tokenizer, args(0)),
42 |      recognizer, gazetteer
43 |     ))
44 | 
45 |     val writer = new CorpusXMLWriter(corpus)
46 |     writer.write(new File(args(1)))
47 |   }
48 | }
49 | 
50 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/app/ReprocessTrApp.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.app
17 | 
18 | import java.io._
19 | 
20 | import opennlp.fieldspring.tr.eval._
21 | import opennlp.fieldspring.tr.resolver._
22 | import opennlp.fieldspring.tr.topo.gaz._
23 | import opennlp.fieldspring.tr.text._
24 | import opennlp.fieldspring.tr.text.io._
25 | import opennlp.fieldspring.tr.text.prep._
26 | import opennlp.fieldspring.tr.util.Constants
27 | 
28 | object ReprocessTrApp {
29 |   def main(args: Array[String]) {
30 |     val tokenizer = new OpenNLPTokenizer
31 |     val recognizer = new OpenNLPRecognizer
32 | 
33 |     val gazetteer = new InMemoryGazetteer
34 |     gazetteer.load(new WorldReader(new File(
35 |       Constants.getGazetteersDir() + File.separator + "dataen-fixed.txt.gz"
36 |     )))
37 | 
38 |     val corpus = Corpus.createStreamCorpus
39 |     val source = new TrXMLDirSource(new File(args(0)), tokenizer)
40 |     val stripped = new ToponymRemover(source)
41 |     corpus.addSource(new ToponymAnnotator(stripped, recognizer, gazetteer))
42 | 
43 |     val writer = new CorpusXMLWriter(corpus)
44 |     writer.write(new File(args(1)))
45 |   }
46 | }
47 | 
48 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/app/SplitDevTest.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.app
 2 | 
 3 | import java.io._
 4 | 
 5 | object SplitDevTest extends App {
 6 |   val dir = new File(args(0))
 7 | 
 8 |   val devDir = new File(dir.getCanonicalPath+"dev")
 9 |   val testDir = new File(dir.getCanonicalPath+"test")
10 |   devDir.mkdir
11 |   testDir.mkdir
12 | 
13 |   val files = dir.listFiles
14 | 
15 |   var i = 1
16 |   for(file <- files) {
17 |     if(i % 3 == 0)
18 |       file.renameTo(new File(testDir, file.getName))
19 |     else
20 |       file.renameTo(new File(devDir, file.getName))
21 |     i += 1
22 |   }
23 | }
24 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/app/TrainingDirectoriesCombiner.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.app
 2 | 
 3 | import java.io._
 4 | 
 5 | object TrainingDirectoriesCombiner extends App {
 6 |   val inDir1 = new File(args(0))
 7 |   val inDir2 = new File(args(1))
 8 |   val outDir = new File(args(2))
 9 | 
10 |   if(!outDir.exists)
11 |     outDir.mkdir
12 | 
13 |   // First clear the source directory:
14 |   for(file <- outDir.listFiles)
15 |     file.delete
16 | 
17 |   lineByLineCopy(inDir1, outDir)
18 |   lineByLineCopy(inDir2, outDir)
19 | 
20 |   def lineByLineCopy(inDir:File, outDir:File) {
21 |     for(file <- inDir.listFiles.filter(_.getName.endsWith(".txt"))) {
22 |       val in = new BufferedReader(new FileReader(file))
23 |       val out = new BufferedWriter(new FileWriter(outDir.getCanonicalPath+"/"+file.getName, true))
24 |       println(inDir.getCanonicalPath+"/"+file.getName+" >> "+outDir.getCanonicalPath+"/"+file.getName)
25 |       var line = "i"
26 |       while(line != null) {
27 |         try {
28 |           line = in.readLine
29 |         } catch {
30 |           case e: java.nio.charset.MalformedInputException => line = "E"
31 |         }
32 |         if(line != null && line.size > 1)
33 |           out.write(line+"\n")
34 |       }
35 |       out.close
36 |       in.close
37 |     }
38 |   }
39 | }
40 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/model/AltBasicMinDistModel.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.resolver
17 | 
18 | import scala.collection.JavaConversions._
19 | import opennlp.fieldspring.tr.text._
20 | 
21 | 
22 | class AltBasicMinDistResolver extends Resolver {
23 |   def disambiguate(corpus: StoredCorpus): StoredCorpus = {
24 | 
25 |     /* Iterate over documents. */
26 |     corpus.foreach { document =>
27 | 
28 |       /* Collect a list of toponyms with candidates for each document. */
29 |       val toponyms = document.flatMap(_.getToponyms).filter(_.getAmbiguity > 0).toList
30 | 
31 |       /* For each toponym, pick the best candidate. */
32 |       toponyms.foreach { toponym =>
33 | 
34 |         /* Compute all valid totals with indices. */
35 |         toponym.zipWithIndex.flatMap { case (candidate, idx) =>
36 |           toponyms.filterNot(_ == toponym) match {
37 |             case Nil => None
38 |             case ts  => Some(ts.map(_.map(_.distance(candidate)).min).sum, idx)
39 |           }
40 |         } match {
41 |           case Nil => ()
42 |           case xs  => toponym.setSelectedIdx(xs.min._2)
43 |         }
44 |       }
45 |     }
46 | 
47 |     return corpus
48 |   }
49 | }
50 | 
51 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/resolver/DocDistResolver.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.resolver
 2 | 
 3 | import opennlp.fieldspring.tr.text._
 4 | import opennlp.fieldspring.tr.topo._
 5 | import opennlp.fieldspring.tr.util._
 6 | 
 7 | import scala.collection.JavaConversions._
 8 | 
 9 | class DocDistResolver(val logFilePath:String) extends Resolver {
10 | 
11 |   def disambiguate(corpus:StoredCorpus): StoredCorpus = {
12 | 
13 |     val predDocLocations = (for(pe <- LogUtil.parseLogFile(logFilePath)) yield {
14 |       (pe.docName, pe.predCoord)
15 |     }).toMap
16 | 
17 |     for(doc <- corpus) {
18 |       for(sent <- doc) {
19 |         for(toponym <- sent.getToponyms.filter(_.getAmbiguity > 0)) {
20 |           if(overwriteSelecteds || !toponym.hasSelected) {
21 |             val predDocLocation = predDocLocations.getOrElse(doc.getId, null)
22 |             if(predDocLocation != null) {
23 |               val indexToSelect = toponym.getCandidates.zipWithIndex.minBy(
24 |                 p => p._1.getRegion.distance(predDocLocation))._2
25 |               if(indexToSelect != -1) {
26 |                 toponym.setSelectedIdx(indexToSelect)
27 |               }
28 |             }
29 |           }
30 |         }
31 |       }
32 |     }
33 | 
34 |     corpus
35 |   }
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/resolver/PopulationResolver.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.resolver
 2 | 
 3 | import opennlp.fieldspring.tr.text._
 4 | import opennlp.fieldspring.tr.topo._
 5 | import opennlp.fieldspring.tr.util._
 6 | 
 7 | import scala.collection.JavaConversions._
 8 | 
 9 | class PopulationResolver extends Resolver {
10 | 
11 |   def disambiguate(corpus:StoredCorpus): StoredCorpus = {
12 | 
13 |     val rand = new scala.util.Random
14 | 
15 |     for(doc <- corpus) {
16 |       for(sent <- doc) {
17 |         for(toponym <- sent.getToponyms.filter(_.getAmbiguity > 0)) {
18 |           val maxPopLocPair = toponym.getCandidates.zipWithIndex.maxBy(_._1.getPopulation)
19 |           if(maxPopLocPair._1.getPopulation > 0)
20 |             toponym.setSelectedIdx(maxPopLocPair._2)
21 |           else
22 |             toponym.setSelectedIdx(rand.nextInt(toponym.getAmbiguity)) // back off to random
23 |         }
24 |       }
25 |     }
26 | 
27 |     corpus
28 |   }
29 | }
30 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/resolver/TPPResolver.scala:
--------------------------------------------------------------------------------
1 | package opennlp.fieldspring.tr.resolver
2 | 
3 | import opennlp.fieldspring.tr.tpp._
4 | 
5 | abstract class TPPResolver(val tppInstance:TPPInstance) extends Resolver {
6 |   
7 | }
8 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/resolver/ToponymAsDocDistResolver.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.resolver
 2 | 
 3 | import opennlp.fieldspring.tr.text._
 4 | import opennlp.fieldspring.tr.topo._
 5 | import opennlp.fieldspring.tr.util._
 6 | 
 7 | import scala.collection.JavaConversions._
 8 | 
 9 | class ToponymAsDocDistResolver(val logFilePath:String) extends Resolver {
10 | 
11 |   val docTokRE = """(.+)_([0-9]+)""".r
12 |   val alphanumRE = """^[a-zA-Z0-9]+$""".r
13 | 
14 |   def disambiguate(corpus:StoredCorpus): StoredCorpus = {
15 | 
16 |     val predLocations = (for(pe <- LogUtil.parseLogFile(logFilePath)) yield {
17 |       val docTokRE(docName, tokenIndex) = pe.docName
18 |       ((docName, tokenIndex.toInt), pe.predCoord)
19 |     }).toMap
20 | 
21 |     for(doc <- corpus) {
22 |       var tokenIndex = 0
23 |       for(sent <- doc) {
24 |         for(token <- sent.filter(t => alphanumRE.findFirstIn(t.getForm) != None)) {
25 |           if(token.isToponym && token.asInstanceOf[Toponym].getAmbiguity > 0) {
26 |             val toponym = token.asInstanceOf[Toponym]
27 |             val predLocation = predLocations.getOrElse((doc.getId, tokenIndex), null)
28 |             if(predLocation != null) {
29 |               val indexToSelect = toponym.getCandidates.zipWithIndex.minBy(p => p._1.getRegion.distance(predLocation))._2
30 |               if(indexToSelect != -1) {
31 |                 toponym.setSelectedIdx(indexToSelect)
32 |               }
33 |             }
34 |           }
35 |           tokenIndex += 1
36 |         }
37 |       }
38 |     }
39 | 
40 |     corpus
41 |   }
42 | }
43 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/text/io/DynamicKMLWriter.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.text.io
 2 | 
 3 | import java.io._
 4 | import java.util._
 5 | import javax.xml.datatype._
 6 | import javax.xml.stream._
 7 | 
 8 | import opennlp.fieldspring.tr.text._
 9 | import opennlp.fieldspring.tr.topo._
10 | import opennlp.fieldspring.tr.util._
11 | 
12 | import scala.collection.JavaConversions._
13 | 
14 | class DynamicKMLWriter(val corpus:StoredCorpus/*,
15 |                        val outputGoldLocations:Boolean = false*/) {
16 | 
17 |   lazy val factory = XMLOutputFactory.newInstance
18 | 
19 |   val CONTEXT_SIZE = 20
20 | 
21 |   def write(out:XMLStreamWriter) {
22 |     KMLUtil.writeHeader(out, "corpus")
23 | 
24 |     var globalTokIndex = 0
25 |     var globalTopIndex = 1
26 |     for(doc <- corpus) {
27 |       val docArray = TextUtil.getDocAsArray(doc)
28 |       var tokIndex = 0
29 |       for(token <- docArray) {
30 |         if(token.isToponym) {
31 |           val toponym = token.asInstanceOf[Toponym]
32 |           if(toponym.getAmbiguity > 0 && toponym.hasSelected) {
33 |             val coord = toponym.getSelected.getRegion.getCenter
34 |             val context = TextUtil.getContext(docArray, tokIndex, CONTEXT_SIZE)
35 |             KMLUtil.writePinTimeStampPlacemark(out, toponym.getOrigForm, coord, context, globalTopIndex)
36 |             globalTopIndex += 1
37 |           }
38 |         }
39 |         tokIndex += 1
40 |         globalTokIndex += 1
41 |       }
42 |     }
43 | 
44 |     KMLUtil.writeFooter(out)
45 |     out.close
46 |   }
47 | 
48 |   def write(file:File) {
49 |     val stream = new BufferedOutputStream(new FileOutputStream(file))
50 |     this.write(this.factory.createXMLStreamWriter(stream, "UTF-8"))
51 |     stream.close()
52 |   }
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/text/io/GigawordSource.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.text.io
17 | 
18 | import java.io.BufferedReader
19 | import java.io.File
20 | import java.io.FileReader
21 | import java.util.ArrayList
22 | import java.util.List
23 | import scala.collection.JavaConversions._
24 | import scala.collection.mutable.Buffer
25 | 
26 | import opennlp.fieldspring.tr.text._
27 | import opennlp.fieldspring.tr.text.prep._
28 | 
29 | class GigawordSource(
30 |   reader: BufferedReader,
31 |   private val sentencesPerDocument: Int,
32 |   private val numberOfDocuments: Int)
33 |   extends TextSource(reader) {
34 | 
35 |   def this(reader: BufferedReader, sentencesPerDocument: Int) =
36 |     this(reader, sentencesPerDocument, Int.MaxValue)
37 |   def this(reader: BufferedReader) = this(reader, 50)
38 | 
39 |   val sentences = new Iterator[Sentence[Token]] {
40 |     var current = GigawordSource.this.readLine
41 |     def hasNext: Boolean = current != null
42 |     def next: Sentence[Token] = new Sentence[Token](null) {
43 |       val buffer = Buffer(new SimpleToken(current))
44 |       current = GigawordSource.this.readLine
45 |       while (current.trim.length > 0) {
46 |         buffer += new SimpleToken(current)
47 |         current = GigawordSource.this.readLine
48 |       }
49 |       current = GigawordSource.this.readLine
50 | 
51 |       def tokens: java.util.Iterator[Token] = buffer.toIterator
52 |     }
53 |   }.grouped(sentencesPerDocument).take(numberOfDocuments)
54 | 
55 |   def hasNext: Boolean = sentences.hasNext
56 | 
57 |   def next: Document[Token] = new Document[Token](null) {
58 |     def iterator: java.util.Iterator[Sentence[Token]] =
59 |       sentences.next.toIterator
60 |   }
61 | }
62 | 
63 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/text/io/WikiTextSource.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.text.io
 2 | 
 3 | import java.io._
 4 | 
 5 | import scala.collection.JavaConversions._
 6 | import scala.collection.mutable.Buffer
 7 | import opennlp.fieldspring.tr.text._
 8 | import opennlp.fieldspring.tr.text.prep._
 9 | 
10 | class WikiTextSource(
11 |   reader: BufferedReader
12 | ) extends TextSource(reader) {
13 | 
14 |   val TITLE_PREFIX = "Article title: "
15 |   val TITLE_INDEX = TITLE_PREFIX.length
16 |   val ID_INDEX = "Article ID: ".length
17 | 
18 |   var id = "-1"
19 |   var title = ""
20 | 
21 |   val sentences = new Iterator[Sentence[Token]] {
22 |     var current = WikiTextSource.this.readLine
23 | 
24 |     def hasNext: Boolean = current != null
25 |     def next: Sentence[Token] = new Sentence[Token](null) {
26 |       if(current != null) {
27 |         title = current.drop(TITLE_INDEX).trim
28 |         current = WikiTextSource.this.readLine
29 |         id = current.drop(ID_INDEX).trim
30 |         current = WikiTextSource.this.readLine
31 |       }
32 |       val buffer = Buffer(new SimpleToken(current))
33 |       current = WikiTextSource.this.readLine
34 |       while (current != null && !current.trim.startsWith(TITLE_PREFIX)) {
35 |         buffer += new SimpleToken(current)
36 |         current = WikiTextSource.this.readLine
37 |       }
38 | 
39 |       def tokens: java.util.Iterator[Token] = buffer.toIterator
40 |     }
41 |   }.grouped(1) // assume each document is a whole sentence, since we don't have sentence boundaries
42 | 
43 |   def hasNext: Boolean = sentences.hasNext
44 | 
45 |   def next: Document[Token] = {
46 |     new Document[Token](id, title) {
47 |       def iterator: java.util.Iterator[Sentence[Token]] = {
48 |         sentences.next.toIterator
49 |       }
50 |     }
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/topo/SphericalGeometry.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.topo
17 | 
18 | import scala.io._
19 | 
20 | import scala.collection.JavaConversions._
21 | 
22 | import opennlp.fieldspring.tr.util.cluster._
23 | 
24 | object SphericalGeometry {
25 |   implicit def g: Geometry[Coordinate] = new Geometry[Coordinate] {
26 |     def distance(x: Coordinate)(y: Coordinate): Double = x.distance(y)
27 |     def centroid(ps: Seq[Coordinate]): Coordinate = Coordinate.centroid(ps)
28 |   }
29 | 
30 |   def main(args: Array[String]) {
31 |     val max = args(1).toInt
32 |     val k = args(2).toInt
33 |     val style = args(3)
34 | 
35 |     val cs = Source.fromFile(args(0)).getLines.map { line =>
36 |       val Array(lat, lng) = line.split("\t").map(_.toDouble)
37 |       Coordinate.fromDegrees(lat, lng)
38 |     }.toIndexedSeq
39 |     println("Loaded...")
40 | 
41 |     val xs = scala.util.Random.shuffle(cs).take(max)
42 | 
43 |     println(Coordinate.centroid(xs))
44 | 
45 |     val clusterer = new KMeans
46 |     val clusters = clusterer.cluster(xs, k)
47 |     clusters.foreach {
48 |       case c => println("<Placemark><styleUrl>" +
49 |                 style + "</styleUrl><Point><coordinates>" +
50 |                 c.getLngDegrees + "," + c.getLatDegrees +
51 |                 "</coordinates></Point></Placemark>")
52 |     }
53 |   }
54 | }
55 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/topo/gaz/CorpusGazetteerReader.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.topo.gaz
17 | 
18 | import java.util.Iterator
19 | import scala.collection.JavaConversions._
20 | 
21 | import opennlp.fieldspring.tr.text.Corpus
22 | import opennlp.fieldspring.tr.text.Token
23 | import opennlp.fieldspring.tr.topo.Location
24 | 
25 | class CorpusGazetteerReader(private val corpus: Corpus[_ <: Token])
26 |   extends GazetteerReader {
27 | 
28 |   private val it = corpus.flatMap(_.flatMap {
29 |     _.getToponyms.flatMap(_.getCandidates)
30 |   }).toIterator
31 | 
32 |   def hasNext: Boolean = it.hasNext
33 |   def next: Location = it.next
34 | 
35 |   def close() {
36 |     corpus.close()
37 |   }
38 | }
39 | 
40 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/topo/gaz/geonames/GeoNamesParser.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.topo.gaz.geonames
17 | 
18 | import java.io._
19 | import scala.collection.JavaConversions._
20 | import scala.io._
21 | 
22 | import opennlp.fieldspring.tr.text.Corpus
23 | import opennlp.fieldspring.tr.text.Token
24 | import opennlp.fieldspring.tr.topo.Location
25 | 
26 | class GeoNamesParser(private val file: File) {
27 |   val locs = scala.collection.mutable.Map[String, List[(Double, Double)]]()
28 | 
29 |   Source.fromFile(file).getLines.foreach { line =>
30 |     val Array(lat, lng) = line.split("\t").map(_.toDouble)
31 |     
32 |   }
33 | }
34 | 
35 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/topo/util/CodeConverter.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.topo.util
17 | 
18 | import java.io._
19 | import java.io.InputStream
20 | import scala.collection.mutable.Map
21 | 
22 | class CodeConverter(in: InputStream) {
23 |   def this() = this {
24 |     getClass.getResourceAsStream("/data/geo/country-codes.txt")
25 |   }
26 | 
27 |   case class Country(
28 |     name: String,
29 |     fips: Option[String],
30 |     iso:  Option[(String, String, Int)],
31 |     stanag: Option[String],
32 |     tld: Option[String])
33 | 
34 |   private val countriesF = Map[String, Country]()
35 |   private val countriesI = Map[String, Country]()
36 |   private val reader = new BufferedReader(new InputStreamReader(in))
37 | 
38 |   private var line = reader.readLine
39 |   while (line != null) {
40 |     val fs = line.split("\t")
41 |     val country = Country(
42 |       fs(0),
43 |       if (fs(1) == "-") None else Some(fs(1)),
44 |       if (fs(2) == "-") None else Some(fs(2), fs(3), fs(4).toInt),
45 |       if (fs(5) == "-") None else Some(fs(5)),
46 |       if (fs(6) == "-") None else Some(fs(6))
47 |     )
48 |     country.fips match {
49 |       case Some(fips) => countriesF(fips) = country
50 |       case _ =>
51 |     }
52 |     country.iso match {
53 |       case Some((iso2, _, _)) => countriesI(iso2) = country
54 |       case _ =>
55 |     }
56 |     line = reader.readLine
57 |   }
58 |   reader.close()
59 | 
60 |   def convertFipsToIso2(code: String): Option[String] =
61 |     countriesF.get(code).flatMap(_.iso.map(_._1))
62 | 
63 |   def convertIso2ToFips(code: String): Option[String] =
64 |     countriesI.get(code).flatMap(_.fips)
65 | }
66 | 
67 | object CodeConverter {
68 |   def main(args: Array[String]) {
69 |     val converter = new CodeConverter()
70 |     println(args(0) match {
71 |       case "f2i" => converter.convertIso2ToFips(args(1))
72 |       case "i2f" => converter.convertFipsToIso2(args(1))
73 |     })
74 |   }
75 | }
76 | 
77 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/tpp/FileTravelCoster.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.tpp
 2 | 
 3 | import opennlp.fieldspring.tr.util._
 4 | import opennlp.fieldspring.tr.text._
 5 | 
 6 | import java.io._
 7 | import java.util.ArrayList
 8 | import scala.collection.JavaConversions._
 9 | 
10 | class FileTravelCoster(val inputFile:String, val corpus:StoredCorpus, val dpc:Double) extends TravelCoster {
11 | 
12 |   val gaussianTC = new GaussianTravelCoster
13 | 
14 |   val relevantMarkets = new scala.collection.mutable.HashSet[Int]
15 | 
16 |   for(doc <- corpus) {
17 |     for(sent <- doc) {
18 |       for(toponym <- sent.getToponyms) {
19 |         for(loc <- toponym.getCandidates) {
20 |           //for(coord <- loc.getRegion.getRepresentatives) {
21 |           relevantMarkets.add(TopoUtil.getCellNumber(loc.getRegion.getCenter, dpc))
22 |           //}
23 |         }
24 |       }
25 |     }
26 |   }
27 | 
28 |   val probs = new scala.collection.mutable.HashMap[Int, scala.collection.mutable.HashMap[Int, Double]]
29 |   val costs = new scala.collection.mutable.HashMap[(Int, Int), Double]
30 | 
31 |   val in = new DataInputStream(new FileInputStream(inputFile))
32 | 
33 |   try {
34 |     while(true) {
35 |       val id1 = in.readInt
36 |       val id2 = in.readInt
37 |       val prob = in.readDouble
38 | 
39 |       if(relevantMarkets.contains(id1) && relevantMarkets.contains(id2)) {
40 |         val destinations = probs.getOrElse(id1, new scala.collection.mutable.HashMap[Int, Double])
41 |         destinations.put(id2, prob)
42 |         probs.put(id1, destinations)
43 |       }
44 |     }
45 |   } catch {
46 |     case e:Exception => 
47 |   }
48 | 
49 |   in.close
50 | 
51 |   for((id1, destinations) <- probs) {
52 |     val total = destinations.map(_._2).sum
53 |     for((id2, cost) <- destinations) {
54 |       costs.put((id1, id2), 1.0-cost/total)
55 |     }
56 |   }
57 | 
58 |   probs.clear
59 | 
60 |   println("Read "+costs.size+" relevant probabilities.")
61 | 
62 |   def apply(m1:Market, m2:Market): Double = {
63 |     //if(costs.contains((m1.id, m2.id))) println("Returned cost of "+costs((m1.id, m2.id))+" from file.")
64 |     //else println("Return default cost of "+gaussianTC(m1, m2))
65 |     costs.getOrElse((m1.id, m2.id), gaussianTC(m1, m2))
66 |   }
67 | }
68 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/tpp/GaussianPurchaseCoster.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.tpp
 2 | 
 3 | import opennlp.fieldspring.tr.topo._
 4 | 
 5 | class GaussianPurchaseCoster extends PurchaseCoster {
 6 | 
 7 |   val VARIANCE_KM = 161.0
 8 |   val variance = VARIANCE_KM / 6372.8
 9 | 
10 |   def g(x:Double, y:Double) = GaussianUtil.g(x,y)
11 | 
12 |   //val maxHeight = g(0.0,0.0)
13 | 
14 |   val storedCosts = new scala.collection.mutable.HashMap[(Int, Int), Double] // (location.id, market.id) => distance
15 |   def cost(l:Location, m:Market): Double = {
16 |       val key = (l.getId, m.id)
17 |       if(storedCosts.contains(key))
18 |         storedCosts(key)
19 |       else {
20 |         val cost = 1.0-g(l.getRegion.distance(m.centroid)/variance, 0)///max
21 |         //val cost = (maxHeight-g(l.getRegion.distance(m.centroid)/variance, 0))/maxHeight///max
22 |         storedCosts.put(key, cost)
23 |         cost
24 |       }
25 |   }
26 | 
27 |   def apply(m:Market, potLoc:PotentialLocation): Double = {
28 |     cost(potLoc.loc, m)
29 |   }
30 | }
31 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/tpp/GaussianTravelCoster.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.tpp
 2 | 
 3 | import opennlp.fieldspring.tr.topo._
 4 | 
 5 | class GaussianTravelCoster extends TravelCoster {
 6 | 
 7 |   val VARIANCE_KM = 1610.0
 8 |   val variance = VARIANCE_KM / 6372.8
 9 | 
10 |   def g(x:Double, y:Double) = GaussianUtil.g(x,y)
11 | 
12 |   val maxHeight = g(0.0,0.0)
13 | 
14 |   def apply(m1:Market, m2:Market): Double = {
15 |     (maxHeight-g(m1.centroid.distance(m2.centroid)/variance, 0))/maxHeight
16 |   }
17 | 
18 |   /* old implementation:
19 |   def apply(m1:Market, m2:Market): Double = {
20 |     1.0-g(m1.centroid.distance(m2.centroid)/variance, 0)
21 |   }*/
22 | }
23 | 
24 | object GaussianTravelCoster extends App {
25 |   val gtc = new GaussianTravelCoster
26 |   //println((gtc.maxHeight-gtc.g(0,0))/gtc.maxHeight)
27 |   //println((gtc.maxHeight-gtc.g(0.25,0))/gtc.maxHeight)
28 |   println((gtc.maxHeight-gtc.g(0.5,0))/gtc.maxHeight)
29 |   println((gtc.maxHeight-gtc.g(1.0,0))/gtc.maxHeight)
30 |   println((gtc.maxHeight-gtc.g(2.0,0))/gtc.maxHeight)
31 |   //println((gtc.maxHeight-gtc.g(3.0,0))/gtc.maxHeight)
32 | }
33 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/tpp/GaussianUtil.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.tpp
 2 | 
 3 | object GaussianUtil {
 4 |   def left(sig_x:Double, sig_y:Double, rho:Double) = 1.0/(2*math.Pi*sig_x*sig_y*math.pow(1-rho*rho,.5))
 5 | 
 6 |   def right(x:Double, y:Double, mu_x:Double, mu_y:Double, sig_x:Double, sig_y:Double, rho:Double) = math.exp(-1.0/(2*(1-rho*rho))*( math.pow(x-mu_x,2)/math.pow(sig_x,2) + math.pow(y-mu_y,2)/math.pow(sig_y,2) - (2*rho*(x-mu_x)*(y-mu_y))/(sig_x*sig_y)))
 7 | 
 8 |   def f(x:Double, y:Double, mu_x:Double, mu_y:Double, sig_x:Double, sig_y:Double, rho:Double) = left(sig_x,sig_y,rho) * right(x,y,mu_x,mu_y,sig_x,sig_y,rho)
 9 |   
10 |   def g(x:Double,y:Double) = f(x,y,0,0,1,1,0)
11 | }
12 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/tpp/GridMarketCreator.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.tpp
 2 | 
 3 | import opennlp.fieldspring.tr.text._
 4 | import opennlp.fieldspring.tr.util._
 5 | 
 6 | import scala.collection.JavaConversions._
 7 | 
 8 | class GridMarketCreator(doc:Document[StoredToken], val dpc:Double, val purchaseCoster:PurchaseCoster = null) extends MarketCreator(doc) {
 9 |   override def apply:List[Market] = {
10 |     val cellNumsToPotLocs = new scala.collection.mutable.HashMap[Int, scala.collection.mutable.HashMap[ToponymMention, PotentialLocation]]
11 | 
12 |     val docAsArray = TextUtil.getDocAsArrayNoFilter(doc)
13 | 
14 |     var tokIndex = 0
15 |     for(token <- docAsArray) {
16 |       if(token.isToponym && token.asInstanceOf[Toponym].getAmbiguity > 0) {
17 |         val toponym = token.asInstanceOf[Toponym]
18 |         var gazIndex = 0
19 |         for(loc <- toponym.getCandidates) {
20 |           val topMen = new ToponymMention(doc.getId, tokIndex)
21 |           val potLoc = new PotentialLocation(doc.getId, tokIndex, gazIndex, loc)
22 |           
23 |           val cellNums = TopoUtil.getCellNumbers(loc, dpc)
24 |           for(cellNum <- cellNums) {
25 |             val potLocs = cellNumsToPotLocs.getOrElse(cellNum, new scala.collection.mutable.HashMap[ToponymMention, PotentialLocation])
26 |             val curPotLoc = potLocs.getOrElse(topMen, null)
27 | 
28 |             //if(purchaseCoster == null)
29 |             //  println("NULL!")
30 | 
31 |             /*if(curPotLoc != null) {
32 |               println("\n"+purchaseCoster(null, curPotLoc))
33 |               println(purchaseCoster(null, potLoc))
34 |             }*/
35 | 
36 |             if(purchaseCoster == null || curPotLoc == null || purchaseCoster(null, potLoc) < purchaseCoster(null, curPotLoc)) {
37 |               potLocs.put(topMen, potLoc)
38 |               cellNumsToPotLocs.put(cellNum, potLocs)
39 |             }
40 |             /*else if(purchaseCoster != null && curPotLoc != null && purchaseCoster(null, potLoc) > purchaseCoster(null, curPotLoc)) {
41 |               println("Market "+cellNum+" already had a "+potLoc+" with cost "+purchaseCoster(null, curPotLoc)+", which is cheaper than "+purchaseCoster(null, potLoc))
42 |             }*/
43 |           }
44 |           gazIndex += 1
45 |         }
46 |       }
47 |       tokIndex += 1
48 |     }
49 |     
50 |     (for(p <- cellNumsToPotLocs) yield {
51 |       new Market(p._1, p._2.toMap)
52 |     }).toList
53 |   }
54 | }
55 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/tpp/MarketCreator.scala:
--------------------------------------------------------------------------------
1 | package opennlp.fieldspring.tr.tpp
2 | 
3 | import opennlp.fieldspring.tr.text._
4 | 
5 | abstract class MarketCreator(val doc:Document[StoredToken]) {
6 |   def apply:List[Market]
7 | }
8 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/tpp/MaxentPurchaseCoster.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.tpp
 2 | 
 3 | import opennlp.fieldspring.tr.text._
 4 | import opennlp.fieldspring.tr.util._
 5 | import opennlp.fieldspring.tr.resolver._
 6 | import opennlp.maxent._
 7 | import opennlp.maxent.io._
 8 | import opennlp.model._
 9 | 
10 | import java.io._
11 | 
12 | import scala.collection.JavaConversions._
13 | 
14 | class MaxentPurchaseCoster(corpus:StoredCorpus, modelDirPath:String) extends PurchaseCoster {
15 | 
16 |   val windowSize = 20
17 | 
18 |   val modelDir = new File(modelDirPath)
19 | 
20 |   val toponymsToModels:Map[String, AbstractModel] =
21 |   (for(file <- modelDir.listFiles.filter(_.getName.endsWith(".mxm"))) yield {
22 |     val dataInputStream = new DataInputStream(new FileInputStream(file));
23 |     val reader = new BinaryGISModelReader(dataInputStream)
24 |     val model = reader.getModel
25 |     
26 |     (file.getName.dropRight(4).replaceAll("_", " "), model)
27 |   }).toMap
28 | 
29 |   val potLocsToCosts = new scala.collection.mutable.HashMap[PotentialLocation, Double]
30 | 
31 |   for(doc <- corpus) {
32 |     val docAsArray = TextUtil.getDocAsArrayNoFilter(doc)
33 |     var tokIndex = 0
34 |       for(token <- docAsArray) {
35 |         if(token.isToponym && token.asInstanceOf[Toponym].getAmbiguity > 0
36 |            && toponymsToModels.containsKey(token.getForm)) {
37 |           val toponym = token.asInstanceOf[Toponym]
38 |           val contextFeatures = TextUtil.getContextFeatures(docAsArray, tokIndex, windowSize, Set[String]())
39 | 
40 |           val indexToWeightMap = MaxentResolver.getIndexToWeightMap(toponymsToModels(token.getForm), contextFeatures)
41 |           //contextFeatures.foreach(f => print(f+",")); println
42 |           for((gazIndex, weight) <- indexToWeightMap.toList.sortBy(_._1)) {
43 |             val loc = toponym.getCandidates.get(gazIndex)
44 |             val potLoc = new PotentialLocation(doc.getId, tokIndex, gazIndex, loc)
45 |             //println(" "+gazIndex+": "+(1.0-weight))
46 |             potLocsToCosts.put(potLoc, 1.0-weight) // Here's where the cost is defined in terms of the probability mass
47 |           }
48 |           
49 |         }
50 |         tokIndex += 1
51 |       }
52 |   }
53 |   
54 |   def apply(m:Market, potLoc:PotentialLocation): Double = {
55 |     //if(m.locations.map(_._2).toSet.contains(potLoc)) {
56 |       1.0//potLocsToCosts.getOrElse(potLoc, 1.0) // Not sure what the default cost should be
57 |     //}
58 |     //else
59 |     //  Double.PositiveInfinity
60 |   }
61 | }
62 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/tpp/MultiPurchaseCoster.scala:
--------------------------------------------------------------------------------
1 | package opennlp.fieldspring.tr.tpp
2 | 
3 | class MultiPurchaseCoster(val purchaseCosters:List[PurchaseCoster]) extends PurchaseCoster {
4 | 
5 |   def apply(m:Market, potLoc:PotentialLocation): Double = {
6 |     purchaseCosters.map(pc => pc(m, potLoc)).reduce(_*_)
7 |   }
8 | }
9 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/tpp/PurchaseCoster.scala:
--------------------------------------------------------------------------------
1 | package opennlp.fieldspring.tr.tpp
2 | 
3 | abstract class PurchaseCoster {
4 | 
5 |   def apply(m:Market, potLoc:PotentialLocation): Double
6 | }
7 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/tpp/SimpleContainmentPurchaseCoster.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.tpp
 2 | 
 3 | class SimpleContainmentPurchaseCoster extends PurchaseCoster {
 4 | 
 5 |   def apply(m:Market, potLoc:PotentialLocation): Double = {
 6 |     if(m.locations.map(_._2).toSet.contains(potLoc))
 7 |       1.0
 8 |     else
 9 |       Double.PositiveInfinity
10 |   }
11 | }
12 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/tpp/SimpleDistanceTravelCoster.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.tpp
 2 | 
 3 | import opennlp.fieldspring.tr.util._
 4 | 
 5 | class SimpleDistanceTravelCoster extends TravelCoster {
 6 | 
 7 |   val storedDistances = new scala.collection.mutable.HashMap[(Int, Int), Double]
 8 |   val distanceTable = new DistanceTable
 9 | 
10 |   def apply(m1:Market, m2:Market): Double = {
11 | 
12 |     if(storedDistances.contains((m1.id, m2.id))) {
13 |       //println(storedDistances((m1.id, m2.id)))
14 |       storedDistances((m1.id, m2.id))
15 |     }
16 | 
17 |     else {
18 |       var minDist = Double.PositiveInfinity
19 |       for(loc1 <- m1.locations.map(_._2).map(_.loc)) {
20 |         for(loc2 <- m2.locations.map(_._2).map(_.loc)) {
21 |           val dist = distanceTable.distance(loc1, loc2)
22 |           if(dist < minDist)
23 |             minDist = dist
24 |         }
25 |       }
26 |       
27 |       storedDistances.put((m1.id, m2.id), minDist)
28 |       //println(minDist)
29 |       minDist
30 |     }
31 |   }
32 | }
33 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/tpp/TPPInstance.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.tpp
 2 | 
 3 | import opennlp.fieldspring.tr.topo._
 4 | 
 5 | class TPPInstance(val purchaseCoster:PurchaseCoster,
 6 |                   val travelCoster:TravelCoster) {
 7 | 
 8 |   var markets:List[Market] = null  
 9 | 
10 |   def computeTourCost(tour:List[MarketVisit]): Double = {
11 |     if(tour == null)
12 |       Double.PositiveInfinity
13 |     else {
14 |       var cost = 0.0
15 |       var prevMV:MarketVisit = null
16 |       for(mv <- tour) {
17 |         if(prevMV != null)
18 |           cost += travelCoster(prevMV.market, mv.market)
19 |         
20 |         for((topMen, potLoc) <- mv.purchasedLocations)
21 |           cost += purchaseCoster(mv.market, potLoc)
22 |         
23 |         prevMV = mv
24 |       }
25 |       cost
26 |     }
27 |   }
28 | }
29 | 
30 | class Market(val id:Int,
31 |              val locations:Map[ToponymMention, PotentialLocation]) {
32 | 
33 |   def size = locations.size
34 | 
35 |   lazy val centroid: Coordinate = {
36 |     val lat:Double = locations.map(_._2.loc.getRegion.getCenter.getLat).sum/locations.size
37 |     val lng:Double = locations.map(_._2.loc.getRegion.getCenter.getLng).sum/locations.size
38 |     Coordinate.fromRadians(lat, lng)
39 |   }
40 | }
41 | 
42 | class PotentialLocation(val docId:String,
43 |                         val tokenIndex:Int,
44 |                         val gazIndex:Int,
45 |                         val loc:Location) {
46 | 
47 |   override def toString: String = {
48 |     docId+":"+tokenIndex+":"+gazIndex
49 |   }
50 | 
51 |   override def equals(other:Any):Boolean = {
52 |     if(!other.isInstanceOf[PotentialLocation])
53 |       false
54 |     else {
55 |       val o = other.asInstanceOf[PotentialLocation]
56 |       this.docId.equals(o.docId) && this.tokenIndex == o.tokenIndex && this.gazIndex == o.gazIndex && this.loc.equals(o.loc)
57 |     }
58 |   }
59 | 
60 |   val S = 41*41
61 |   val C = S*41
62 | 
63 |   override def hashCode: Int = {
64 |     C * (C + tokenIndex) + S * (S + docId.hashCode) + 41 * (41 * gazIndex) + loc.getId
65 |   }
66 | 
67 | }
68 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/tpp/TravelCoster.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.tpp
 2 | 
 3 | import opennlp.fieldspring.tr.text._
 4 | 
 5 | abstract class TravelCoster {
 6 | 
 7 |   var doc:Document[StoredToken] = null // In case the travel coster needs to know which document it's on
 8 | 
 9 |   def setDoc(doc:Document[StoredToken]) {
10 |     this.doc = doc
11 |   }
12 | 
13 |   def apply(m1:Market, m2:Market): Double
14 | }
15 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/util/Average.scala:
--------------------------------------------------------------------------------
1 | package opennlp.fieldspring.tr.util
2 | 
3 | object Average extends App {
4 |   println(args.map(_.toDouble).sum/args.length)
5 | }
6 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/util/DistanceTable.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.util
 2 | 
 3 | import opennlp.fieldspring.tr.topo._
 4 | 
 5 | class DistanceTable {
 6 | 
 7 |   val storedDistances = new scala.collection.mutable.HashMap[(Int, Int), Double]
 8 |   
 9 |   def distance(l1:Location, l2:Location): Double = {
10 |     var leftLoc = l1
11 |     var rightLoc = l2
12 |     if(l1.getId > l2.getId) {
13 |       leftLoc = l2
14 |       rightLoc = l1
15 |     }
16 | 
17 |     if(leftLoc.getRegion.getRepresentatives.size == 1 && rightLoc.getRegion.getRepresentatives.size == 1) {
18 |       leftLoc.distance(rightLoc)
19 |     }
20 |     else {
21 |       val key = (leftLoc.getId, rightLoc.getId)
22 |       if(storedDistances.contains(key)) {
23 |         storedDistances(key)
24 |       }
25 |       else {
26 |         val dist = leftLoc.distance(rightLoc)
27 |         storedDistances.put(key, dist)
28 |         dist
29 |       }
30 |     }
31 |   }
32 | }
33 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/util/StopwordUtil.scala:
--------------------------------------------------------------------------------
 1 | package opennlp.fieldspring.tr.util
 2 | 
 3 | import java.io._
 4 | 
 5 | object StopwordUtil {
 6 | 
 7 |   def populateStoplist(filename: String): Set[String] = {
 8 |     var stoplist:Set[String] = Set()
 9 |     io.Source.fromFile(filename).getLines.foreach(line => stoplist += line)
10 |     stoplist.toSet()
11 |     stoplist
12 |   }
13 | 
14 | }
15 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/util/cluster/KMeans.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  *  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 |  *
 4 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 5 |  *  you may not use this file except in compliance with the License.
 6 |  *  You may obtain a copy of the License at
 7 |  *
 8 |  *      http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  *  Unless required by applicable law or agreed to in writing, software
11 |  *  distributed under the License is distributed on an "AS IS" BASIS,
12 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  *  See the License for the specific language governing permissions and
14 |  *  limitations under the License.
15 | */
16 | package opennlp.fieldspring.tr.util.cluster
17 | 
18 | 
19 | import java.io._
20 | import scala.math._
21 | import scala.collection.immutable.Vector
22 | import scala.collection.mutable.Buffer
23 | import scala.collection.JavaConversions._
24 | import scala.util.Random
25 | 
26 | trait Geometry[A] {
27 |   def distance(x: A)(y: A): Double
28 |   def centroid(ps: Seq[A]): A
29 | 
30 |   def nearest(cs: Seq[A], p: A): Int =
31 |     cs.map(distance(p)(_)).zipWithIndex.min._2
32 | }
33 | 
34 | trait Clusterer {
35 |   def clusterList[A](ps: java.util.List[A], k: Int)(implicit g: Geometry[A]): java.util.List[A]
36 |   def cluster[A](ps: Seq[A], k: Int)(implicit g: Geometry[A]): Seq[A]
37 | }
38 | 
39 | class KMeans extends Clusterer {
40 |   def clusterList[A](ps: java.util.List[A], k: Int)(implicit g: Geometry[A]): java.util.List[A] = {
41 |     cluster(ps.toIndexedSeq, k)(g)
42 |   }
43 | 
44 |   def cluster[A](ps: Seq[A], k: Int)(implicit g: Geometry[A]): Seq[A] = {
45 |     var ips = ps.toIndexedSeq
46 |     var cs = init(ips, k)
47 |     var as = ps.map(g.nearest(cs, _))
48 |     var done = false
49 |     val clusters = IndexedSeq.fill(k)(Buffer[A]())
50 |     while (!done) {
51 |       clusters.foreach(_.clear)
52 | 
53 |       as.zipWithIndex.foreach { case (i, j) =>
54 |         clusters(i) += ips(j)
55 |       }
56 | 
57 |       cs = clusters.map(g.centroid(_))
58 | 
59 |       val bs = ips.map(g.nearest(cs, _))
60 |       done = as == bs
61 |       as = bs
62 |     }
63 |     cs
64 |   }
65 | 
66 |   def init[A](ps: Seq[A], k: Int): IndexedSeq[A] = {
67 |     (1 to k).map(_ => ps(Random.nextInt(ps.size)))
68 |   }
69 | }
70 | 
71 | object EuclideanGeometry {
72 |   type Point = (Double, Double)
73 | 
74 |   implicit def g = new Geometry[Point] {
75 |     def distance(x: Point)(y: Point): Double =
76 |       sqrt(pow(x._1 - y._1, 2) + pow(x._2 - y._2, 2))
77 | 
78 |     def centroid(ps: Seq[Point]): Point = {
79 |       def pointPlus(x: Point, y: Point) = (x._1 + y._1, x._2 + y._2)
80 |       ps.reduceLeft(pointPlus) match {
81 |         case (a, b) => (a / ps.size, b / ps.size)
82 |       }
83 |     }
84 |   }
85 | }
86 | 
87 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/util/sanity/CandidateCheck.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.tr.util.sanity
17 | 
18 | import java.io._
19 | import scala.collection.JavaConversions._
20 | 
21 | import opennlp.fieldspring.tr.text.Corpus
22 | import opennlp.fieldspring.tr.text.Toponym
23 | import opennlp.fieldspring.tr.text.io.TrXMLDirSource
24 | import opennlp.fieldspring.tr.text.prep.OpenNLPTokenizer
25 | import opennlp.fieldspring.tr.topo.Location
26 | 
27 | object CandidateCheck extends App {
28 |   override def main(args: Array[String]) {
29 |     val tokenizer = new OpenNLPTokenizer
30 |     val corpus = Corpus.createStreamCorpus
31 |     val cands = scala.collection.mutable.Map[java.lang.String, java.util.List[Location]]()
32 | 
33 |     corpus.addSource(new TrXMLDirSource(new File(args(0)), tokenizer))
34 |     corpus.foreach { _.foreach { _.getToponyms.foreach {
35 |       case toponym: Toponym => {
36 |         if (!cands.contains(toponym.getForm)) {
37 |           //println("Doesn't contain: " + toponym.getForm)
38 |           cands(toponym.getForm) = toponym.getCandidates
39 |         } else {
40 |           val prev = cands(toponym.getForm)
41 |           val here = toponym.getCandidates
42 |           //println("Contains: " + toponym.getForm)
43 |           if (prev.size != here.size) {
44 |             println("=====Size error for " + toponym.getForm + ": " + prev.size + " " + here.size)
45 |           } else {
46 |             prev.zip(here).foreach { case (p, h) =>
47 |               println(p.getRegion.getCenter + " " + h.getRegion.getCenter)
48 |               //case (p, h) if p != h => println("=====Mismatch for " + toponym.getForm)
49 |               //case _ => ()
50 |             }
51 |           }
52 |         }
53 |       }
54 |     }}}
55 |   }
56 | }
57 | 
58 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/util/Serializer.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Serializer.scala
 3 | //
 4 | //  Copyright (C) 2011 Ben Wing, The University of Texas at Austin
 5 | //
 6 | //  Licensed under the Apache License, Version 2.0 (the "License");
 7 | //  you may not use this file except in compliance with the License.
 8 | //  You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | //  Unless required by applicable law or agreed to in writing, software
13 | //  distributed under the License is distributed on an "AS IS" BASIS,
14 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | //  See the License for the specific language governing permissions and
16 | //  limitations under the License.
17 | ///////////////////////////////////////////////////////////////////////////////
18 | 
19 | package opennlp.fieldspring.util
20 | 
21 | /** A type class for converting to and from values in serialized form. */
22 | @annotation.implicitNotFound(msg = "No implicit Serializer defined for ${T}.")
23 | trait Serializer[T] {
24 |   def deserialize(foo: String): T
25 |   def serialize(foo: T): String
26 |   /**
27 |    * Validate the serialized form of the string.  Return true if valid,
28 |    * false otherwise.  Can be overridden for efficiency.  By default,
29 |    * simply tries to deserialize, and checks whether an error was thrown.
30 |    */
31 |   def validate_serialized_form(foo: String): Boolean = {
32 |     try {
33 |       deserialize(foo)
34 |     } catch {
35 |       case _ => return false
36 |     }
37 |     return true
38 |   }
39 | }
40 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/util/WikiRelFreqs.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  WikiRelFreqs.scala
 3 | //
 4 | //  Copyright (C) 2012 Mike Speriosu, The University of Texas at Austin
 5 | //
 6 | //  Licensed under the Apache License, Version 2.0 (the "License");
 7 | //  you may not use this file except in compliance with the License.
 8 | //  You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | //  Unless required by applicable law or agreed to in writing, software
13 | //  distributed under the License is distributed on an "AS IS" BASIS,
14 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | //  See the License for the specific language governing permissions and
16 | //  limitations under the License.
17 | ///////////////////////////////////////////////////////////////////////////////
18 | 
19 | package opennlp.fieldspring.util
20 | 
21 | object WikiRelFreqs extends App {
22 | 
23 |   val geoFreqs = getFreqs(args(0))
24 |   val allFreqs = getFreqs(args(1))
25 | 
26 |   val relFreqs = allFreqs.map(p => (p._1, geoFreqs.getOrElse(p._1, 0.0) / p._2)).toList.sortWith((x, y) => if(x._2 != y._2) x._2 > y._2 else x._1 < y._1)
27 | 
28 |   relFreqs.foreach(println)
29 | 
30 |   def getFreqs(filename:String):Map[String, Double] = {
31 |     val wordCountRE = """^(\w+)\s=\s(\d+)$""".r
32 |     val lines = scala.io.Source.fromFile(filename).getLines
33 |     val freqs = new scala.collection.mutable.HashMap[String, Long]
34 |     var total = 0l
35 |     var lineCount = 0
36 | 
37 |     for(line <- lines) {
38 |       if(wordCountRE.findFirstIn(line) != None) {
39 |         val wordCountRE(word, count) = line
40 |         val lowerWord = word.toLowerCase
41 |         val oldCount = freqs.getOrElse(lowerWord, 0l)
42 |         freqs.put(lowerWord, oldCount + count.toInt)
43 |         total += count.toInt
44 |       }
45 |       if(lineCount % 10000000 == 0)
46 |         println(filename+" "+lineCount)
47 |       lineCount += 1
48 |     }
49 | 
50 |     freqs.map(p => (p._1, p._2.toDouble / total)).toMap
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/worddist/DirichletUnigramWordDist.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  DirichletUnigramWordDist.scala
 3 | //
 4 | //  Copyright (C) 2010, 2011, 2012 Ben Wing, The University of Texas at Austin
 5 | //
 6 | //  Licensed under the Apache License, Version 2.0 (the "License");
 7 | //  you may not use this file except in compliance with the License.
 8 | //  You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | //  Unless required by applicable law or agreed to in writing, software
13 | //  distributed under the License is distributed on an "AS IS" BASIS,
14 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | //  See the License for the specific language governing permissions and
16 | //  limitations under the License.
17 | ///////////////////////////////////////////////////////////////////////////////
18 | 
19 | package opennlp.fieldspring.worddist
20 | 
21 | /**
22 |  * This class implements Dirichlet discounting, where the discount factor
23 |  * depends on the size of the document.
24 |  */ 
25 | class DirichletUnigramWordDistFactory(
26 |     interpolate_string: String,
27 |     val dirichlet_factor: Double
28 |   ) extends DiscountedUnigramWordDistFactory(interpolate_string != "no") {
29 |   def create_word_dist(note_globally: Boolean) =
30 |     new DirichletUnigramWordDist(this, note_globally)
31 | }
32 | 
33 | class DirichletUnigramWordDist(
34 |     factory: WordDistFactory,
35 |     note_globally: Boolean
36 | ) extends DiscountedUnigramWordDist(
37 |     factory, note_globally
38 |   ) {
39 |   override protected def imp_finish_after_global() {
40 |     unseen_mass = 1.0 -
41 |       (model.num_tokens.toDouble /
42 |         (model.num_tokens +
43 |           factory.asInstanceOf[DirichletUnigramWordDistFactory].
44 |             dirichlet_factor))
45 |     super.imp_finish_after_global()
46 |   }
47 | }
48 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/worddist/JelinekMercerUnigramWordDist.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  JelinekMercerUnigramWordDist.scala
 3 | //
 4 | //  Copyright (C) 2010, 2011, 2012 Ben Wing, The University of Texas at Austin
 5 | //
 6 | //  Licensed under the Apache License, Version 2.0 (the "License");
 7 | //  you may not use this file except in compliance with the License.
 8 | //  You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | //  Unless required by applicable law or agreed to in writing, software
13 | //  distributed under the License is distributed on an "AS IS" BASIS,
14 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | //  See the License for the specific language governing permissions and
16 | //  limitations under the License.
17 | ///////////////////////////////////////////////////////////////////////////////
18 | 
19 | package opennlp.fieldspring.worddist
20 | 
21 | /**
22 |  * This class implements Jelinek-Mercer discounting, the simplest type of
23 |  * discounting where we just use a constant discount factor.
24 |  */ 
25 | class JelinekMercerUnigramWordDistFactory(
26 |     interpolate_string: String,
27 |     val jelinek_factor: Double
28 |   ) extends DiscountedUnigramWordDistFactory(interpolate_string != "no") {
29 |   def create_word_dist(note_globally: Boolean) =
30 |     new JelinekMercerUnigramWordDist(this, note_globally)
31 | }
32 | 
33 | class JelinekMercerUnigramWordDist(
34 |     factory: WordDistFactory,
35 |     note_globally: Boolean
36 | ) extends DiscountedUnigramWordDist(
37 |     factory, note_globally
38 |   ) {
39 |   override protected def imp_finish_after_global() {
40 |     unseen_mass = (factory.asInstanceOf[JelinekMercerUnigramWordDistFactory].
41 |       jelinek_factor)
42 |     super.imp_finish_after_global()
43 |   }
44 | }
45 | 


--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/worddist/UnsmoothedNgramWordDist.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  UnsmoothedNgramWordDist.scala
 3 | //
 4 | //  Copyright (C) 2010, 2011, 2012 Ben Wing, The University of Texas at Austin
 5 | //
 6 | //  Licensed under the Apache License, Version 2.0 (the "License");
 7 | //  you may not use this file except in compliance with the License.
 8 | //  You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | //  Unless required by applicable law or agreed to in writing, software
13 | //  distributed under the License is distributed on an "AS IS" BASIS,
14 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | //  See the License for the specific language governing permissions and
16 | //  limitations under the License.
17 | ///////////////////////////////////////////////////////////////////////////////
18 | 
19 | package opennlp.fieldspring.worddist
20 | 
21 | class UnsmoothedNgramWordDistFactory extends NgramWordDistFactory {
22 |   def create_word_dist(note_globally: Boolean) =
23 |     new UnsmoothedNgramWordDist(this, note_globally)
24 | 
25 |   def finish_global_distribution() {
26 |   }
27 | }
28 | 
29 | class UnsmoothedNgramWordDist(
30 |   gen_factory: WordDistFactory,
31 |   note_globally: Boolean
32 | ) extends NgramWordDist(gen_factory, note_globally) {
33 |   import NgramStorage.Ngram
34 | 
35 |   type TThis = UnsmoothedNgramWordDist
36 | 
37 |   def innerToString = ""
38 | 
39 |   // For some reason, retrieving this value from the model is fantastically slow
40 |   var num_tokens = 0.0
41 | 
42 |   protected def imp_finish_after_global() {
43 |     num_tokens = model.num_tokens
44 |   }
45 | 
46 |   def fast_kl_divergence(cache: KLDivergenceCache, other: WordDist,
47 |       partial: Boolean = false) = {
48 |     assert(false, "Not implemented")
49 |     0.0
50 |   }
51 | 
52 |   def cosine_similarity(other: WordDist, partial: Boolean = false,
53 |       smoothed: Boolean = false) = {
54 |     assert(false, "Not implemented")
55 |     0.0
56 |   }
57 | 
58 |   def kl_divergence_34(other: NgramWordDist) = {
59 |     assert(false, "Not implemented")
60 |     0.0
61 |   }
62 |  
63 |   /**
64 |    * Actual implementation of steps 3 and 4 of KL-divergence computation, given
65 |    * a value that we may want to compute as part of step 2.
66 |    */
67 |   def inner_kl_divergence_34(other: TThis,
68 |       overall_probs_diff_words: Double) = {
69 |     assert(false, "Not implemented")
70 |     0.0
71 |   }
72 | 
73 |   def lookup_ngram(ngram: Ngram) =
74 |     model.get_item(ngram).toDouble / num_tokens
75 | }
76 | 


--------------------------------------------------------------------------------
/src/test/scala/opennlp/fieldspring/topo/Coordinate.scala:
--------------------------------------------------------------------------------
 1 | ///////////////////////////////////////////////////////////////////////////////
 2 | //  Copyright (C) 2010 Travis Brown, The University of Texas at Austin
 3 | //
 4 | //  Licensed under the Apache License, Version 2.0 (the "License");
 5 | //  you may not use this file except in compliance with the License.
 6 | //  You may obtain a copy of the License at
 7 | //
 8 | //      http://www.apache.org/licenses/LICENSE-2.0
 9 | //
10 | //  Unless required by applicable law or agreed to in writing, software
11 | //  distributed under the License is distributed on an "AS IS" BASIS,
12 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | //  See the License for the specific language governing permissions and
14 | //  limitations under the License.
15 | ///////////////////////////////////////////////////////////////////////////////
16 | package opennlp.fieldspring.topo
17 | 
18 | import org.specs._
19 | import org.specs.runner._
20 | 
21 | class CoordinateTest extends JUnit4(CoordinateSpec)
22 | object CoordinateSpec extends Specification {
23 | 
24 |   "A degree-constructed coordinate" should {
25 |     val coordinate = Coordinate.fromDegrees(45, -45)
26 |     "have the correct radian value for latitude" in {
27 |       coordinate.getLat must_== math.Pi / 4
28 |     }
29 | 
30 |     "have the correct radian value for longitude" in {
31 |       coordinate.getLng must_== -math.Pi / 4
32 |     }
33 | 
34 |     "be equal to its radian-constructed equivalent" in {
35 |       coordinate must_== Coordinate.fromRadians(math.Pi / 4, -math.Pi / 4)
36 |     }
37 |   }
38 | 
39 |   "A coordinate at the origin" should {
40 |     val coordinate = Coordinate.fromDegrees(0, 0)
41 |     "have the correct angular distance from a coordinate 1 radian away horizontally" in {
42 |       coordinate.distance(Coordinate.fromRadians(0, 1)) must_== 1
43 |     }
44 | 
45 |     "have the correct distance from a coordinate 1 radian away vertically" in {
46 |       coordinate.distance(Coordinate.fromRadians(1, 0)) must_== 1
47 |     }
48 |   }
49 | }
50 | 
51 | 


--------------------------------------------------------------------------------