├── .gitignore ├── CIAWFBFixer.scala ├── LICENSE.txt ├── README.md ├── bin ├── cwarxml2txt.sh ├── cwarxml2txttgn.sh ├── download-geonames.sh ├── download-wiki-data.sh ├── fieldspring ├── fix-trconll.sh ├── prepare-cwar.sh ├── runexps.sh └── sbt-launch-0.12.0.jar ├── build ├── build.sbt ├── data ├── lists │ └── stopwords.english └── models │ └── getOpenNLPModels.sh ├── lib ├── argot_2.9.1-0.3.5-benwing.jar ├── codeanticode-GLGraphics-0.9.4.jar ├── controlP5-1.5.2.jar ├── fhpotsdam-unfolding-0.9.1.jar ├── lift-json_2.9.1-2.4.jar ├── opengl-core-20120724.jar ├── processing-opengl-20120724.jar ├── scoobi_2.9.2-0.6.0-cdh3-SNAPSHOT-benwing.jar ├── trove-scala_2.9.1-0.0.2-SNAPSHOT.jar └── upenn-junto-1.1-assembly.jar ├── project └── plugins.sbt └── src ├── main ├── java │ ├── ags │ │ └── utils │ │ │ └── KdTree.java │ └── opennlp │ │ └── fieldspring │ │ └── tr │ │ ├── app │ │ ├── BaseApp.java │ │ ├── EvaluateCorpus.java │ │ ├── ImportCorpus.java │ │ ├── ImportGazetteer.java │ │ ├── LabelPropPreproc.java │ │ ├── LabelPropPreprocOld.java │ │ ├── RunResolver.java │ │ └── WriteCorpusToKML.java │ │ ├── eval │ │ ├── AccuracyEvaluator.java │ │ ├── DistanceReport.java │ │ ├── DocDistanceEvaluator.java │ │ ├── EDEvaluator.java │ │ ├── Evaluator.java │ │ ├── Report.java │ │ ├── SharedNEEvaluator.java │ │ └── SignatureEvaluator.java │ │ ├── resolver │ │ ├── BasicMinDistResolver.java │ │ ├── LabelPropComplexResolver.java │ │ ├── LabelPropContextSensitiveResolver.java │ │ ├── LabelPropDefaultRuleResolver.java │ │ ├── RandomResolver.java │ │ ├── Resolver.java │ │ ├── SimpleDocumentResolver.java │ │ └── WeightedMinDistResolver.java │ │ ├── text │ │ ├── CompactCorpus.java │ │ ├── Corpus.java │ │ ├── Document.java │ │ ├── DocumentSource.java │ │ ├── DocumentSourceWrapper.java │ │ ├── GeoTextDocument.java │ │ ├── Sentence.java │ │ ├── SimpleSentence.java │ │ ├── SimpleToken.java │ │ ├── SimpleToponym.java │ │ ├── StoredCorpus.java │ │ ├── StoredToken.java │ │ ├── StoredToponym.java │ │ ├── 
StreamCorpus.java │ │ ├── Token.java │ │ ├── Toponym.java │ │ ├── io │ │ │ ├── CorpusKMLWriter.java │ │ │ ├── CorpusXMLSource.java │ │ │ ├── CorpusXMLWriter.java │ │ │ ├── GeoTextCorpusKMLWriter.java │ │ │ ├── GeoTextSource.java │ │ │ ├── PlainTextDirSource.java │ │ │ ├── PlainTextSource.java │ │ │ ├── TextSource.java │ │ │ ├── TrXMLDirSource.java │ │ │ └── TrXMLSource.java │ │ └── prep │ │ │ ├── CandidateAnnotator.java │ │ │ ├── CandidateRepopulator.java │ │ │ ├── HighRecallToponymRecognizer.java │ │ │ ├── JythonNER.java │ │ │ ├── NamedEntityRecognizer.java │ │ │ ├── NamedEntityType.java │ │ │ ├── OpenNLPRecognizer.java │ │ │ ├── OpenNLPSentenceDivider.java │ │ │ ├── OpenNLPTokenizer.java │ │ │ ├── ScriptNER.java │ │ │ ├── SentenceDivider.java │ │ │ ├── Tokenizer.java │ │ │ ├── ToponymAnnotator.java │ │ │ └── ToponymRemover.java │ │ ├── topo │ │ ├── Coordinate.java │ │ ├── Location.java │ │ ├── PointRegion.java │ │ ├── PointSetRegion.java │ │ ├── RectRegion.java │ │ ├── Region.java │ │ └── gaz │ │ │ ├── CandidateList.java │ │ │ ├── FilteredGeoNamesReader.java │ │ │ ├── Gazetteer.java │ │ │ ├── GazetteerFileReader.java │ │ │ ├── GazetteerLineReader.java │ │ │ ├── GazetteerReader.java │ │ │ ├── GeoNamesGazetteer.java │ │ │ ├── GeoNamesGazetteerWithList.java │ │ │ ├── GeoNamesReader.java │ │ │ ├── InMemoryGazetteer.java │ │ │ ├── LoadableGazetteer.java │ │ │ ├── MultiGazetteer.java │ │ │ └── WorldReader.java │ │ └── util │ │ ├── Constants.java │ │ ├── CountingLexicon.java │ │ ├── DoubleStringPair.java │ │ ├── EditMapper.java │ │ ├── FastTrig.java │ │ ├── FastTrig.java~ │ │ ├── IOUtil.java │ │ ├── KMLUtil.java │ │ ├── KMLUtil.java~ │ │ ├── Lexicon.java │ │ ├── MemoryUtil.java │ │ ├── SimpleCountingLexicon.java │ │ ├── SimpleLexicon.java │ │ ├── Span.java │ │ ├── StringDoublePair.java │ │ ├── StringEditMapper.java │ │ ├── StringUtil.java │ │ ├── TopoUtil.java │ │ ├── ToponymFinder.java │ │ └── XMLUtil.java ├── python │ ├── article_statistics.py │ ├── 
convert-infochimps.py │ ├── convert_to_new_article_format.py │ ├── find-first-tweet-time.py │ ├── fix_redirects.py │ ├── format-thresh-grid.py │ ├── generate-numbers.py │ ├── generate_combined.py │ ├── ner │ │ ├── DummyNER.py │ │ └── stanford2places.py │ ├── nlputil.py │ ├── parse-wex.py │ ├── permute_wiki.py │ ├── process_article_data.py │ ├── processwiki.py │ ├── run-geolocate-exper.py │ ├── split_bzip.py │ ├── splitdevtest.py │ ├── tei2txt.py │ ├── tei_entities.py │ ├── trrraw2plain.py │ ├── twitter-graphs │ │ ├── twitter.py │ │ └── twitterRelationGraphs.py │ ├── twitter_geotext_process.py │ ├── twitter_to_lda.py │ └── unescape_entities.py ├── resources │ └── data │ │ ├── deu │ │ └── stopwords.txt │ │ ├── eng │ │ ├── stopwords.txt │ │ └── stopwords.txt.old │ │ ├── geo │ │ └── country-codes.txt │ │ └── por │ │ └── stopwords.txt └── scala │ └── opennlp │ └── fieldspring │ ├── geolocate │ ├── CombinedModelCell.scala │ ├── GenerateKML.scala │ ├── Geolocate.scala │ ├── Hadoop.scala │ ├── KDTreeCell.scala │ ├── MultiRegularCell.scala │ ├── SphereCell.scala │ ├── SphereCellDist.scala │ ├── SphereDocument.scala │ ├── SphereEvaluation.scala │ ├── TwitterDocument.scala │ ├── WikipediaDocument.scala │ └── toponym │ │ └── Toponym.scala │ ├── gridlocate │ ├── Cell.scala │ ├── CellDist.scala │ ├── DistDocument.scala │ ├── Evaluation.scala │ ├── GridLocate.scala │ ├── Reranker.scala │ └── TextGrounderInfo.scala │ ├── perceptron │ ├── Memoizer.scala │ ├── Perceptron.scala │ └── package.scala │ ├── poligrounder │ ├── Poligrounder.scala │ ├── TimeCell.scala │ └── TimeDocument.scala │ ├── postprocess │ ├── DocumentPinKMLGenerator.scala │ ├── DocumentRankerByError.scala │ ├── ErrorKMLGenerator.scala │ ├── KNNKMLGenerator.scala │ ├── WordRankerByAvgError.scala │ └── WordRankerByAvgErrorUT.scala │ ├── preprocess │ ├── ConvertTwitterInfochimps.scala │ ├── ExtractGeotaggedListFromWikiDump.scala │ ├── ExtractLinksFromWikiDump.scala │ ├── FindPolitical.scala │ ├── FrobTextDB.scala │ ├── 
MergeMetadataAndOldCounts.scala │ ├── OldGroupCorpus.scala │ ├── ParseTweets.scala │ ├── Permute.scala │ ├── ProcessFiles.scala │ ├── ScoobiConvertTwitterInfochimps.scala │ ├── ScoobiProcessFilesApp.scala │ ├── ScoobiWordCount.scala │ └── TwitterPullLocationVariance.scala │ ├── tr │ ├── app │ │ ├── ConvertCorpusToPlaintext.scala │ │ ├── ConvertCorpusToToponymAsDoc.scala │ │ ├── ConvertCorpusToUnigramCounts.scala │ │ ├── ConvertCwarToGoldCorpus.scala │ │ ├── ConvertGeoTextToJSON.scala │ │ ├── CorpusErrorAnalyzer.scala │ │ ├── CorpusInfo.scala │ │ ├── FilterGeotaggedWiki.scala │ │ ├── GazEntryKMLPlotter.scala │ │ ├── GeoTextLabelProp.scala │ │ ├── GeoTextLabelPropDecoder.scala │ │ ├── GeoTextLabelPropPreproc.scala │ │ ├── Preprocess.scala │ │ ├── ReprocessTrApp.scala │ │ ├── SplitDevTest.scala │ │ ├── SupervisedTRMaxentModelTrainer.scala │ │ ├── TrainingDirectoriesCombiner.scala │ │ └── VisualizeCorpus.scala │ ├── model │ │ └── AltBasicMinDistModel.scala │ ├── resolver │ │ ├── BayesRuleResolver.scala │ │ ├── DocDistResolver.scala │ │ ├── GaussianTPPResolver.scala │ │ ├── HeuristicTPPResolver.scala │ │ ├── LabelPropResolver.scala │ │ ├── MaxentResolver.scala │ │ ├── PopulationResolver.scala │ │ ├── ProbabilisticResolver.scala │ │ ├── TPPResolver.scala │ │ └── ToponymAsDocDistResolver.scala │ ├── text │ │ └── io │ │ │ ├── DynamicKMLWriter.scala │ │ │ ├── GigawordSource.scala │ │ │ └── WikiTextSource.scala │ ├── topo │ │ ├── SphericalGeometry.scala │ │ ├── gaz │ │ │ ├── CorpusGazetteerReader.scala │ │ │ └── geonames │ │ │ │ └── GeoNamesParser.scala │ │ └── util │ │ │ └── CodeConverter.scala │ ├── tpp │ │ ├── ACOTPPSolver.scala │ │ ├── ClusterMarketCreator.scala │ │ ├── ConstructionTPPSolver.scala │ │ ├── FileTravelCoster.scala │ │ ├── GaussianPurchaseCoster.scala │ │ ├── GaussianTravelCoster.scala │ │ ├── GaussianUtil.scala │ │ ├── GridMarketCreator.scala │ │ ├── LinkTravelCoster.scala │ │ ├── LinkTravelWriter.scala │ │ ├── MarketCreator.scala │ │ ├── 
MaxentPurchaseCoster.scala │ │ ├── MultiPurchaseCoster.scala │ │ ├── PurchaseCoster.scala │ │ ├── SimpleContainmentPurchaseCoster.scala │ │ ├── SimpleDistanceTravelCoster.scala │ │ ├── TPPInstance.scala │ │ ├── TPPSolver.scala │ │ └── TravelCoster.scala │ └── util │ │ ├── Average.scala │ │ ├── DistanceTable.scala │ │ ├── LogUtil.scala │ │ ├── StopwordUtil.scala │ │ ├── TextUtil.scala │ │ ├── cluster │ │ └── KMeans.scala │ │ └── sanity │ │ └── CandidateCheck.scala │ ├── util │ ├── MeteredTask.scala │ ├── Serializer.scala │ ├── WikiRelFreqs.scala │ ├── argparser.scala │ ├── collectionutil.scala │ ├── distances.scala │ ├── experiment.scala │ ├── hadoop.scala │ ├── ioutil.scala │ ├── mathutil.scala │ ├── osutil.scala │ ├── printutil.scala │ ├── textdbutil.scala │ ├── textutil.scala │ ├── timeutil.scala │ └── twokenize.scala │ └── worddist │ ├── BigramWordDist.scala.bitrotted │ ├── DirichletUnigramWordDist.scala │ ├── DiscountedUnigramWordDist.scala │ ├── FastDiscountedUnigramWordDist.scala │ ├── JelinekMercerUnigramWordDist.scala │ ├── Memoizer.scala │ ├── NgramWordDist.scala │ ├── PseudoGoodTuringBigramWordDist.scala.bitrotted │ ├── PseudoGoodTuringUnigramWordDist.scala │ ├── UnigramWordDist.scala │ ├── UnsmoothedNgramWordDist.scala │ └── WordDist.scala └── test └── scala ├── opennlp └── fieldspring │ └── topo │ └── Coordinate.scala └── testparse.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | *.kml 5 | *.gz 6 | *.class 7 | *.orig 8 | target/ 9 | tmp/ 10 | *~ 11 | project/boot 12 | lib_managed/ 13 | data/models/*.bin 14 | data/gazetteers/*.zip 15 | 16 | 17 | # sbt specific 18 | dist/* 19 | target/ 20 | lib_managed/ 21 | src_managed/ 22 | project/boot/ 23 | project/plugins/project/ 24 | 25 | # Scala-IDE specific 26 | .scala_dependencies -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | fieldspring 2 | =========== 3 | 4 | A system for disambiguating toponyms (placenames) given textual context and creating visualizations of the locations referenced in a given text or corpus. -------------------------------------------------------------------------------- /bin/cwarxml2txt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | indir=${1%/} 4 | outdir=${2%/} 5 | 6 | for f in $indir/*.xml 7 | do 8 | filename=$(basename $f) 9 | filename=${filename%.*} 10 | grep ']*>//g' > $outdir/$filename.txt 11 | done 12 | -------------------------------------------------------------------------------- /bin/cwarxml2txttgn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | indir=${1%/} 4 | outdir=${2%/} 5 | 6 | if [ ! -e $outdir ]; then 7 | mkdir $outdir 8 | fi 9 | 10 | for f in $indir/*.xml 11 | do 12 | filename=$(basename $f) 13 | filename=${filename%.*} 14 | grep '([^<]+)/>tgn,\1-\2-]]/' | sed -re 's/tgn,([^"]+)-(\w+) (\w+)-]]/tgn,\1-\2-\3-]]/' | sed 's/<[^<>]*>//g' > $outdir/$filename.txt 15 | done 16 | -------------------------------------------------------------------------------- /bin/download-geonames.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z $FIELDSPRING_DIR ]; then 4 | echo "You must set the environment variable FIELDSPRING_DIR to point to Fieldspring's installation directory." 5 | exit 6 | fi 7 | 8 | origwd=`pwd` 9 | 10 | if [ ! 
-e $FIELDSPRING_DIR/data/gazetteers/allCountries.zip ]; then 11 | cd $FIELDSPRING_DIR/data/gazetteers 12 | wget http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/allCountries.zip 13 | fi 14 | 15 | 16 | 17 | cd $origwd 18 | -------------------------------------------------------------------------------- /bin/download-wiki-data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget -r http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wistr-models-cwardev-gt 4 | wget -r http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wistr-models-cwartest-gt 5 | wget -r http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wistr-models-trdev-gt 6 | wget -r http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wistr-models-trtest-gt 7 | wget http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wiki/enwiki-cwardev-20spd-100.log 8 | wget http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wiki/enwiki-cwartest-20spd-100.log 9 | wget http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wiki/enwiki-trconlldev-100.log 10 | wget http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/wiki/enwiki-trconlltest-100.log 11 | -------------------------------------------------------------------------------- /bin/fix-trconll.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | indir=${1%%/} 4 | outdir=${2%%/} 5 | 6 | # Fixes countries from CIA World Factbook that had -0 as longitude: 7 | fieldspring run CIAWFBFixer $3 $indir $outdir 8 | 9 | # Fixes states with swapped coordinates: 10 | for fullpath in $outdir/*.xml 11 | do 12 | filename=${fullpath##*/} 13 | sed -i -e's/\(^.*US_STATE.*lat=\"\)\([^"]*\)\(\".*long=\"\)\([^"]*\)\(\".*$\)/\1\4\3\2\5/' $outdir/$filename 14 | done 15 | -------------------------------------------------------------------------------- /bin/prepare-cwar.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z $FIELDSPRING_DIR ]; then 4 | echo "You must set the environment variable FIELDSPRING_DIR to point to Fieldspring's installation directory." 5 | exit 6 | fi 7 | 8 | origcwarxmldir=${1%/} 9 | pathtokml=$2 10 | pathtogaz=$3 11 | cwarxmloutdir=${4%/} 12 | 13 | echo "Converting original Cwar corpus to plain format..." 14 | cwarxml2txttgn.sh $origcwarxmldir cwarplaintgn 15 | echo "Splitting corpus into dev and test sets..." 16 | fieldspring --memory 2g run opennlp.fieldspring.tr.app.SplitDevTest cwarplaintgn 17 | if [ ! -e $cwarxmloutdir ]; then 18 | mkdir $cwarxmloutdir 19 | fi 20 | if [ ! -e $cwarxmloutdir/dev ]; then 21 | mkdir $cwarxmloutdir/dev 22 | fi 23 | if [ ! -e $cwarxmloutdir/test ]; then 24 | mkdir $cwarxmloutdir/test 25 | fi 26 | 27 | echo "Converting dev corpus to Fieldspring format..." 28 | fieldspring --memory 8g run opennlp.fieldspring.tr.app.ConvertCwarToGoldCorpus cwarplaintgndev $pathtokml $pathtogaz > $cwarxmloutdir/dev/cwar-dev.xml 29 | echo "Converting test corpus to Fieldspring format..." 30 | fieldspring --memory 8g run opennlp.fieldspring.tr.app.ConvertCwarToGoldCorpus cwarplaintgntest $pathtokml $pathtogaz > $cwarxmloutdir/test/cwar-test.xml 31 | 32 | echo "Deleting temporary files..." 33 | rm -rf cwarplaintgn 34 | rm -rf cwarplaintgndev 35 | rm -rf cwarplaintgntest 36 | echo "Done." 
37 | 38 | -------------------------------------------------------------------------------- /bin/sbt-launch-0.12.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/bin/sbt-launch-0.12.0.jar -------------------------------------------------------------------------------- /build: -------------------------------------------------------------------------------- 1 | java -Dfile.encoding=UTF8 -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=256m -Xmx1024M -Xss2M -jar bin/sbt-launch-*.jar "$@" 2 | -------------------------------------------------------------------------------- /data/models/getOpenNLPModels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget http://opennlp.sourceforge.net/models-1.5/en-ner-location.bin 4 | wget http://opennlp.sourceforge.net/models-1.5/en-token.bin 5 | wget http://opennlp.sourceforge.net/models-1.5/en-sent.bin -------------------------------------------------------------------------------- /lib/argot_2.9.1-0.3.5-benwing.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/argot_2.9.1-0.3.5-benwing.jar -------------------------------------------------------------------------------- /lib/codeanticode-GLGraphics-0.9.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/codeanticode-GLGraphics-0.9.4.jar -------------------------------------------------------------------------------- /lib/controlP5-1.5.2.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/controlP5-1.5.2.jar -------------------------------------------------------------------------------- /lib/fhpotsdam-unfolding-0.9.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/fhpotsdam-unfolding-0.9.1.jar -------------------------------------------------------------------------------- /lib/lift-json_2.9.1-2.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/lift-json_2.9.1-2.4.jar -------------------------------------------------------------------------------- /lib/opengl-core-20120724.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/opengl-core-20120724.jar -------------------------------------------------------------------------------- /lib/processing-opengl-20120724.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/processing-opengl-20120724.jar -------------------------------------------------------------------------------- /lib/scoobi_2.9.2-0.6.0-cdh3-SNAPSHOT-benwing.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/scoobi_2.9.2-0.6.0-cdh3-SNAPSHOT-benwing.jar -------------------------------------------------------------------------------- /lib/trove-scala_2.9.1-0.0.2-SNAPSHOT.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/trove-scala_2.9.1-0.0.2-SNAPSHOT.jar -------------------------------------------------------------------------------- /lib/upenn-junto-1.1-assembly.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utcompling/fieldspring/a8425085188fb8f4dc06ccdcdeb2682a172cd680/lib/upenn-junto-1.1-assembly.jar -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.8.3") 2 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/app/ImportGazetteer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This class imports a gazetteer from a text file and serializes it, to be read quickly by RunResolver.
3 | */ 4 | 5 | package opennlp.fieldspring.tr.app; 6 | 7 | import opennlp.fieldspring.tr.topo.gaz.*; 8 | import opennlp.fieldspring.tr.util.*; 9 | import java.io.*; 10 | import java.util.zip.*; 11 | 12 | public class ImportGazetteer extends BaseApp { 13 | 14 | public static void main(String[] args) throws Exception { 15 | ImportGazetteer currentRun = new ImportGazetteer(); 16 | currentRun.initializeOptionsFromCommandLine(args); 17 | currentRun.serialize(currentRun.doImport(currentRun.getInputPath(), currentRun.isDoingKMeans()), currentRun.getOutputPath()); 18 | } 19 | 20 | public GeoNamesGazetteer doImport(String gazInputPath, boolean runKMeans) throws Exception { 21 | System.out.println("Reading GeoNames gazetteer from " + gazInputPath + " ..."); 22 | 23 | checkExists(gazInputPath); 24 | 25 | GeoNamesGazetteer gnGaz = null; 26 | if(gazInputPath.toLowerCase().endsWith(".zip")) { 27 | ZipFile zf = new ZipFile(gazInputPath); 28 | ZipInputStream zis = new ZipInputStream(new FileInputStream(gazInputPath)); 29 | ZipEntry ze = zis.getNextEntry(); 30 | gnGaz = new GeoNamesGazetteer(new BufferedReader(new InputStreamReader(zf.getInputStream(ze))), runKMeans); 31 | zis.close(); 32 | } 33 | else { 34 | gnGaz = new GeoNamesGazetteer(new BufferedReader(new FileReader(gazInputPath)), runKMeans); 35 | } 36 | 37 | System.out.println("Done."); 38 | 39 | return gnGaz; 40 | } 41 | 42 | public void serialize(GeoNamesGazetteer gnGaz, String serializedGazOutputPath) throws Exception { 43 | System.out.print("Serializing GeoNames gazetteer to " + serializedGazOutputPath + " ..."); 44 | 45 | ObjectOutputStream oos = null; 46 | if(serializedGazOutputPath.toLowerCase().endsWith(".gz")) { 47 | GZIPOutputStream gos = new GZIPOutputStream(new FileOutputStream(serializedGazOutputPath)); 48 | oos = new ObjectOutputStream(gos); 49 | } 50 | else { 51 | FileOutputStream fos = new FileOutputStream(serializedGazOutputPath); 52 | oos = new ObjectOutputStream(fos); 53 | } 54 | oos.writeObject(gnGaz); 
55 | oos.close(); 56 | 57 | System.out.println("done."); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/app/WriteCorpusToKML.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This class takes a corpus with system resolved toponyms and generates a KML file visualizable in Google Earth. 3 | */ 4 | 5 | package opennlp.fieldspring.tr.app; 6 | 7 | import opennlp.fieldspring.tr.text.*; 8 | import opennlp.fieldspring.tr.text.io.*; 9 | import opennlp.fieldspring.tr.util.*; 10 | import java.io.*; 11 | 12 | public class WriteCorpusToKML extends BaseApp { 13 | 14 | public static void main(String[] args) throws Exception { 15 | 16 | WriteCorpusToKML currentRun = new WriteCorpusToKML(); 17 | currentRun.initializeOptionsFromCommandLine(args); 18 | 19 | if(currentRun.getSerializedCorpusInputPath() == null) { 20 | System.out.println("Please specify an input corpus in serialized format via the -sci flag."); 21 | System.exit(0); 22 | } 23 | 24 | if(currentRun.getKMLOutputPath() == null) { 25 | System.out.println("Please specify a KML output path via the -ok flag."); 26 | System.exit(0); 27 | } 28 | 29 | System.out.print("Reading serialized corpus from " + currentRun.getSerializedCorpusInputPath() + " ..."); 30 | Corpus corpus = TopoUtil.readCorpusFromSerialized(currentRun.getSerializedCorpusInputPath()); 31 | System.out.println("done."); 32 | 33 | currentRun.writeToKML(corpus, currentRun.getKMLOutputPath(), currentRun.getOutputGoldLocations(), currentRun.getOutputUserKML(), currentRun.getCorpusFormat()); 34 | } 35 | 36 | public void writeToKML(Corpus corpus, String kmlOutputPath, boolean outputGoldLocations, boolean outputUserKML, Enum corpusFormat) throws Exception { 37 | System.out.print("Writing visualizable corpus in KML format to " + kmlOutputPath + " ..."); 38 | CorpusKMLWriter kw; 39 | if(corpusFormat == CORPUS_FORMAT.GEOTEXT && 
outputUserKML) 40 | kw = new GeoTextCorpusKMLWriter(corpus, outputGoldLocations); 41 | else 42 | kw = new CorpusKMLWriter(corpus, outputGoldLocations); 43 | kw.write(new File(kmlOutputPath)); 44 | System.out.println("done."); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/eval/AccuracyEvaluator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This is a simple Evaluator that assumes gold named entities were used in preprocessing. For each gold disambiguated toponym, the model 3 | * either got that Location right or wrong, and a Report containing the accuracy figure on this task is returned. 4 | */ 5 | 6 | package opennlp.fieldspring.tr.eval; 7 | 8 | import opennlp.fieldspring.tr.text.*; 9 | 10 | public class AccuracyEvaluator extends Evaluator { 11 | 12 | public AccuracyEvaluator(Corpus corpus) { 13 | super(corpus); 14 | } 15 | 16 | @Override 17 | public Report evaluate() { 18 | 19 | Report report = new Report(); 20 | 21 | for(Document doc : corpus) { 22 | for(Sentence sent : doc) { 23 | for(Toponym toponym : sent.getToponyms()) { 24 | if(toponym.hasGold()) { 25 | if(toponym.getGoldIdx() == toponym.getSelectedIdx()) { 26 | report.incrementTP(); 27 | } 28 | else { 29 | report.incrementInstanceCount(); 30 | } 31 | } 32 | } 33 | } 34 | } 35 | 36 | return report; 37 | } 38 | 39 | @Override 40 | public Report evaluate(Corpus pred, boolean useSelected) { 41 | return null; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/eval/DistanceReport.java: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.eval; 2 | 3 | import java.util.*; 4 | 5 | public class DistanceReport { 6 | 7 | private List distances = new ArrayList(); 8 | private boolean isSorted = true; 9 | 10 | public void 
addDistance(double distance) { 11 | distances.add(distance); 12 | isSorted = false; 13 | } 14 | 15 | public double getMeanDistance() { 16 | if(distances.size() == 0) return -1; 17 | 18 | double total = 0.0; 19 | for(double distance : distances) { 20 | total += distance; 21 | } 22 | return total / distances.size(); 23 | } 24 | 25 | public double getMedianDistance() { 26 | if(distances.size() == 0) return -1; 27 | sort(); 28 | return distances.get(distances.size() / 2); 29 | } 30 | 31 | public int getNumDistances() { 32 | return distances.size(); 33 | } 34 | 35 | public double getFractionDistancesWithinThreshold(double threshold) { 36 | int count = 0; 37 | for(double distance : distances) 38 | if(distance <= threshold) 39 | count++; 40 | return ((double)count) / distances.size(); 41 | } 42 | 43 | public double getMinDistance() { 44 | if(distances.size() == 0) return -1; 45 | sort(); 46 | return distances.get(0); 47 | } 48 | 49 | public double getMaxDistance() { 50 | if(distances.size() == 0) return -1; 51 | sort(); 52 | return distances.get(distances.size()-1); 53 | } 54 | 55 | private void sort() { 56 | if(isSorted) 57 | return; 58 | Collections.sort(distances); 59 | isSorted = true; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/eval/DocDistanceEvaluator.java: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.eval; 2 | 3 | import opennlp.fieldspring.tr.text.*; 4 | import opennlp.fieldspring.tr.topo.*; 5 | 6 | public class DocDistanceEvaluator { 7 | 8 | protected final Corpus corpus; 9 | 10 | public DocDistanceEvaluator(Corpus corpus) { 11 | this.corpus = (Corpus)corpus; 12 | } 13 | 14 | /* Evaluate the "selected" candidates in the corpus using its "gold" 15 | * candidates. 
*/ 16 | public DistanceReport evaluate() { 17 | DistanceReport dreport = new DistanceReport(); 18 | 19 | for(Document doc : corpus) { 20 | 21 | if(!doc.isTrain()) { 22 | 23 | Coordinate systemCoord = doc.getSystemCoord(); 24 | Coordinate goldCoord = doc.getGoldCoord(); 25 | 26 | if(systemCoord != null && goldCoord != null) { 27 | dreport.addDistance(systemCoord.distanceInKm(goldCoord)); 28 | } 29 | } 30 | } 31 | 32 | return dreport; 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/eval/EDEvaluator.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.eval; 17 | 18 | import java.util.Iterator; 19 | 20 | import opennlp.fieldspring.tr.text.Document; 21 | import opennlp.fieldspring.tr.text.Corpus; 22 | import opennlp.fieldspring.tr.text.Sentence; 23 | import opennlp.fieldspring.tr.text.Token; 24 | 25 | public class EDEvaluator extends Evaluator { 26 | public EDEvaluator(Corpus corpus) { 27 | super(corpus); 28 | } 29 | 30 | public Report evaluate() { 31 | return null; 32 | } 33 | 34 | public Report evaluate(Corpus pred, boolean useSelected) { 35 | Iterator> goldDocs = this.corpus.iterator(); 36 | Iterator> predDocs = pred.iterator(); 37 | 38 | while (goldDocs.hasNext() && predDocs.hasNext()) { 39 | Iterator> goldSents = goldDocs.next().iterator(); 40 | Iterator> predSents = predDocs.next().iterator(); 41 | 42 | while (goldSents.hasNext() && predSents.hasNext()) { 43 | } 44 | 45 | assert !goldSents.hasNext() && !predSents.hasNext() : "Documents have different numbers of sentences."; 46 | } 47 | 48 | assert !goldDocs.hasNext() && !predDocs.hasNext() : "Corpora have different numbers of documents."; 49 | return null; 50 | } 51 | } 52 | 53 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/eval/Evaluator.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.eval; 17 | 18 | import opennlp.fieldspring.tr.text.Corpus; 19 | import opennlp.fieldspring.tr.text.Token; 20 | 21 | public abstract class Evaluator { 22 | protected final Corpus corpus; 23 | 24 | /* The given corpus should include either gold or selected candidates or 25 | * both. */ 26 | public Evaluator(Corpus corpus) { 27 | this.corpus = (Corpus) corpus; 28 | } 29 | 30 | /* Evaluate the "selected" candidates in the corpus using its "gold" 31 | * candidates. */ 32 | public abstract Report evaluate(); 33 | 34 | /* Evaluate the given corpus using either the gold or selected candidates in 35 | * the current corpus. */ 36 | public abstract Report evaluate(Corpus pred, boolean useSelected); 37 | 38 | /* A convenience method providing a default for evaluate. */ 39 | public Report evaluate(Corpus pred) { 40 | return this.evaluate(pred, false); 41 | } 42 | } 43 | 44 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/eval/Report.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.eval; 17 | 18 | public class Report { 19 | 20 | private int tp; 21 | private int fp; 22 | private int fn; 23 | private int totalInstances; 24 | 25 | public int getFN() { 26 | return fn; 27 | } 28 | 29 | public int getFP() { 30 | return fp; 31 | } 32 | 33 | public int getTP() { 34 | return tp; 35 | } 36 | 37 | public int getInstanceCount() { 38 | return totalInstances; 39 | } 40 | 41 | public void incrementTP() { 42 | tp++; 43 | totalInstances++; 44 | } 45 | 46 | public void incrementFP() { 47 | fp++; 48 | totalInstances++; 49 | } 50 | 51 | public void incrementFN() { 52 | fn++; 53 | totalInstances++; 54 | } 55 | 56 | public void incrementFPandFN() { 57 | fp++; 58 | fn++; 59 | totalInstances++; 60 | } 61 | 62 | public void incrementInstanceCount() { 63 | totalInstances++; 64 | } 65 | 66 | public double getAccuracy() { 67 | return (double) tp / totalInstances; 68 | } 69 | 70 | public double getPrecision() { 71 | return (double) tp / (tp + fp); 72 | } 73 | 74 | public double getRecall() { 75 | return (double) tp / (tp + fn); 76 | } 77 | 78 | public double getFScore() { 79 | double p = getPrecision(); 80 | double r = getRecall(); 81 | return (2 * p * r) / (p + r); 82 | } 83 | } 84 | 85 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/resolver/RandomResolver.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Random baseline resolver. Selects a random location for each toponym. 3 | */ 4 | 5 | package opennlp.fieldspring.tr.resolver; 6 | 7 | import opennlp.fieldspring.tr.text.*; 8 | import java.util.*; 9 | 10 | public class RandomResolver extends Resolver { 11 | 12 | private Random rand = new Random(); 13 | 14 | @Override 15 | public StoredCorpus disambiguate(StoredCorpus corpus) { 16 | 17 | for(Document doc : corpus) { 18 | for(Sentence sent : doc) { 19 | for(Toponym toponym : sent.getToponyms()) { 20 | int ambiguity = toponym.getAmbiguity(); 21 | if (ambiguity > 0 && (overwriteSelecteds || !toponym.hasSelected())) { 22 | toponym.setSelectedIdx(rand.nextInt(ambiguity)); 23 | } 24 | } 25 | } 26 | } 27 | 28 | return corpus; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/resolver/Resolver.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This version of Resolver (started 9/22/10) is just an abstract class with the disambiguate(Corpus) method. 
 */

package opennlp.fieldspring.tr.resolver;

import opennlp.fieldspring.tr.text.*;

/**
 * A toponym resolver: given a corpus whose toponyms carry candidate
 * locations, selects one candidate per toponym so the corpus is ready for
 * evaluation.
 *
 * @param corpus
 *     a corpus without any selected candidates for each toponym (or ignores the selections if they are present)
 * @return
 *     a corpus with selected candidates, ready for evaluation
 */
public abstract class Resolver {

  // Make this false to have a resolver only resolve toponyms that don't already have a selected candidate
  // (not implemented in all resolvers yet)
  public boolean overwriteSelecteds = true;

  /** Optional training hook; resolvers that cannot be trained keep this default. */
  public void train(StoredCorpus corpus) {
    throw new UnsupportedOperationException("This type of resolver cannot be trained.");
  }

  public abstract StoredCorpus disambiguate(StoredCorpus corpus);

}

// -------- /src/main/java/opennlp/fieldspring/tr/text/Corpus.java --------
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2010 Travis Brown, The University of Texas at Austin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////////////
package opennlp.fieldspring.tr.text;

import java.util.Iterator;

import opennlp.fieldspring.tr.util.Lexicon;
import opennlp.fieldspring.tr.app.*;
import java.io.*;

/**
 * Base class for corpora: a closable, iterable collection of documents fed
 * by one or more DocumentSource instances.
 */
public abstract class Corpus implements Iterable>, Serializable {

  // NOTE(review): static, yet assigned through the instance setter below —
  // every corpus instance shares one format value; confirm this is intended.
  private static Enum corpusFormat = null;//BaseApp.CORPUS_FORMAT.PLAIN;

  /** Adds a source whose documents become part of this corpus. */
  public abstract void addSource(DocumentSource source);
  /** Closes all underlying sources. */
  public abstract void close();

  /** Creates a single-pass, streaming corpus. */
  public static Corpus createStreamCorpus() {
    return new StreamCorpus();
  }

  /** Creates an in-memory corpus backed by a stream corpus. */
  public static StoredCorpus createStoredCorpus() {
    return new CompactCorpus(Corpus.createStreamCorpus());
  }

  /** Exposes this corpus itself as a DocumentSource (single pass). */
  public DocumentSource asSource() {
    final Iterator> iterator = this.iterator();

    return new DocumentSource() {
      public boolean hasNext() {
        return iterator.hasNext();
      }

      public Document next() {
        return (Document) iterator.next();
      }
    };
  }

  public Enum getFormat() {
    return corpusFormat;
  }

  public void setFormat(Enum corpusFormat) {
    this.corpusFormat = corpusFormat;
  }
}

// -------- /src/main/java/opennlp/fieldspring/tr/text/DocumentSource.java --------
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2010 Travis Brown, The University of Texas at Austin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////////////
package opennlp.fieldspring.tr.text;

import java.util.Iterator;

import opennlp.fieldspring.tr.text.Document;
import opennlp.fieldspring.tr.text.Sentence;
import opennlp.fieldspring.tr.text.Token;

/**
 * A read-only iterator over documents, with an optional close() for
 * releasing underlying resources.
 */
public abstract class DocumentSource implements Iterator> {
  /** Releases any underlying resources; no-op by default. */
  public void close() {
  }

  /** Document sources are read-only. */
  public void remove() {
    throw new UnsupportedOperationException("Cannot remove a document from a source.");
  }

  /** Read-only iterator over the sentences of a single document. */
  protected abstract class SentenceIterator implements Iterator> {
    public void remove() {
      throw new UnsupportedOperationException("Cannot remove a sentence from a source.");
    }
  }
}

// -------- /src/main/java/opennlp/fieldspring/tr/text/DocumentSourceWrapper.java --------
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2010 Travis Brown, The University of Texas at Austin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.text; 17 | 18 | import java.util.Iterator; 19 | 20 | import opennlp.fieldspring.tr.text.Document; 21 | import opennlp.fieldspring.tr.text.Sentence; 22 | import opennlp.fieldspring.tr.text.Token; 23 | 24 | /** 25 | * Wraps a document source in order to perform some operation on it. 26 | * 27 | * @author Travis Brown 28 | * @version 0.1.0 29 | */ 30 | public abstract class DocumentSourceWrapper extends DocumentSource { 31 | private final DocumentSource source; 32 | 33 | public DocumentSourceWrapper(DocumentSource source) { 34 | this.source = source; 35 | } 36 | 37 | /** 38 | * Closes the underlying source. 39 | */ 40 | public void close() { 41 | this.source.close(); 42 | } 43 | 44 | /** 45 | * Indicates whether the underlying source has more documents. 46 | */ 47 | public boolean hasNext() { 48 | return this.source.hasNext(); 49 | } 50 | 51 | /** 52 | * Returns the underlying source (for use in subclasses). 
53 | */ 54 | protected DocumentSource getSource() { 55 | return this.source; 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/text/GeoTextDocument.java: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.text; 2 | 3 | import java.io.*; 4 | import java.util.*; 5 | 6 | import opennlp.fieldspring.tr.topo.*; 7 | 8 | public class GeoTextDocument extends Document { 9 | 10 | private static final long serialVersionUID = 42L; 11 | 12 | private List> sentences; 13 | 14 | public GeoTextDocument(String id, String timestamp, double goldLat, double goldLon) { 15 | super(id); 16 | this.timestamp = timestamp; 17 | this.goldCoord = Coordinate.fromDegrees(goldLat, goldLon); 18 | this.sentences = new ArrayList>(); 19 | this.systemCoord = null; 20 | this.timestamp = null; 21 | } 22 | 23 | public GeoTextDocument(String id, String timestamp, double goldLat, double goldLon, Enum section) { 24 | this(id, timestamp, goldLat, goldLon); 25 | this.section = section; 26 | } 27 | 28 | public GeoTextDocument(String id, String timestamp, double goldLat, double goldLon, long fold) { 29 | this(id, timestamp, goldLat, goldLon); 30 | if(fold >= 1 && fold <= 3) 31 | this.section = Document.SECTION.TRAIN; 32 | else if(fold == 4) 33 | this.section = Document.SECTION.DEV; 34 | else if(fold == 5) 35 | this.section = Document.SECTION.TEST; 36 | else 37 | this.section = Document.SECTION.ANY; 38 | } 39 | 40 | public void addSentence(Sentence sentence) { 41 | sentences.add(sentence); 42 | } 43 | 44 | public Iterator> iterator() { 45 | return sentences.iterator(); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/text/SimpleSentence.java: -------------------------------------------------------------------------------- 1 | 
/////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.text; 17 | 18 | import java.util.ArrayList; 19 | import java.util.Iterator; 20 | import java.util.List; 21 | import java.util.NoSuchElementException; 22 | 23 | import opennlp.fieldspring.tr.util.Span; 24 | import java.io.*; 25 | 26 | public class SimpleSentence extends Sentence implements Serializable { 27 | 28 | private static final long serialVersionUID = 42L; 29 | 30 | private final List tokens; 31 | private final List> toponymSpans; 32 | 33 | public SimpleSentence(String id, List tokens) { 34 | this(id, tokens, new ArrayList>()); 35 | } 36 | 37 | public SimpleSentence(String id, List tokens, List> toponymSpans) { 38 | super(id); 39 | this.tokens = tokens; 40 | this.toponymSpans = toponymSpans; 41 | } 42 | 43 | public Iterator tokens() { 44 | return this.tokens.iterator(); 45 | } 46 | 47 | public Iterator> toponymSpans() { 48 | return this.toponymSpans.iterator(); 49 | } 50 | } 51 | 52 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/text/SimpleToken.java: 
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2010 Travis Brown, The University of Texas at Austin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////////////
package opennlp.fieldspring.tr.text;

import java.io.*;

/**
 * Minimal immutable Token backed by a single surface string; never a
 * toponym.
 */
public class SimpleToken implements Token, Serializable {

  private static final long serialVersionUID = 42L;

  private final String form; // surface form exactly as seen in the text

  public SimpleToken(String form) {
    this.form = form;
  }

  /**
   * Returns the normalized (lower-cased) form. Uses a fixed locale so
   * normalization is stable regardless of the JVM default locale
   * (e.g. avoids the Turkish dotless-i problem with toLowerCase()).
   */
  public String getForm() {
    return this.form.toLowerCase(java.util.Locale.ENGLISH);
  }

  public String getOrigForm() {
    return this.form;
  }

  public boolean isToponym() {
    return false;
  }
}

// -------- /src/main/java/opennlp/fieldspring/tr/text/StoredCorpus.java --------
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2010 Travis Brown, The University of Texas at Austin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the
// License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////////////
package opennlp.fieldspring.tr.text;

import opennlp.fieldspring.tr.topo.Location;
import opennlp.fieldspring.tr.util.CountingLexicon;
import opennlp.fieldspring.tr.util.SimpleCountingLexicon;
import opennlp.fieldspring.tr.util.Span;
import java.io.*;

/**
 * A corpus loaded fully into memory, so it can be iterated repeatedly and
 * can report aggregate statistics over tokens and toponyms.
 */
public abstract class StoredCorpus extends Corpus implements Serializable {
  public abstract int getDocumentCount();
  /** Number of distinct normalized token forms. */
  public abstract int getTokenTypeCount();
  /** Number of distinct original (surface) token forms. */
  public abstract int getTokenOrigTypeCount();
  /** Number of distinct normalized toponym forms. */
  public abstract int getToponymTypeCount();
  /** Number of distinct original toponym forms. */
  public abstract int getToponymOrigTypeCount();
  /** Largest candidate-list size over all toponyms. */
  public abstract int getMaxToponymAmbiguity();
  /** Mean candidate-list size over all toponyms. */
  public abstract double getAvgToponymAmbiguity();
  public abstract int getTokenCount();
  public abstract int getToponymTokenCount();
  /** Reads the underlying sources into memory. */
  public abstract void load();
}

// -------- /src/main/java/opennlp/fieldspring/tr/text/StoredToken.java --------
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2010 Travis Brown, The University of Texas at Austin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////////////
package opennlp.fieldspring.tr.text;
import java.io.*;

/**
 * A token whose forms are interned in a corpus-level lexicon, exposing
 * integer indices and counts for both the normalized and original forms.
 * (Counts are presumably corpus frequencies — confirm against CompactCorpus.)
 */
public interface StoredToken extends Token, Serializable {
  /** Lexicon index of the normalized form. */
  public int getIdx();
  /** Lexicon index of the original (surface) form. */
  public int getOrigIdx();
  /** Count associated with the normalized form. */
  public int getTypeCount();
  /** Count associated with the original form. */
  public int getOrigTypeCount();
}

// -------- /src/main/java/opennlp/fieldspring/tr/text/StoredToponym.java --------
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2010 Travis Brown, The University of Texas at Austin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////////////
package opennlp.fieldspring.tr.text;
import java.io.*;

/**
 * Marker interface for a toponym stored/interned at the corpus level:
 * combines the StoredToken and Toponym contracts, adding no methods.
 */
public interface StoredToponym extends StoredToken, Toponym, Serializable {
}

// -------- /src/main/java/opennlp/fieldspring/tr/text/StreamCorpus.java --------
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2010 Travis Brown, The University of Texas at Austin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////////////
package opennlp.fieldspring.tr.text;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import com.google.common.collect.Iterators;

/**
 * A corpus that streams documents straight from its sources. It may be
 * iterated only once, because the underlying sources are consumed as they
 * are read.
 */
public class StreamCorpus extends Corpus {

  private static final long serialVersionUID = 42L;

  private final List sources; // DocumentSources, concatenated in order added
  private boolean read;       // set once iterator() has been handed out

  // Package-private: construct via Corpus.createStreamCorpus().
  StreamCorpus() {
    this.sources = new ArrayList();
    this.read = false;
  }

  public Iterator> iterator() {
    if (this.read) {
      throw new UnsupportedOperationException("Cannot read a stream corpus more than once.");
    } else {
      this.read = true;
      // Chain the sources into one continuous document iterator.
      return Iterators.concat(this.sources.iterator());
    }
  }

  public void addSource(DocumentSource source) {
    this.sources.add(source);
  }

  public void close() {
    for (DocumentSource source : this.sources) {
      source.close();
    }
  }
}

// -------- /src/main/java/opennlp/fieldspring/tr/text/Token.java --------
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2010 Travis Brown, The University of Texas at Austin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////////////
package opennlp.fieldspring.tr.text;

import java.io.*;

/**
 * A single token of text.
 */
public interface Token extends Serializable {
  /** Normalized (lower-cased) form used for lookups. */
  public String getForm();
  /** Surface form exactly as it appeared in the source text. */
  public String getOrigForm();
  /** True if this token names a place (i.e. is a Toponym). */
  public boolean isToponym();
}

// -------- /src/main/java/opennlp/fieldspring/tr/text/Toponym.java --------
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2010 Travis Brown, The University of Texas at Austin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////////////
package opennlp.fieldspring.tr.text;

import java.util.List;
import java.io.*;

import opennlp.fieldspring.tr.topo.Location;

/**
 * A token naming a place: carries a candidate Location list, an optional
 * gold-standard candidate, and an optional system-selected candidate.
 * The *Idx methods refer to positions within getCandidates().
 */
public interface Toponym extends Token, Iterable, Serializable {
  /* Gold-standard (hand-annotated) candidate. */
  public boolean hasGold();
  public Location getGold();
  public int getGoldIdx();
  public void setGoldIdx(int idx);

  /* Candidate chosen by a resolver. */
  public boolean hasSelected();
  public Location getSelected();
  public int getSelectedIdx();
  public void setSelectedIdx(int idx);

  /* Candidate list; getAmbiguity() is the number of candidates. */
  public int getAmbiguity();
  public List getCandidates();
  public void setCandidates(List candidates);

  /* The token(s) making up this (possibly multi-word) toponym. */
  public List getTokens();
}

// -------- /src/main/java/opennlp/fieldspring/tr/text/io/GeoTextCorpusKMLWriter.java --------
package opennlp.fieldspring.tr.text.io;

import opennlp.fieldspring.tr.text.*;
import javax.xml.stream.*;
import opennlp.fieldspring.tr.util.*;
import opennlp.fieldspring.tr.topo.*;

public class GeoTextCorpusKMLWriter extends CorpusKMLWriter {
  public GeoTextCorpusKMLWriter(Corpus corpus, boolean outputGoldLocations) {
    super(corpus, outputGoldLocations);
  }

  public GeoTextCorpusKMLWriter(Corpus corpus) {
    this(corpus, false);
  }

  protected void writeDocument(XMLStreamWriter out, Document document) throws XMLStreamException {
    Coordinate coord = outputGoldLocations ?
document.getGoldCoord() : document.getSystemCoord(); 19 | 20 | KMLUtil.writePlacemark(out, document.getId(), coord, KMLUtil.RADIUS); 21 | int sentIndex = 0; 22 | for(Sentence sent : document) { 23 | StringBuffer curTweetSB = new StringBuffer(); 24 | for(Token token : sent) { 25 | if(isSanitary(token.getOrigForm())) 26 | curTweetSB.append(token.getOrigForm()).append(" "); 27 | } 28 | String curTweet = curTweetSB.toString().trim(); 29 | 30 | KMLUtil.writeSpiralPoint(out, document.getId(), 31 | sentIndex, curTweet, 32 | coord.getNthSpiralPoint(sentIndex, KMLUtil.SPIRAL_RADIUS), KMLUtil.RADIUS); 33 | sentIndex++; 34 | } 35 | } 36 | 37 | private String okChars = "!?:;,'\"|+=-_*^%$#@`~(){}[]\\/"; 38 | 39 | private boolean isSanitary(String s) { 40 | for(int i = 0; i < s.length(); i++) { 41 | char curChar = s.charAt(i); 42 | if(!Character.isLetterOrDigit(curChar) && !okChars.contains(curChar + "")) { 43 | return false; 44 | } 45 | } 46 | return true; 47 | } 48 | 49 | protected void write(XMLStreamWriter out) throws Exception { 50 | 51 | KMLUtil.writeHeader(out, "corpus"); 52 | 53 | for(Document doc : corpus) { 54 | writeDocument(out, doc); 55 | } 56 | 57 | KMLUtil.writeFooter(out); 58 | 59 | out.close(); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/text/io/TextSource.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////////////
package opennlp.fieldspring.tr.text.io;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.util.Iterator;

import opennlp.fieldspring.tr.text.Document;
import opennlp.fieldspring.tr.text.DocumentSource;
import opennlp.fieldspring.tr.text.Token;

/**
 * A document source that reads its input line by line from a
 * BufferedReader.
 */
public abstract class TextSource extends DocumentSource {
  protected final BufferedReader reader;

  public TextSource(BufferedReader reader) throws IOException {
    this.reader = reader;
  }

  /**
   * Best-effort line read: returns the next line, or null at end of
   * stream. An IOException is logged to stderr and also yields null, so
   * callers cannot distinguish a read error from EOF.
   */
  protected String readLine() {
    String line = null;
    try {
      line = this.reader.readLine();
    } catch (IOException e) {
      System.err.println("Error while reading document source.");
    }
    return line;
  }

  /** Closes the underlying reader, logging (not propagating) any IOException. */
  public void close() {
    try {
      this.reader.close();
    } catch (IOException e) {
      System.err.println("Error while closing document source.");
    }
  }
}

// -------- /src/main/java/opennlp/fieldspring/tr/text/prep/CandidateRepopulator.java --------
package opennlp.fieldspring.tr.text.prep;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import opennlp.fieldspring.tr.text.Corpus;
import 
opennlp.fieldspring.tr.text.Document;
import opennlp.fieldspring.tr.text.DocumentSource;
import opennlp.fieldspring.tr.text.DocumentSourceWrapper;
import opennlp.fieldspring.tr.text.Sentence;
import opennlp.fieldspring.tr.text.SimpleSentence;
import opennlp.fieldspring.tr.text.SimpleToponym;
import opennlp.fieldspring.tr.text.Token;
import opennlp.fieldspring.tr.text.Toponym;
import opennlp.fieldspring.tr.topo.gaz.Gazetteer;
import opennlp.fieldspring.tr.topo.Location;
import opennlp.fieldspring.tr.util.Span;


/**
 * Source wrapper that replaces every toponym's candidate list with a fresh
 * gazetteer lookup on its normalized form, clearing any gold index (old
 * indices would refer to the discarded list).
 */
public class CandidateRepopulator extends DocumentSourceWrapper {

  private final Gazetteer gazetteer;

  public CandidateRepopulator(DocumentSource source, Gazetteer gazetteer) {
    super(source);
    this.gazetteer = gazetteer;
  }

  /** Wraps the next document so its sentences are repopulated lazily during iteration. */
  public Document next() {
    final Document document = this.getSource().next();
    final Iterator> sentences = document.iterator();

    return new Document(document.getId()) {
      private static final long serialVersionUID = 42L;
      public Iterator> iterator() {
        return new SentenceIterator() {
          public boolean hasNext() {
            return sentences.hasNext();
          }

          public Sentence next() {
            Sentence sentence = sentences.next();
            for(Token token : sentence) {
              if(token.isToponym()) {
                Toponym toponym = (Toponym) token;
                // Null lookup result means "not in gazetteer" -> empty list.
                List candidates = gazetteer.lookup(toponym.getForm());
                if(candidates == null) candidates = new ArrayList();
                toponym.setCandidates(candidates);
                // Invalidate the gold annotation for the replaced list.
                toponym.setGoldIdx(-1);
              }
            }
            return sentence;
            //return new SimpleSentence(sentence.getId(), sentence.getTokens());
          }
        };
      }
    };
  }
}

// -------- /src/main/java/opennlp/fieldspring/tr/text/prep/JythonNER.java --------
/////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.text.prep; 17 | 18 | import javax.script.ScriptEngine; 19 | import javax.script.ScriptException; 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | import opennlp.fieldspring.tr.util.Span; 24 | 25 | public class JythonNER extends ScriptNER { 26 | public JythonNER(String name, NamedEntityType type) { 27 | super("python", name, type); 28 | } 29 | 30 | public JythonNER(String name) { 31 | this(name, NamedEntityType.LOCATION); 32 | } 33 | 34 | public List> recognize(List tokens) { 35 | ScriptEngine engine = this.getEngine(); 36 | engine.put("tokens", tokens); 37 | 38 | try { 39 | engine.eval("spans = recognize(tokens)"); 40 | } catch (ScriptException e) { 41 | return null; 42 | } 43 | 44 | List> tuples = (List>) engine.get("spans"); 45 | List> spans = 46 | new ArrayList>(tuples.size()); 47 | 48 | for (List tuple : tuples) { 49 | spans.add(new Span(tuple.get(0), tuple.get(1), this.getType())); 50 | } 51 | 52 | return spans; 53 | } 54 | } 55 | 56 | -------------------------------------------------------------------------------- 
/src/main/java/opennlp/fieldspring/tr/text/prep/NamedEntityRecognizer.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.text.prep; 17 | 18 | import java.util.List; 19 | 20 | import opennlp.fieldspring.tr.util.Span; 21 | 22 | public interface NamedEntityRecognizer { 23 | public List> recognize(List tokens); 24 | } 25 | 26 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/text/prep/NamedEntityType.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
/**
 * The categories of named entity produced by the recognizers in this
 * package.  LOCATION is the category of interest for toponym resolution.
 */
public enum NamedEntityType {
  DATE,
  LOCATION,
  MONEY,
  ORGANIZATION,
  PERCENTAGE,
  PERSON,
  TIME;
}
15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.text.prep; 17 | 18 | import java.io.File; 19 | import java.io.FileInputStream; 20 | import java.io.IOException; 21 | import java.io.InputStream; 22 | import java.util.ArrayList; 23 | import java.util.Iterator; 24 | import java.util.List; 25 | 26 | import opennlp.tools.namefind.NameFinderME; 27 | import opennlp.tools.namefind.TokenNameFinder; 28 | import opennlp.tools.namefind.TokenNameFinderModel; 29 | import opennlp.tools.util.InvalidFormatException; 30 | 31 | import opennlp.fieldspring.tr.util.Constants; 32 | import opennlp.fieldspring.tr.util.Span; 33 | 34 | public class OpenNLPRecognizer implements NamedEntityRecognizer { 35 | protected final TokenNameFinder finder; 36 | protected final NamedEntityType type; 37 | 38 | public OpenNLPRecognizer() throws IOException, InvalidFormatException { 39 | this(new FileInputStream( 40 | Constants.getOpenNLPModelsDir() + File.separator + "en-ner-location.bin"), 41 | NamedEntityType.LOCATION); 42 | } 43 | 44 | public OpenNLPRecognizer(InputStream in, NamedEntityType type) 45 | throws IOException, InvalidFormatException { 46 | this.finder = new NameFinderME(new TokenNameFinderModel(in)); 47 | this.type = type; 48 | } 49 | 50 | public List> recognize(List tokens) { 51 | List> spans = new ArrayList>(); 52 | for (opennlp.tools.util.Span span : this.finder.find(tokens.toArray(new String[0]))) { 53 | spans.add(new Span(span.getStart(), span.getEnd(), this.type)); 54 | } 55 | return spans; 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/text/prep/OpenNLPSentenceDivider.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | 
// Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.text.prep; 17 | 18 | import java.io.File; 19 | import java.io.FileInputStream; 20 | import java.io.IOException; 21 | import java.io.InputStream; 22 | import java.util.Arrays; 23 | import java.util.List; 24 | 25 | import opennlp.tools.sentdetect.SentenceDetector; 26 | import opennlp.tools.sentdetect.SentenceDetectorME; 27 | import opennlp.tools.sentdetect.SentenceModel; 28 | import opennlp.tools.util.InvalidFormatException; 29 | 30 | import opennlp.fieldspring.tr.util.Constants; 31 | 32 | public class OpenNLPSentenceDivider implements SentenceDivider { 33 | private final SentenceDetector detector; 34 | 35 | public OpenNLPSentenceDivider() throws IOException, InvalidFormatException { 36 | this(new FileInputStream(Constants.getOpenNLPModelsDir() + File.separator + "en-sent.bin")); 37 | } 38 | 39 | public OpenNLPSentenceDivider(InputStream in) throws IOException, InvalidFormatException { 40 | this.detector = new SentenceDetectorME(new SentenceModel(in)); 41 | } 42 | 43 | public List divide(String text) { 44 | return Arrays.asList(this.detector.sentDetect(text)); 45 | } 46 | } 47 | 48 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/text/prep/OpenNLPTokenizer.java: 
-------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.text.prep; 17 | 18 | import java.io.File; 19 | import java.io.FileInputStream; 20 | import java.io.IOException; 21 | import java.io.InputStream; 22 | import java.util.Arrays; 23 | import java.util.List; 24 | 25 | import opennlp.tools.tokenize.TokenizerME; 26 | import opennlp.tools.tokenize.TokenizerModel; 27 | import opennlp.tools.util.InvalidFormatException; 28 | 29 | import opennlp.fieldspring.tr.util.Constants; 30 | 31 | public class OpenNLPTokenizer implements Tokenizer { 32 | private final opennlp.tools.tokenize.Tokenizer tokenizer; 33 | 34 | public OpenNLPTokenizer() throws IOException, InvalidFormatException { 35 | this(new FileInputStream(Constants.getOpenNLPModelsDir() + File.separator + "en-token.bin")); 36 | } 37 | 38 | public OpenNLPTokenizer(InputStream in) throws IOException, InvalidFormatException { 39 | this.tokenizer = new TokenizerME(new TokenizerModel(in)); 40 | } 41 | 42 | public List tokenize(String text) { 43 | return Arrays.asList(this.tokenizer.tokenize(text)); 44 | } 45 | } 46 | 47 | 
-------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/text/prep/ScriptNER.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.text.prep; 17 | 18 | import java.io.InputStream; 19 | import java.io.InputStreamReader; 20 | import java.io.IOException; 21 | import javax.script.ScriptEngine; 22 | import javax.script.ScriptEngineManager; 23 | import javax.script.ScriptException; 24 | 25 | public abstract class ScriptNER implements NamedEntityRecognizer { 26 | private final String language; 27 | private final String name; 28 | private final NamedEntityType type; 29 | private final ScriptEngine engine; 30 | 31 | /** 32 | * Constructor for classes that use the JSR-223 scripting engine to perform 33 | * named entity recognition. 
34 | * 35 | * @param language The JSR-223 name of the scripting language 36 | * @param name The path to the resource containing the script 37 | * @param type The kind of named entity that is recognized 38 | */ 39 | public ScriptNER(String language, String name, NamedEntityType type) { 40 | this.language = language; 41 | this.name = name; 42 | this.type = type; 43 | 44 | ScriptEngineManager manager = new ScriptEngineManager(); 45 | this.engine = manager.getEngineByName(this.language); 46 | 47 | try { 48 | InputStream stream = ScriptNER.class.getResourceAsStream(this.name); 49 | InputStreamReader reader = new InputStreamReader(stream); 50 | this.engine.eval(reader); 51 | stream.close(); 52 | } catch (ScriptException e) { 53 | System.err.println(e); 54 | System.exit(1); 55 | } catch (IOException e) { 56 | System.err.println(e); 57 | System.exit(1); 58 | } 59 | } 60 | 61 | public ScriptNER(String language, String name) { 62 | this(language, name, NamedEntityType.LOCATION); 63 | } 64 | 65 | protected ScriptEngine getEngine() { 66 | return this.engine; 67 | } 68 | 69 | protected NamedEntityType getType() { 70 | return this.type; 71 | } 72 | } 73 | 74 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/text/prep/SentenceDivider.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
/**
 * Splits raw text into sentence strings.
 */
public interface SentenceDivider {
  /** Returns the sentences of the given text, in order. */
  public List<String> divide(String text);
}
/**
 * Splits a string (typically a single sentence) into token strings.
 */
public interface Tokenizer {
  /** Returns the tokens of the given text, in order. */
  public List<String> tokenize(String text);
}
15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.text.prep; 17 | 18 | import java.util.ArrayList; 19 | import java.util.Iterator; 20 | import java.util.List; 21 | 22 | import opennlp.fieldspring.tr.text.Corpus; 23 | import opennlp.fieldspring.tr.text.Document; 24 | import opennlp.fieldspring.tr.text.DocumentSource; 25 | import opennlp.fieldspring.tr.text.DocumentSourceWrapper; 26 | import opennlp.fieldspring.tr.text.Sentence; 27 | import opennlp.fieldspring.tr.text.SimpleSentence; 28 | import opennlp.fieldspring.tr.text.SimpleToponym; 29 | import opennlp.fieldspring.tr.text.Token; 30 | import opennlp.fieldspring.tr.text.Toponym; 31 | import opennlp.fieldspring.tr.topo.gaz.Gazetteer; 32 | import opennlp.fieldspring.tr.util.Span; 33 | 34 | /** 35 | * Wraps a document source and removes any toponyms spans that it contains, 36 | * returning only the tokens. 37 | * 38 | * @author Travis Brown 39 | * @version 0.1.0 40 | */ 41 | public class ToponymRemover extends DocumentSourceWrapper { 42 | public ToponymRemover(DocumentSource source) { 43 | super(source); 44 | } 45 | 46 | public Document next() { 47 | final Document document = this.getSource().next(); 48 | final Iterator> sentences = document.iterator(); 49 | 50 | return new Document(document.getId()) { 51 | private static final long serialVersionUID = 42L; 52 | public Iterator> iterator() { 53 | return new SentenceIterator() { 54 | public boolean hasNext() { 55 | return sentences.hasNext(); 56 | } 57 | 58 | public Sentence next() { 59 | Sentence sentence = sentences.next(); 60 | return new SimpleSentence(sentence.getId(), sentence.getTokens()); 61 | } 62 | }; 63 | } 64 | }; 65 | } 66 | } 67 | 68 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/topo/PointRegion.java: -------------------------------------------------------------------------------- 1 | 
/////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.topo; 17 | 18 | import java.util.ArrayList; 19 | import java.util.List; 20 | 21 | public class PointRegion extends Region { 22 | 23 | private static final long serialVersionUID = 42L; 24 | 25 | private Coordinate coordinate; 26 | 27 | public PointRegion(Coordinate coordinate) { 28 | this.coordinate = coordinate; 29 | } 30 | 31 | public Coordinate getCenter() { 32 | return this.coordinate; 33 | } 34 | 35 | public void setCenter(Coordinate coord) { 36 | this.coordinate = coord; 37 | } 38 | 39 | public boolean contains(double lat, double lng) { 40 | return lat == this.coordinate.getLat() && lng == this.coordinate.getLng(); 41 | } 42 | 43 | public double getMinLat() { 44 | return this.coordinate.getLat(); 45 | } 46 | 47 | public double getMaxLat() { 48 | return this.coordinate.getLat(); 49 | } 50 | 51 | public double getMinLng() { 52 | return this.coordinate.getLng(); 53 | } 54 | 55 | public double getMaxLng() { 56 | return this.coordinate.getLng(); 57 | } 58 | 59 | public List getRepresentatives() { 60 | List representatives = new ArrayList(1); 61 | representatives.add(this.coordinate); 62 | return 
representatives; 63 | } 64 | 65 | public void setRepresentatives(List coordinates) { 66 | this.coordinate = coordinates.get(0); 67 | } 68 | } 69 | 70 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/topo/gaz/FilteredGeoNamesReader.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.topo.gaz; 17 | 18 | import java.io.File; 19 | import java.io.FileInputStream; 20 | import java.io.FileNotFoundException; 21 | import java.io.InputStreamReader; 22 | import java.io.IOException; 23 | import java.io.BufferedReader; 24 | import java.util.zip.GZIPInputStream; 25 | 26 | import opennlp.fieldspring.tr.topo.Location; 27 | 28 | public class FilteredGeoNamesReader extends GeoNamesReader { 29 | public FilteredGeoNamesReader(File file) throws FileNotFoundException, IOException { 30 | this(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file))))); 31 | } 32 | 33 | public FilteredGeoNamesReader(BufferedReader reader) 34 | throws FileNotFoundException, IOException { 35 | super(reader); 36 | } 37 | 38 | protected Location parseLine(String line, int currentId) { 39 | Location location = super.parseLine(line, currentId); 40 | if (location != null) { 41 | Location.Type type = location.getType(); 42 | if (type != Location.Type.STATE && type != Location.Type.CITY) { 43 | location = null; 44 | } 45 | } 46 | return location; 47 | } 48 | } 49 | 50 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/topo/gaz/Gazetteer.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.topo.gaz; 17 | 18 | import java.util.List; 19 | 20 | import opennlp.fieldspring.tr.topo.Location; 21 | 22 | /** 23 | * Represents a mapping from toponym strings to lists of location candidates. 24 | * 25 | * @author Travis Brown 26 | */ 27 | public interface Gazetteer { 28 | /** 29 | * Lookup a toponym in the gazetteer, returning null if no candidate list is 30 | * found. 31 | */ 32 | public List lookup(String query); 33 | } 34 | 35 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/topo/gaz/GazetteerFileReader.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/**
 * Base class for gazetteer readers that pull entries from a buffered
 * character stream.  Subclasses read input through {@link #readLine}.
 */
public abstract class GazetteerFileReader extends GazetteerReader {
  private final BufferedReader reader;

  protected GazetteerFileReader(BufferedReader reader)
    throws FileNotFoundException, IOException {
    this.reader = reader;
  }

  /**
   * Returns the next line, or null at end of stream or on I/O error.  Read
   * errors are printed to stderr (with a stack trace) rather than
   * propagated, so a failed read looks like end of input to callers.
   */
  protected String readLine() {
    String line = null;
    try {
      line = this.reader.readLine();
    } catch (IOException e) {
      System.err.format("Error while reading gazetteer file: %s\n", e);
      e.printStackTrace();
    }
    return line;
  }

  /**
   * Closes the underlying reader.
   * NOTE(review): a failure to close terminates the whole JVM via
   * System.exit(1), unlike readLine() which merely logs — confirm this
   * asymmetry is intentional.
   */
  public void close() {
    try {
      this.reader.close();
    } catch (IOException e) {
      System.err.format("Error closing gazetteer file: %s\n", e);
      e.printStackTrace();
      System.exit(1);
    }
  }
}
/**
 * Base class for gazetteer readers that produce one location per input
 * line.  Subclasses implement {@link #parseLine}; lines for which it
 * returns null are skipped.  Iteration is one-location lookahead: the next
 * location is parsed eagerly so hasNext() can answer without consuming.
 */
public abstract class GazetteerLineReader extends GazetteerFileReader {
  // Lookahead: the next location to hand out, or null at end of input.
  private Location current;
  // Monotonically increasing id passed to parseLine, bumped once per
  // successfully found location (not once per line).
  private int currentId;

  protected GazetteerLineReader(BufferedReader reader)
    throws FileNotFoundException, IOException {
    super(reader);
    // NOTE(review): nextLocation() runs before currentId is assigned below,
    // so the first location is parsed with currentId == 0 and the counter is
    // then reset to 1 (which nextLocation had already made it) — confirm
    // whether ids are meant to start at 0 or 1.
    this.current = this.nextLocation();
    this.currentId = 1;
  }

  /** Parses one input line into a Location, or returns null to skip it. */
  protected abstract Location parseLine(String line, int currentId);

  // Advances through input lines until one parses successfully or the
  // stream is exhausted; increments the id counter exactly once per call.
  private Location nextLocation() {
    Location location = null;
    for (String line = this.readLine(); line != null; line = this.readLine()) {
      location = this.parseLine(line, this.currentId);
      if (location != null) break;
    }
    this.currentId++;
    //if (this.currentId % 50000 == 0) { System.out.format("At location id: %d.\n", this.currentId); }
    return location;
  }

  public boolean hasNext() {
    return this.current != null;
  }

  public Location next() {
    Location location = this.current;
    this.current = this.nextLocation();
    return location;
  }
}
/src/main/java/opennlp/fieldspring/tr/topo/gaz/GazetteerReader.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.topo.gaz; 17 | 18 | import java.util.Iterator; 19 | import opennlp.fieldspring.tr.topo.Location; 20 | 21 | public abstract class GazetteerReader implements Iterable, 22 | Iterator { 23 | public abstract void close(); 24 | 25 | protected Location.Type getLocationType(String code) { 26 | return Location.Type.UNKNOWN; 27 | } 28 | 29 | protected Location.Type getLocationType(String code, String fine) { 30 | return this.getLocationType(code); 31 | } 32 | 33 | public Iterator iterator() { 34 | return this; 35 | } 36 | 37 | public void remove() { 38 | throw new UnsupportedOperationException("Cannot remove location from gazetteer."); 39 | } 40 | } 41 | 42 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/topo/gaz/InMemoryGazetteer.java: -------------------------------------------------------------------------------- 1 | 
/////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.topo.gaz; 17 | 18 | import java.util.ArrayList; 19 | import java.util.HashMap; 20 | import java.util.List; 21 | import java.util.Map; 22 | 23 | import opennlp.fieldspring.tr.topo.Location; 24 | 25 | public class InMemoryGazetteer extends LoadableGazetteer { 26 | private final Map> map; 27 | 28 | public InMemoryGazetteer() { 29 | this.map = new HashMap>(); 30 | } 31 | 32 | public void add(String name, Location location) { 33 | name = name.toLowerCase(); 34 | List locations = this.map.get(name); 35 | if (locations == null) { 36 | locations = new ArrayList(); 37 | } 38 | locations.add(location); 39 | this.map.put(name, locations); 40 | } 41 | 42 | public List lookup(String query) { 43 | return this.map.get(query.toLowerCase()); 44 | } 45 | } 46 | 47 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/topo/gaz/LoadableGazetteer.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, 
The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.topo.gaz; 17 | 18 | import java.util.List; 19 | import opennlp.fieldspring.tr.topo.Location; 20 | 21 | public abstract class LoadableGazetteer implements Gazetteer { 22 | public abstract void add(String name, Location location); 23 | 24 | public int load(GazetteerReader reader) { 25 | int count = 0; 26 | for (Location location : reader) { 27 | count++; 28 | this.add(location.getName(), location); 29 | } 30 | reader.close(); 31 | this.finishLoading(); 32 | return count; 33 | } 34 | 35 | public void finishLoading() {} 36 | public void close() {} 37 | } 38 | 39 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/topo/gaz/MultiGazetteer.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.topo.gaz; 17 | 18 | import java.util.ArrayList; 19 | import java.util.List; 20 | import opennlp.fieldspring.tr.topo.Location; 21 | 22 | public class MultiGazetteer implements Gazetteer { 23 | private final List gazetteers; 24 | 25 | public MultiGazetteer(List gazetteers) { 26 | this.gazetteers = gazetteers; 27 | } 28 | 29 | public List lookup(String query) { 30 | for (Gazetteer gazetteer : this.gazetteers) { 31 | List candidates = gazetteer.lookup(query); 32 | if (candidates != null) { 33 | return candidates; 34 | } 35 | } 36 | return null; 37 | } 38 | } 39 | 40 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/util/CountingLexicon.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.util; 17 | 18 | import java.io.Serializable; 19 | import java.util.List; 20 | 21 | public interface CountingLexicon extends Lexicon, Serializable { 22 | public int count(A entry); 23 | public int countAtIndex(int index); 24 | } 25 | 26 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/util/DoubleStringPair.java: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Taesun Moon, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
/**
 * A (double, String) pair ordered by descending double value.
 *
 * @author Taesun Moon
 */
public final class DoubleStringPair implements Comparable<DoubleStringPair> {
    /** Numeric component; also the (descending) sort key. */
    public double doubleValue;
    /** String payload; does not participate in ordering. */
    public String stringValue;

    /**
     * @param d numeric component
     * @param s string payload
     */
    public DoubleStringPair (double d, String s) {
        doubleValue = d;
        stringValue = s;
    }

    /**
     * Sorting order is reversed -- higher values come first.
     *
     * BUG FIX: the original hand-rolled comparison returned 0 whenever
     * either value was NaN, which breaks the Comparable contract
     * (non-transitive ordering) and can corrupt sorts.  Double.compare
     * imposes a total order; arguments are swapped to keep the descending
     * direction.
     */
    public int compareTo (DoubleStringPair p) {
        return Double.compare(p.doubleValue, doubleValue);
    }
}
/**
 * Tools for checking memory usage at runtime.  Measurements are
 * best-effort: System.gc() is only a hint to the JVM.
 */
public class MemoryUtil {
    /** Returns currently used heap bytes (total - free) after nudging GC. */
    public static long getMemoryUsage(){
        takeOutGarbage();
        long totalMemory = Runtime.getRuntime().totalMemory();

        takeOutGarbage();
        long freeMemory = Runtime.getRuntime().freeMemory();

        return (totalMemory - freeMemory);
    }

    /** Runs the GC nudge twice to improve the odds of a full collection. */
    private static void takeOutGarbage() {
        collectGarbage();
        collectGarbage();
    }

    private static void collectGarbage() {
        try {
            System.gc();
            // BUG FIX: sleep() is static; calling it via
            // Thread.currentThread() misleadingly suggests it sleeps a
            // particular thread instance.
            Thread.sleep(100);
            System.runFinalization();
            Thread.sleep(100);
        }
        catch (InterruptedException ex){
            // BUG FIX: the original caught Exception and only printed the
            // stack trace, silently swallowing interruption.  Re-assert the
            // interrupt flag so callers can observe it.
            Thread.currentThread().interrupt();
        }
    }
}
/**
 * An immutable span over a token sequence carrying an attached item of type
 * A (e.g. a named-entity label).
 *
 * Type parameter restored: the extracted text showed a raw
 * "class Span implements Serializable" while the field "A item" references
 * the stripped type variable.
 */
public class Span<A> implements Serializable {

  private static final long serialVersionUID = 42L;

  private final int start;
  private final int end;
  private final A item;

  public Span(int start, int end, A item) {
    this.start = start;
    this.end = end;
    this.item = item;
  }

  /** First index of the span (inclusive). */
  public int getStart() {
    return this.start;
  }

  /** Last index of the span.  NOTE(review): presumably exclusive, matching
   * half-open [start, end) convention — confirm against callers. */
  public int getEnd() {
    return this.end;
  }

  /** The attached item. */
  public A getItem() {
    return this.item;
  }
}
/**
 * A (String, double) pair ordered alphabetically by the string component.
 * The double value does not participate in the ordering, so two pairs with
 * equal strings but different doubles compare as equal.
 *
 * @author tsmoon
 */
public class StringDoublePair implements Comparable<StringDoublePair> {

    /** String component; also the sort key. */
    public String stringValue;
    /** Numeric payload. */
    public double doubleValue;

    public StringDoublePair(String s, double d) {
        stringValue = s;
        doubleValue = d;
    }

    /**
     * Compares by string only.  Type parameter restored: the extracted text
     * showed a raw "implements Comparable", which forces unchecked casts at
     * sort time.
     */
    public int compareTo(StringDoublePair p) {
        return stringValue.compareTo(p.stringValue);
    }
}
15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.util; 17 | 18 | import java.util.ArrayList; 19 | import java.util.Collections; 20 | import java.util.Iterator; 21 | import java.util.List; 22 | 23 | import opennlp.fieldspring.tr.util.Span; 24 | 25 | public class StringEditMapper extends EditMapper { 26 | public StringEditMapper(List s, List t) { 27 | super(s, t); 28 | } 29 | 30 | @Override 31 | protected int delCost(String x) { 32 | return x.length(); 33 | } 34 | 35 | @Override 36 | protected int insCost(String x) { 37 | return x.length(); 38 | } 39 | 40 | @Override 41 | protected int subCost(String x, String y) { 42 | int[][] ds = new int[x.length() + 1][y.length() + 1]; 43 | for (int i = 0; i <= x.length(); i++) { ds[i][0] = i; } 44 | for (int j = 0; j <= y.length(); j++) { ds[0][j] = j; } 45 | 46 | for (int i = 1; i <= x.length(); i++) { 47 | for (int j = 1; j <= x.length(); j++) { 48 | int del = ds[i - 1][j] + 1; 49 | int ins = ds[1][j - 1] + 1; 50 | int sub = ds[i - 1][j - 1] + (x.charAt(i - 1) == y.charAt(j - 1) ? 
0 : 1); 51 | ds[i][j] = StringEditMapper.minimum(del, ins, sub); 52 | } 53 | } 54 | 55 | return ds[x.length()][y.length()]; 56 | } 57 | 58 | private static int minimum(int a, int b, int c) { 59 | return Math.min(Math.min(a, b), c); 60 | } 61 | } 62 | 63 | -------------------------------------------------------------------------------- /src/main/java/opennlp/fieldspring/tr/util/ToponymFinder.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package opennlp.fieldspring.tr.util; 5 | 6 | import java.io.BufferedReader; 7 | import java.io.FileInputStream; 8 | import java.io.FileReader; 9 | import java.io.IOException; 10 | import java.io.InputStreamReader; 11 | import java.util.ArrayList; 12 | import java.util.HashSet; 13 | import java.util.List; 14 | import java.util.zip.ZipEntry; 15 | import java.util.zip.ZipFile; 16 | import java.util.zip.ZipInputStream; 17 | 18 | import opennlp.fieldspring.tr.text.prep.HighRecallToponymRecognizer; 19 | import opennlp.fieldspring.tr.text.prep.NamedEntityRecognizer; 20 | import opennlp.fieldspring.tr.text.prep.NamedEntityType; 21 | import opennlp.fieldspring.tr.text.prep.OpenNLPRecognizer; 22 | import opennlp.fieldspring.tr.text.prep.OpenNLPSentenceDivider; 23 | import opennlp.fieldspring.tr.text.prep.OpenNLPTokenizer; 24 | import opennlp.fieldspring.tr.text.prep.SentenceDivider; 25 | import opennlp.fieldspring.tr.text.prep.Tokenizer; 26 | import opennlp.fieldspring.tr.topo.gaz.GeoNamesGazetteer; 27 | import opennlp.fieldspring.tr.util.Span; 28 | import opennlp.tools.util.InvalidFormatException; 29 | 30 | /** 31 | * @author abhimanu kumar 32 | * 33 | */ 34 | public class ToponymFinder { 35 | 36 | /** 37 | * @param args 38 | */ 39 | private final SentenceDivider sentDivider; 40 | private final Tokenizer tokenizer; 41 | private final NamedEntityRecognizer recognizer; 42 | private BufferedReader input; 43 | 44 | public ToponymFinder(BufferedReader reader, String gazPath) throws 
Exception{ 45 | sentDivider = new OpenNLPSentenceDivider(); 46 | tokenizer = new OpenNLPTokenizer(); 47 | recognizer = new HighRecallToponymRecognizer(gazPath); 48 | this.input = reader; 49 | } 50 | 51 | 52 | public static void main(String[] args) throws Exception { 53 | ToponymFinder finder = new ToponymFinder(new BufferedReader(new FileReader(args[0]/*"TheStoryTemp.txt"*/)),args[1]/*"data/gazetteers/US.ser.gz"*/); 54 | // long startTime = System.currentTimeMillis(); 55 | finder.find(); 56 | // long stopTime = System.currentTimeMillis(); 57 | // System.out.println((stopTime-startTime)/1000 + "secs"); 58 | } 59 | 60 | 61 | private HashSet find() throws IOException { 62 | String line; 63 | HashSet resultSet = new HashSet(); 64 | while((line=input.readLine())!=null){ 65 | List sentencesString = sentDivider.divide(line); 66 | for (String sentence : sentencesString){ 67 | List tokens = new ArrayList(); 68 | for(String token : tokenizer.tokenize(sentence)){ 69 | tokens.add(token); 70 | } 71 | List> spans =recognizer.recognize(tokens); 72 | for(Span span:spans){ 73 | StringBuilder resultToken= new StringBuilder(); 74 | for(int i=span.getStart();i 0): 12 | print lineToPrint 13 | curLine = inFile.readline() 14 | 15 | def processDirectory(dirname): 16 | fileList = os.listdir(dirname) 17 | if(not dirname[-1] == "/"): 18 | dirname += "/" 19 | for filename in fileList: 20 | if(os.path.isdir(dirname + filename)): 21 | processDirectory(dirname + filename) 22 | elif(os.path.isfile(dirname + filename)): 23 | processFile(dirname + filename) 24 | 25 | for filename in sys.argv[1:]: 26 | processFile(filename) 27 | -------------------------------------------------------------------------------- /src/main/python/splitdevtest.py: -------------------------------------------------------------------------------- 1 | import sys, shutil, os 2 | 3 | def processDirectory(dirname): 4 | fileList = os.listdir(dirname) 5 | if(not dirname[-1] == "/"): 6 | dirname += "/" 7 | count = 0 8 | for 
filename in fileList: 9 | if(count % 3 == 2): 10 | shutil.copy(dirname + filename, sys.argv[3]) 11 | print (dirname + filename) + " --> " + sys.argv[3] 12 | else: 13 | shutil.copy(dirname + filename, sys.argv[2]) 14 | print (dirname + filename) + " --> " + sys.argv[2] 15 | count += 1 16 | 17 | processDirectory(sys.argv[1]) 18 | -------------------------------------------------------------------------------- /src/main/python/tei2txt.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import re 6 | import gzip 7 | import fnmatch 8 | 9 | from codecs import latin_1_decode 10 | from unicodedata import normalize 11 | from tei_entities import pcl_tei_entities 12 | 13 | commaRE = re.compile(",") 14 | nonAlpha = re.compile("[^A-Za-z]") 15 | 16 | pte = pcl_tei_entities() 17 | 18 | def cleanWord(word): 19 | word = word.lower() 20 | if len(word) < 2: 21 | word = "" 22 | return word 23 | 24 | def strip_text (text): 25 | text = latin_1_decode(text)[0] 26 | text = normalize('NFD',text).encode('ascii','ignore') 27 | 28 | text = re.sub('&mdash+;', ' ', text) # convert mdash to " " 29 | # text = re.sub('&', ' and ', text) # convert mdash to " " 30 | text = pte.replace_entities(text) 31 | # text = re.sub('&[A-Za-z]+;', '', text) # convert ampersand stuff to "" 32 | text = re.sub('<[^>]*>', ' ', text) # strip HTML markup 33 | text = re.sub('\s+', ' ', text) # strip whitespace 34 | 35 | return text 36 | 37 | directory_name = sys.argv[1] 38 | output_raw_dir = sys.argv[2] 39 | 40 | if not os.path.exists(output_raw_dir): 41 | os.makedirs(output_raw_dir) 42 | 43 | files = os.listdir(directory_name) 44 | for file in files: 45 | add_line = False 46 | write_text = False 47 | if fnmatch.fnmatch(file,"*.xml"): 48 | print "******",file 49 | newname = file[:-4]+".txt" 50 | raw_writer = open(output_raw_dir+"/"+newname,"w") 51 | file_reader = open(directory_name+"/"+file) 52 | text = "" 53 | 54 | 
import sys, os

# Output directory; set from argv when run as a script (see __main__ block).
outDir = None

def processFile(filename, out_dir=None):
    """Convert one TR raw token file to plain text.

    Takes the first whitespace-separated token of each non-indented line,
    undoes a handful of escape sequences, and writes the tokens to
    out_dir/<basename minus its last 3 chars>.txt, separated by single
    spaces (a space is only emitted before tokens starting alphanumerically).

    out_dir falls back to the module-level outDir so the original
    script-style callers keep working.
    """
    if out_dir is None:
        out_dir = outDir
    newFilename = filename[filename.rfind("/")+1:-3] + ".txt"
    # BUG FIX: the original never closed outFile (handle leak; buffered
    # output could be lost); both files are now closed via try/finally.
    inFile = open(filename, 'r')
    outFile = open(out_dir + newFilename, 'w')
    try:
        wroteSomething = False
        while True:
            curLine = inFile.readline()
            if curLine == "":
                break
            if curLine.startswith(" ") or curLine.startswith("\t"):
                continue
            fields = curLine.split()
            # BUG FIX: the original indexed split()[0] unconditionally and
            # crashed with IndexError on blank lines.
            if not fields:
                continue
            # NOTE(review): several of these replacements are no-ops as
            # written ("$"->"$", "&"->"&"); they look like HTML-entity
            # patterns mangled by an earlier extraction step (&amp; etc.) —
            # confirm against the upstream source.  Kept byte-for-byte.
            processedToken = fields[0].replace("&equo;", "'").replace("&dquo;", '"').replace("$", "$").replace("‐", "-").replace("&", "&").replace("×", "*")
            if not processedToken:
                continue
            if processedToken[0].isalnum() and wroteSomething:
                outFile.write(" ")
            outFile.write(processedToken)
            wroteSomething = True
    finally:
        inFile.close()
        outFile.close()

def processDirectory(dirname, out_dir=None):
    """Recursively convert every regular file under dirname."""
    fileList = os.listdir(dirname)
    if not dirname.endswith("/"):
        dirname += "/"
    for filename in fileList:
        if os.path.isdir(dirname + filename):
            processDirectory(dirname + filename, out_dir)
        elif os.path.isfile(dirname + filename):
            processFile(dirname + filename, out_dir)

# BUG FIX: the original read sys.argv at import time, which made the module
# unimportable (IndexError when argv is short); the CLI behavior is now
# confined to direct execution.
if __name__ == "__main__":
    outDir = sys.argv[2]
    if not outDir.endswith("/"):
        outDir += "/"
    processDirectory(sys.argv[1])
import re

# NOTE: Courtesy of Frederik Lundh.
#
# http://effbot.org/zone/re-sub.htm#unescape-html

# BUG FIX: the module was Python-2-only (htmlentitydefs, unichr) and failed
# to import on Python 3.  This stdlib-only shim keeps Python 2 behavior
# identical while making the module usable on Python 3 as well.
try:
    from html.entities import name2codepoint  # Python 3
    unichr = chr
except ImportError:
    from htmlentitydefs import name2codepoint  # Python 2

##
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.

def unescape(text):
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # numeric character reference (decimal or &#x.. hexadecimal)
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity; unknown names fall through and are left as-is
            try:
                text = unichr(name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    # raw string: "\w" in a plain literal is an invalid-escape warning on py3
    return re.sub(r"&#?\w+;", fixup, text)
eures 92 | für 93 | gegen 94 | gewesen 95 | hab 96 | habe 97 | haben 98 | hat 99 | hatte 100 | hatten 101 | hier 102 | hin 103 | hinter 104 | ich 105 | mich 106 | mir 107 | ihr 108 | ihre 109 | ihrem 110 | ihren 111 | ihrer 112 | ihres 113 | euch 114 | im 115 | in 116 | indem 117 | ins 118 | ist 119 | jede 120 | jedem 121 | jeden 122 | jeder 123 | jedes 124 | jene 125 | jenem 126 | jenen 127 | jener 128 | jenes 129 | jetzt 130 | kann 131 | kein 132 | keine 133 | keinem 134 | keinen 135 | keiner 136 | keines 137 | können 138 | könnte 139 | machen 140 | man 141 | manche 142 | manchem 143 | manchen 144 | mancher 145 | manches 146 | mein 147 | meine 148 | meinem 149 | meinen 150 | meiner 151 | meines 152 | mit 153 | muss 154 | musste 155 | nach 156 | nicht 157 | nichts 158 | noch 159 | nun 160 | nur 161 | ob 162 | oder 163 | ohne 164 | sehr 165 | sein 166 | seine 167 | seinem 168 | seinen 169 | seiner 170 | seines 171 | selbst 172 | sich 173 | sie 174 | ihnen 175 | sind 176 | so 177 | solche 178 | solchem 179 | solchen 180 | solcher 181 | solches 182 | soll 183 | sollte 184 | sondern 185 | sonst 186 | über 187 | um 188 | und 189 | uns 190 | unse 191 | unsem 192 | unsen 193 | unser 194 | unses 195 | unter 196 | viel 197 | vom 198 | von 199 | vor 200 | während 201 | war 202 | waren 203 | warst 204 | was 205 | weg 206 | weil 207 | weiter 208 | welche 209 | welchem 210 | welchen 211 | welcher 212 | welches 213 | wenn 214 | werde 215 | werden 216 | wie 217 | wieder 218 | will 219 | wir 220 | wird 221 | wirst 222 | wo 223 | wollen 224 | wollte 225 | würde 226 | würden 227 | zu 228 | zum 229 | zur 230 | zwar 231 | zwischen 232 | -------------------------------------------------------------------------------- /src/main/resources/data/por/stopwords.txt: -------------------------------------------------------------------------------- 1 | de 2 | a 3 | o 4 | que 5 | e 6 | do 7 | da 8 | em 9 | um 10 | para 11 | com 12 | não 13 | uma 14 | os 15 | no 16 | se 17 | na 18 | por 19 | 
mais 20 | as 21 | dos 22 | como 23 | mas 24 | ao 25 | ele 26 | das 27 | à 28 | seu 29 | sua 30 | ou 31 | quando 32 | muito 33 | nos 34 | já 35 | eu 36 | também 37 | só 38 | pelo 39 | pela 40 | até 41 | isso 42 | ela 43 | entre 44 | depois 45 | sem 46 | mesmo 47 | aos 48 | seus 49 | quem 50 | nas 51 | me 52 | esse 53 | eles 54 | você 55 | essa 56 | num 57 | nem 58 | suas 59 | meu 60 | às 61 | minha 62 | numa 63 | pelos 64 | elas 65 | qual 66 | nós 67 | lhe 68 | deles 69 | essas 70 | esses 71 | pelas 72 | este 73 | dele 74 | tu 75 | te 76 | vocês 77 | vos 78 | lhes 79 | meus 80 | minhas 81 | teu 82 | tua 83 | teus 84 | tuas 85 | nosso 86 | nossa 87 | nossos 88 | nossas 89 | dela 90 | delas 91 | esta 92 | estes 93 | estas 94 | aquele 95 | aquela 96 | aqueles 97 | aquelas 98 | isto 99 | aquilo 100 | estou 101 | está 102 | estamos 103 | estão 104 | estive 105 | esteve 106 | estivemos 107 | estiveram 108 | estava 109 | estávamos 110 | estavam 111 | estivera 112 | estivéramos 113 | esteja 114 | estejamos 115 | estejam 116 | estivesse 117 | estivéssemos 118 | estivessem 119 | estiver 120 | estivermos 121 | estiverem 122 | hei 123 | há 124 | havemos 125 | hão 126 | houve 127 | houvemos 128 | houveram 129 | houvera 130 | houvéramos 131 | haja 132 | hajamos 133 | hajam 134 | houvesse 135 | houvéssemos 136 | houvessem 137 | houver 138 | houvermos 139 | houverem 140 | houverei 141 | houverá 142 | houveremos 143 | houverão 144 | houveria 145 | houveríamos 146 | houveriam 147 | sou 148 | somos 149 | são 150 | era 151 | éramos 152 | eram 153 | fui 154 | foi 155 | fomos 156 | foram 157 | fora 158 | fôramos 159 | seja 160 | sejamos 161 | sejam 162 | fosse 163 | fôssemos 164 | fossem 165 | for 166 | formos 167 | forem 168 | serei 169 | será 170 | seremos 171 | serão 172 | seria 173 | seríamos 174 | seriam 175 | tenho 176 | tem 177 | temos 178 | tém 179 | tinha 180 | tínhamos 181 | tinham 182 | tive 183 | teve 184 | tivemos 185 | tiveram 186 | tivera 187 | tivéramos 188 | tenha 189 | 
tenhamos 190 | tenham 191 | tivesse 192 | tivéssemos 193 | tivessem 194 | tiver 195 | tivermos 196 | tiverem 197 | terei 198 | terá 199 | teremos 200 | terão 201 | teria 202 | teríamos 203 | teriam 204 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/geolocate/CombinedModelCell.scala: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // CombinedModelCellGrid.scala 3 | // 4 | // Copyright (C) 2012 Stephen Roller, The University of Texas at Austin 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 
/**
 * Cell grid that combines several underlying grids, delegating training to
 * each and resolving documents to the best candidate cell (nearest center)
 * across all models.
 */
class CombinedModelCellGrid(table: SphereDocumentTable,
                            models: Seq[SphereCellGrid])
    extends SphereCellGrid(table) {

  // NOTE(review): both of these throw on an empty `models` Seq (sum is fine,
  // max is not); construction with no models appears unsupported — confirm.
  override var total_num_cells: Int = models.map(_.total_num_cells).sum
  override val num_training_passes: Int = models.map(_.num_training_passes).max

  var current_training_pass: Int = 0

  override def begin_training_pass(pass: Int) = {
    current_training_pass = pass
    for (model <- models) {
      if (pass <= model.num_training_passes) {
        model.begin_training_pass(pass)
      }
    }
  }

  def find_best_cell_for_document(doc: SphereDocument,
      create_non_recorded: Boolean) = {
    val candidates =
      models.map(_.find_best_cell_for_document(doc, create_non_recorded))
        .filter(_ != null)
    // BUG FIX: minBy throws UnsupportedOperationException on an empty
    // collection.  Since nulls are filtered above, every sub-model may have
    // returned null; report "no cell" the same way the sub-models do.
    if (candidates.isEmpty)
      null
    else
      candidates.minBy((cell: SphereCell) =>
        spheredist(cell.get_center_coord, doc.coord))
  }

  def add_document_to_cell(document: SphereDocument) {
    for (model <- models) {
      if (current_training_pass <= model.num_training_passes) {
        model.add_document_to_cell(document)
      }
    }
  }

  def initialize_cells() {
  }

  override def finish() {
    for (model <- models) {
      model.finish()
    }
    num_non_empty_cells = models.map(_.num_non_empty_cells).sum
  }

  def iter_nonempty_cells(nonempty_word_dist: Boolean = false): Iterable[SphereCell] = {
    // BUG FIX: reduce throws on an empty sequence; reduceOption degrades
    // gracefully to an empty Iterable.
    models.map(_.iter_nonempty_cells(nonempty_word_dist))
      .reduceOption(_ ++ _).getOrElse(Iterable.empty)
  }
}
///////////////////////////////////////////////////////////////////////////////
// TwitterDocument.scala
//
// Copyright (C) 2011 Ben Wing, The University of Texas at Austin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////////////

package opennlp.fieldspring.geolocate

import opennlp.fieldspring.util.textdbutil.Schema

import opennlp.fieldspring.worddist.WordDist.memoizer._

/**
 * A SphereDocument holding a single tweet.  The numeric tweet id (parsed
 * from the "title" field) identifies the document and doubles as its title.
 */
class TwitterTweetDocument(
  schema: Schema,
  subtable: TwitterTweetDocumentSubtable
) extends SphereDocument(schema, subtable.table) {
  // Numeric tweet id; 0 until set_field sees a "title" column.
  var id = 0L
  def title = id.toString

  /** Intercept the "title" column, which holds the tweet id as a decimal
      long; all other fields are handled by the superclass. */
  override def set_field(field: String, value: String) {
    field match {
      case "title" => id = value.toLong
      case _ => super.set_field(field, value)
    }
  }

  // NOTE(review): the XML-literal markup of this method appears to have
  // been stripped in this copy of the file -- the bare brace groups below
  // are what remains of an XML fragment wrapping { id } and, when
  // has_coord is true, { coord }.  Recover the tags from version control
  // before editing.
  def struct =

    { id }
    {
      if (has_coord)
        { coord }
    }

}

/** Factory subtable producing TwitterTweetDocument instances for `table`. */
class TwitterTweetDocumentSubtable(
  table: SphereDocumentTable
) extends SphereDocumentSubtable[TwitterTweetDocument](table) {
  def create_document(schema: Schema) = new TwitterTweetDocument(schema, this)
}
package opennlp.fieldspring.gridlocate

import opennlp.fieldspring.util.printutil.errprint

/**
 Fieldspring-specific information (e.g. env vars).
 */

object FieldspringInfo {
  // Cached top-level directory of the Fieldspring installation; null until
  // set explicitly or read from the environment.
  var fieldspring_dir: String = null

  /** Record the Fieldspring installation directory explicitly. */
  def set_fieldspring_dir(dir: String) {
    fieldspring_dir = dir
  }

  /**
   * Return the Fieldspring installation directory.  Falls back to the
   * FIELDSPRING_DIR environment variable when the directory was never set
   * programmatically; aborts (via a failing `require`) when neither source
   * provides a value.
   */
  def get_fieldspring_dir() = {
    if (fieldspring_dir == null) {
      // Never set explicitly -- consult the environment.
      fieldspring_dir = System.getenv("FIELDSPRING_DIR")
      if (fieldspring_dir == null) {
        errprint("""FIELDSPRING_DIR must be set to the top-level directory where
Fieldspring is installed.""")
        require(fieldspring_dir != null)
      }
    }
    fieldspring_dir
  }
}
///////////////////////////////////////////////////////////////////////////////

package opennlp.fieldspring

/**
 * Package-level definitions shared by the perceptron implementations.
 */
package object perceptron {
  // A weight vector is represented as a plain array of doubles, one weight
  // per feature index.
  type WeightVector = Array[Double]
}
///////////////////////////////////////////////////////////////////////////////

package opennlp.fieldspring.poligrounder

import collection.mutable

import opennlp.fieldspring.util.distances._
import opennlp.fieldspring.util.textdbutil.Schema
import opennlp.fieldspring.util.printutil._

import opennlp.fieldspring.gridlocate.{DistDocument,DistDocumentTable,CellGrid}
import opennlp.fieldspring.gridlocate.DistDocumentConverters._

import opennlp.fieldspring.worddist.WordDistFactory

/**
 * A document whose "coordinate" is a point in time (a TimeCoord) rather
 * than a point on the globe, used by the Poligrounder temporal model.
 */
class TimeDocument(
  schema: Schema,
  table: TimeDocumentTable
) extends DistDocument[TimeCoord](schema, table) {
  // Time coordinate of the document; null when unknown.
  var coord: TimeCoord = _
  // Author of the document, set from the "user" field; null when unknown.
  var user: String = _
  def has_coord = coord != null
  def title = if (coord != null) coord.toString else "unknown time"

  // NOTE(review): the XML-literal markup of this method appears to have
  // been stripped in this copy of the file; the bare brace group below is
  // what remains of an XML fragment emitting { coord } when has_coord is
  // true.  Recover the tags from version control before editing.
  def struct =

    {
      if (has_coord)
        { coord }
    }


  /** Populate fields from a textdb column: "min-timestamp" supplies the
      time coordinate (via get_x_or_null) and "user" the author; anything
      else is delegated to the superclass. */
  override def set_field(name: String, value: String) {
    name match {
      case "min-timestamp" => coord = get_x_or_null[TimeCoord](value)
      case "user" => user = value
      case _ => super.set_field(name, value)
    }
  }

  // Convert a TimeCoord to seconds as a Double (the stored value is divided
  // by 1000, i.e. presumably milliseconds -- TODO confirm); NaN for null.
  def coord_as_double(coor: TimeCoord) = coor match {
    case null => Double.NaN
    case TimeCoord(x) => x.toDouble / 1000
  }

  /** Absolute difference between two time coordinates, in seconds. */
  def distance_to_coord(coord2: TimeCoord) = {
    (coord_as_double(coord2) - coord_as_double(coord)).abs
  }
  def output_distance(dist: Double) = "%s seconds" format dist
}

/**
 * A DistDocumentTable specifically for documents with coordinates described
 * by a TimeCoord.
 * We delegate the actual document creation to a subtable specific to the
 * type of corpus (e.g. Wikipedia or Twitter).
73 | */ 74 | class TimeDocumentTable( 75 | override val driver: PoligrounderDriver, 76 | word_dist_factory: WordDistFactory 77 | ) extends DistDocumentTable[TimeCoord, TimeDocument, TimeCellGrid]( 78 | driver, word_dist_factory 79 | ) { 80 | def create_document(schema: Schema) = new TimeDocument(schema, this) 81 | } 82 | 83 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/postprocess/DocumentRankerByError.scala: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // DocumentRankerByError.scala 3 | // 4 | // Copyright (C) 2012 Mike Speriosu, The University of Texas at Austin 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | /////////////////////////////////////////////////////////////////////////////// 18 | 19 | package opennlp.fieldspring.postprocess 20 | 21 | // This program takes a log file and outputs the document names to standard out, ranked by prediction error. 
22 | 23 | import org.clapper.argot._ 24 | import opennlp.fieldspring.tr.topo._ 25 | import opennlp.fieldspring.tr.util.LogUtil 26 | 27 | object DocumentRankerByError { 28 | 29 | import ArgotConverters._ 30 | 31 | val parser = new ArgotParser("fieldspring run opennlp.fieldspring.postprocess.DocumentRankerByError", preUsage = Some("Fieldspring")) 32 | val logFile = parser.option[String](List("l", "log"), "log", "log input file") 33 | 34 | def main(args: Array[String]) { 35 | try { 36 | parser.parse(args) 37 | } 38 | catch { 39 | case e: ArgotUsageException => println(e.message); sys.exit(0) 40 | } 41 | 42 | if(logFile.value == None) { 43 | println("You must specify a log input file via -l.") 44 | sys.exit(0) 45 | } 46 | 47 | val docsAndErrors:List[(String, Double, Coordinate, Coordinate)] = 48 | (for(pe <- LogUtil.parseLogFile(logFile.value.get)) yield { 49 | val dist = pe.trueCoord.distanceInKm(pe.predCoord) 50 | 51 | (pe.docName, dist, pe.trueCoord, pe.predCoord) 52 | }).sortWith((x, y) => x._2 < y._2) 53 | 54 | for((docName, dist, trueCoord, predCoord) <- docsAndErrors) { 55 | println(docName+"\t"+dist+"\t"+trueCoord+"\t"+predCoord) 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/preprocess/Permute.scala: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Permute.scala 3 | // 4 | // Copyright (C) 2012 Stephen Roller, The University of Texas at Austin 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 
package opennlp.fieldspring.preprocess

import util.Random
import com.nicta.scoobi.Scoobi._
import java.io._

/*
 * This program randomly permutes all the lines in a text file, using Hadoop
 * and Scoobi.
 */

object Permute extends ScoobiApp {
  val rnd = new Random

  /** Attach a uniformly-random sort key to a line. */
  def generate_key(line: String): (Double, String) =
    (rnd.nextDouble, line)

  /** Discard the random key, keeping the grouped lines. */
  def remove_key(kvs: (Double, Iterable[String])): Iterable[String] =
    kvs._2

  def run() {
    // Insist on exactly an input path and an output path.
    if (args.length != 2)
      sys.error("Expecting input and output path.")
    val inputPath = args(0)
    val outputPath = args(1)

    // Read the (new-line-separated) lines, key each one by a random double,
    // let the shuffle phase sort by those keys, then strip the keys off
    // again -- a distributed random permutation.
    val permuted =
      TextInput.fromTextFile(inputPath)
        .map(generate_key)
        .groupByKey
        .flatMap(remove_key)

    // save to disk
    persist(TextOutput.toTextFile(permuted, outputPath))
  }
}
-------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // ProcessFiles.scala 3 | // 4 | // Copyright (C) 2011 Ben Wing, The University of Texas at Austin 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | /////////////////////////////////////////////////////////////////////////////// 18 | 19 | package opennlp.fieldspring.preprocess 20 | 21 | import opennlp.fieldspring.util.argparser._ 22 | import opennlp.fieldspring.util.experiment._ 23 | import opennlp.fieldspring.util.ioutil._ 24 | 25 | /* 26 | Common code for doing basic file-processing operations. 27 | 28 | FIXME: It's unclear there's enough code to justify factoring it out 29 | like this. 30 | */ 31 | 32 | ///////////////////////////////////////////////////////////////////////////// 33 | // Main code // 34 | ///////////////////////////////////////////////////////////////////////////// 35 | 36 | /** 37 | * Class for defining and retrieving command-line arguments. Consistent 38 | * with "field-style" access to an ArgParser, this class needs to be 39 | * instantiated twice with the same ArgParser object, before and after parsing 40 | * the command line. The first instance defines the allowed arguments in the 41 | * ArgParser, while the second one retrieves the values stored into the 42 | * ArgParser as a result of parsing. 
 *
 * @param ap ArgParser object.
 */
class ProcessFilesParameters(ap: ArgParser) extends
  ArgParserParameters(ap) {
  // Required output directory; the driver creates it and insists that it
  // not already exist (see ProcessFilesDriver.run_after_setup).
  val output_dir =
    ap.option[String]("o", "output-dir",
      metavar = "DIR",
      help = """Directory to store output files in. It must not already
exist, and will be created (including any parent directories).""")
}

/**
 * Base driver for simple file-processing experiments: checks that the
 * required output directory was given and creates it before the run.
 */
abstract class ProcessFilesDriver extends HadoopableArgParserExperimentDriver {
  override type TParam <: ProcessFilesParameters
  type TRunRes = Unit

  // Verify that required parameters were supplied.
  def handle_parameters() {
    need(params.output_dir, "output-dir")
  }

  def setup_for_run() { }

  // Create the output directory; treat failure to create it as the
  // directory already existing and abort with a parameter error.
  def run_after_setup() {
    if (!get_file_handler.make_directories(params.output_dir))
      param_error("Output dir %s must not already exist" format
        params.output_dir)
  }
}

package opennlp.fieldspring.preprocess

import com.nicta.scoobi.Scoobi._
// import com.nicta.scoobi.testing.HadoopLogFactory
import com.nicta.scoobi.application.HadoopLogFactory
import org.apache.commons.logging.LogFactory
import org.apache.hadoop.fs.FileSystem
import java.io._

object ScoobiWordCount extends ScoobiApp {
  def run() {
    // There's some magic here in the source code to make the get() call
    // work -- there's an implicit conversion in object ScoobiConfiguration
    // from a ScoobiConfiguration to a Hadoop Configuration, which has get()
    // defined on it. Evidently implicit conversions in the companion object
    // get made available automatically for classes or something?
17 | System.err.println("mapred.job.tracker " + 18 | configuration.get("mapred.job.tracker", "value not found")) 19 | // System.err.println("job tracker " + jobTracker) 20 | // System.err.println("file system " + fs) 21 | System.err.println("configure file system " + configuration.fs) 22 | System.err.println("file system key " + 23 | configuration.get(FileSystem.FS_DEFAULT_NAME_KEY, "value not found")) 24 | 25 | val lines = 26 | // Test fromTextFileWithPath, but currently appears to trigger an 27 | // infinite loop. 28 | // TextInput.fromTextFileWithPath(args(0)) 29 | TextInput.fromTextFile(args(0)).map(x => (args(0), x)) 30 | 31 | def splitit(x: String) = { 32 | HadoopLogFactory.setQuiet(false) 33 | // val logger = LogFactory.getLog("foo.bar") 34 | // logger.info("Processing " + x) 35 | // System.err.println("Processing", x) 36 | x.split(" ") 37 | } 38 | //val counts = lines.flatMap(_.split(" ")) 39 | val counts = lines.map(_._2).flatMap(splitit) 40 | .map(word => (word, 1)) 41 | .groupByKey 42 | .filter { case (word, lens) => word.length < 8 } 43 | .filter { case (word, lens) => lens.exists(x => true) } 44 | .combine((a: Int, b: Int) => a + b) 45 | persist(toTextFile(counts, args(1))) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/app/ConvertCorpusToPlaintext.scala: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.app 2 | 3 | import java.io._ 4 | 5 | import opennlp.fieldspring.tr.topo._ 6 | import opennlp.fieldspring.tr.text._ 7 | import opennlp.fieldspring.tr.text.prep._ 8 | import opennlp.fieldspring.tr.text.io._ 9 | 10 | import scala.collection.JavaConversions._ 11 | 12 | object ConvertCorpusToPlaintext extends App { 13 | 14 | val outDirName = if(args(1).endsWith("/")) args(1) else args(1)+"/" 15 | val outDir = new File(outDirName) 16 | if(!outDir.exists) 17 | outDir.mkdir 18 | 19 | val tokenizer = new 
OpenNLPTokenizer 20 | 21 | val corpus = Corpus.createStoredCorpus 22 | corpus.addSource(new TrXMLDirSource(new File(args(0)), tokenizer)) 23 | corpus.setFormat(BaseApp.CORPUS_FORMAT.TRCONLL) 24 | corpus.load 25 | 26 | for(doc <- corpus) { 27 | val out = new BufferedWriter(new FileWriter(outDirName+doc.getId+".txt")) 28 | for(sent <- doc) { 29 | for(token <- sent) { 30 | out.write(token.getForm+" ") 31 | } 32 | out.write("\n") 33 | } 34 | out.close 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/app/ConvertCorpusToToponymAsDoc.scala: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.app 2 | 3 | import java.io._ 4 | 5 | import opennlp.fieldspring.tr.topo._ 6 | import opennlp.fieldspring.tr.text._ 7 | import opennlp.fieldspring.tr.text.prep._ 8 | import opennlp.fieldspring.tr.text.io._ 9 | import opennlp.fieldspring.tr.util._ 10 | 11 | import scala.collection.JavaConversions._ 12 | 13 | object ConvertCorpusToToponymAsDoc extends App { 14 | 15 | val windowSize = if(args.length >= 2) args(1).toInt else 0 16 | 17 | val alphanumRE = """^[a-zA-Z0-9]+$""".r 18 | 19 | val tokenizer = new OpenNLPTokenizer 20 | 21 | val corpus = Corpus.createStoredCorpus 22 | corpus.addSource(new TrXMLDirSource(new File(args(0)), tokenizer)) 23 | corpus.setFormat(BaseApp.CORPUS_FORMAT.TRCONLL) 24 | corpus.load 25 | 26 | for(doc <- corpus) { 27 | val docAsArray = TextUtil.getDocAsArray(doc) 28 | var tokIndex = 0 29 | for(token <- docAsArray) { 30 | if(token.isToponym && token.asInstanceOf[Toponym].hasGold) { 31 | val goldCoord = token.asInstanceOf[Toponym].getGold.getRegion.getCenter 32 | 33 | val unigramCounts = getUnigramCounts(docAsArray, tokIndex, windowSize) 34 | 35 | print(doc.getId.drop(1)+"_"+tokIndex+"\t") 36 | print(doc.getId+"_"+tokIndex+"\t") 37 | print(goldCoord.getLatDegrees+","+goldCoord.getLngDegrees+"\t") 38 | 
print("1\t\tMain\tno\tno\tno\t") 39 | //print(token.getForm+":"+1+" ")\ 40 | for((word, count) <- unigramCounts) { 41 | print(word+":"+count+" ") 42 | } 43 | println 44 | } 45 | tokIndex += 1 46 | } 47 | } 48 | 49 | def getUnigramCounts(docAsArray:Array[StoredToken], tokIndex:Int, windowSize:Int): Map[String, Int] = { 50 | 51 | val startIndex = math.max(0, tokIndex - windowSize) 52 | val endIndex = math.min(docAsArray.length, tokIndex + windowSize + 1) 53 | 54 | val unigramCounts = new collection.mutable.HashMap[String, Int] 55 | 56 | for(rawToken <- docAsArray.slice(startIndex, endIndex)) { 57 | for(token <- rawToken.getForm.split(" ")) { 58 | val prevCount = unigramCounts.getOrElse(token, 0) 59 | unigramCounts.put(token, prevCount + 1) 60 | } 61 | } 62 | 63 | unigramCounts.toMap 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/app/ConvertCorpusToUnigramCounts.scala: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.app 2 | 3 | import java.io._ 4 | import java.util.zip._ 5 | 6 | import opennlp.fieldspring.tr.util._ 7 | import opennlp.fieldspring.tr.topo._ 8 | import opennlp.fieldspring.tr.topo.gaz._ 9 | import opennlp.fieldspring.tr.text._ 10 | import opennlp.fieldspring.tr.text.prep._ 11 | import opennlp.fieldspring.tr.text.io._ 12 | 13 | import scala.collection.JavaConversions._ 14 | 15 | object ConvertCorpusToUnigramCounts extends BaseApp { 16 | 17 | val alphanumRE = """^[a-z0-9]+$""".r 18 | 19 | //val tokenizer = new OpenNLPTokenizer 20 | 21 | def main(args:Array[String]) { 22 | 23 | initializeOptionsFromCommandLine(args); 24 | 25 | /*var corpus = Corpus.createStoredCorpus 26 | 27 | if(getCorpusFormat == BaseApp.CORPUS_FORMAT.PLAIN/**/) { 28 | /* 29 | val tokenizer = new OpenNLPTokenizer 30 | //val recognizer = new OpenNLPRecognizer 31 | //val gis = new GZIPInputStream(new FileInputStream(args(1))) 32 | 
//val ois = new ObjectInputStream(gis) 33 | //val gnGaz = ois.readObject.asInstanceOf[GeoNamesGazetteer] 34 | //gis.close 35 | corpus.addSource(new PlainTextSource( 36 | new BufferedReader(new FileReader(args(0))), new OpenNLPSentenceDivider(), tokenizer)) 37 | //corpus.addSource(new ToponymAnnotator(new PlainTextSource( 38 | // new BufferedReader(new FileReader(args(0))), new OpenNLPSentenceDivider(), tokenizer), 39 | // recognizer, gnGaz, null)) 40 | corpus.setFormat(BaseApp.CORPUS_FORMAT.PLAIN) 41 | */ 42 | val importCorpus = new ImportCorpus 43 | //if(args(0).endsWith("txt")) 44 | corpus = importCorpus.doImport(getCorpusInputPath, , getCorpusFormat, false) 45 | //else 46 | // corpus = importCorpus 47 | } 48 | else if(getCorpusFormat == BaseApp.CORPUS_FORMAT.TRCONLL) { 49 | corpus.addSource(new TrXMLDirSource(new File(args(0)), tokenizer)) 50 | corpus.setFormat(BaseApp.CORPUS_FORMAT.TRCONLL) 51 | corpus.load 52 | } 53 | //corpus.load*/ 54 | 55 | val corpus = TopoUtil.readStoredCorpusFromSerialized(getSerializedCorpusInputPath) 56 | 57 | var i = 0 58 | for(doc <- corpus) { 59 | val unigramCounts = new collection.mutable.HashMap[String, Int] 60 | for(sent <- doc) { 61 | for(rawToken <- sent) { 62 | for(token <- rawToken.getForm.split(" ")) { 63 | val ltoken = token.toLowerCase 64 | if(alphanumRE.findFirstIn(ltoken) != None) { 65 | val prevCount = unigramCounts.getOrElse(ltoken, 0) 66 | unigramCounts.put(ltoken, prevCount + 1) 67 | } 68 | } 69 | } 70 | } 71 | 72 | print(i/*doc.getId.drop(1)*/ +"\t") 73 | print(doc.getId+"\t") 74 | print("0,0\t") 75 | print("1\t\tMain\tno\tno\tno\t") 76 | for((word, count) <- unigramCounts) { 77 | print(word+":"+count+" ") 78 | } 79 | println 80 | i += 1 81 | } 82 | 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/app/ConvertGeoTextToJSON.scala: -------------------------------------------------------------------------------- 1 | package 
opennlp.fieldspring.tr.app

import com.codahale.jerkson.Json._

object ConvertGeoTextToJSON extends App {
  for(line <- scala.io.Source.fromFile(args(0), "ISO-8859-1").getLines) {
    val tokens = line.split("\t")
    println(generate(new tweet(tokens(3).toDouble, tokens(4).toDouble, tokens(5))))
  }
}

case class tweet(val lat:Double, val lon:Double, val text:String)

package opennlp.fieldspring.tr.app

import java.io._
import java.util.zip._

import opennlp.fieldspring.tr.util._
import opennlp.fieldspring.tr.topo._
import opennlp.fieldspring.tr.text._
import opennlp.fieldspring.tr.text.prep._
import opennlp.fieldspring.tr.topo.gaz._
import opennlp.fieldspring.tr.text.io._

import scala.collection.JavaConversions._

object CorpusErrorAnalyzer extends BaseApp {

  def main(args:Array[String]) {
    initializeOptionsFromCommandLine(args)

    val corpus = TopoUtil.readStoredCorpusFromSerialized(getSerializedCorpusInputPath)

    for(doc <- corpus) {
      for(sent <- doc) {
        for(toponym <- sent.getToponyms.filter(_.getAmbiguity > 0)) {

        }
      }
    }
  }
}

package opennlp.fieldspring.tr.app

import java.io._
import java.util.zip._

import opennlp.fieldspring.tr.util._
import opennlp.fieldspring.tr.topo._
import opennlp.fieldspring.tr.topo.gaz._
import opennlp.fieldspring.tr.text._
import opennlp.fieldspring.tr.text.prep._
import opennlp.fieldspring.tr.text.io._

import scala.collection.JavaConversions._

import org.apache.commons.compress.compressors.bzip2._
import org.clapper.argot._
import ArgotConverters._

/**
 * Reads document ids (first tab-separated column) from a bzip2-compressed
 * wiki corpus file and then streams a wiki text dump, printing only those
 * articles whose id appears in the corpus.
 */
object FilterGeotaggedWiki extends App {
  val parser = new ArgotParser("fieldspring run opennlp.fieldspring.tr.app.FilterGeotaggedWiki", preUsage = Some("Fieldspring"))

  val wikiTextInputFile = parser.option[String](List("w", "wiki"), "wiki", "wiki text input file")
  val wikiCorpusInputFile = parser.option[String](List("c", "corpus"), "corpus", "wiki corpus input file")

  try {
    parser.parse(args)
  }
  catch {
    case e: ArgotUsageException => println(e.message); sys.exit(0)
  }

  // Fail with a usage message instead of a bare NoSuchElementException from
  // .value.get when a required option is missing (consistent with the other
  // command-line apps, e.g. DocumentRankerByError).
  if(wikiTextInputFile.value == None || wikiCorpusInputFile.value == None) {
    println("You must specify a wiki text input file via -w and a wiki corpus input file via -c.")
    sys.exit(0)
  }

  // Ids of the documents present in the corpus file.
  val ids = new collection.mutable.HashSet[String]

  val fis = new FileInputStream(wikiCorpusInputFile.value.get)
  // NOTE(review): this skips the first two bytes of the file (presumably
  // the bzip2 "BZ" magic) before handing the stream to
  // BZip2CompressorInputStream -- TODO confirm this is required for the
  // commons-compress version in use.
  fis.read; fis.read
  val cbzis = new BZip2CompressorInputStream(fis)
  val in = new BufferedReader(new InputStreamReader(cbzis))
  try {
    var curLine = in.readLine
    while(curLine != null) {
      ids += curLine.split("\t")(0)
      curLine = in.readLine
    }
  } finally {
    in.close  // also closes the underlying compressed and file streams
  }

  val wikiTextCorpus = Corpus.createStreamCorpus

  wikiTextCorpus.addSource(new WikiTextSource(new BufferedReader(new FileReader(wikiTextInputFile.value.get))))
  wikiTextCorpus.setFormat(BaseApp.CORPUS_FORMAT.WIKITEXT)

  for(doc <- wikiTextCorpus) {
    if(ids contains doc.getId) {
      println("Article title: " + doc.title)
      println("Article ID: " + doc.getId)
      for(sent <- doc) {
        for(token <- sent) {
          println(token.getOrigForm)
        }
      }
    }
    else {
      // Still iterate through skipped documents so the stream source advances.
      for(sent <- doc) { for(token <- sent) {} }
    }
  }
}
opennlp.fieldspring.tr.app 2 | 3 | import java.io._ 4 | import java.util.zip._ 5 | 6 | import opennlp.fieldspring.tr.util._ 7 | import opennlp.fieldspring.tr.topo._ 8 | import opennlp.fieldspring.tr.topo.gaz._ 9 | import opennlp.fieldspring.tr.text._ 10 | import opennlp.fieldspring.tr.text.prep._ 11 | import opennlp.fieldspring.tr.text.io._ 12 | 13 | import scala.collection.JavaConversions._ 14 | 15 | object GazEntryKMLPlotter /*extends BaseApp*/ { 16 | 17 | def main(args:Array[String]) { 18 | 19 | val toponym = args(0).replaceAll("_", " ") 20 | //val gaz = println("Reading serialized gazetteer from " + args(1) + " ...") 21 | val gis = new GZIPInputStream(new FileInputStream(args(1))) 22 | val ois = new ObjectInputStream(gis) 23 | val gnGaz = ois.readObject.asInstanceOf[GeoNamesGazetteer] 24 | gis.close 25 | 26 | val entries = gnGaz.lookup(toponym) 27 | if(entries != null) { 28 | var loc = entries(0) 29 | for(entry <- entries) 30 | if(entry.getRegion.getRepresentatives.size > 1) 31 | loc = entry 32 | if(loc != null) 33 | for(coord <- loc.getRegion.getRepresentatives) { 34 | println("") 35 | println("#My_Style") 36 | println("") 37 | println(""+coord.getLngDegrees+","+coord.getLatDegrees+",0") 38 | println("") 39 | println("") 40 | } 41 | } 42 | 43 | /*initializeOptionsFromCommandLine(args) 44 | 45 | val corpus = TopoUtil.readStoredCorpusFromSerialized(getSerializedCorpusInputPath) 46 | 47 | for(doc <- corpus) { 48 | for(sent <- doc) { 49 | for(toponym <- sent.getToponyms.filter(_.getAmbiguity > 0)) { 50 | 51 | } 52 | } 53 | }*/ 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/app/GeoTextLabelPropDecoder.scala: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.app 2 | 3 | import java.io._ 4 | import java.util._ 5 | 6 | import opennlp.fieldspring.tr.text._ 7 | import opennlp.fieldspring.tr.text.io._ 8 | import 
opennlp.fieldspring.tr.text.prep._ 9 | import opennlp.fieldspring.tr.topo._ 10 | import opennlp.fieldspring.tr.app._ 11 | import opennlp.fieldspring.tr.util.TopoUtil 12 | 13 | import scala.collection.JavaConversions._ 14 | 15 | object GeoTextLabelPropDecoder extends BaseApp { 16 | 17 | import BaseApp._ 18 | 19 | def DPC = 1.0 20 | 21 | def CELL_ = "cell_" 22 | def CELL_LABEL_ = "cell_label_" 23 | //def DOC_ = "doc_" 24 | def USER_ = "USER_" 25 | def UNI_ = "uni_" 26 | def BI_ = "bi_" 27 | 28 | def main(args: Array[String]) = { 29 | 30 | this.initializeOptionsFromCommandLine(args) 31 | this.doDecode 32 | 33 | } 34 | 35 | def doDecode() = { 36 | checkExists(getSerializedCorpusInputPath) 37 | checkExists(getGraphInputPath) 38 | 39 | val corpus = TopoUtil.readStoredCorpusFromSerialized(getSerializedCorpusInputPath) 40 | 41 | val docIdsToCells = new collection.mutable.HashMap[String, Int] 42 | 43 | val lines = scala.io.Source.fromFile(getGraphInputPath).getLines 44 | 45 | for(line <- lines) { 46 | val tokens = line.split("\t") 47 | 48 | if(tokens.length >= 4 && tokens(0).startsWith(USER_)) { 49 | val docId = tokens(0) 50 | 51 | val innertokens = tokens(3).split(" ") 52 | 53 | docIdsToCells.put(docId, findGreatestCell(innertokens)) 54 | } 55 | } 56 | 57 | for(document <- corpus) { 58 | if(document.isDev || document.isTest) { 59 | if(docIdsToCells.containsKey(document.getId)) { 60 | val cellNumber = docIdsToCells(document.getId) 61 | if(cellNumber != -1) { 62 | val lat = ((cellNumber / 1000) * DPC) + DPC/2.0 63 | val lon = ((cellNumber % 1000) * DPC) + DPC/2.0 64 | document.setSystemCoord(Coordinate.fromDegrees(lat, lon)) 65 | } 66 | } 67 | } 68 | } 69 | 70 | val eval = new EvaluateCorpus 71 | eval.doEval(corpus, corpus, CORPUS_FORMAT.GEOTEXT, true) 72 | } 73 | 74 | def findGreatestCell(innertokens: Array[String]): Int = { 75 | 76 | for(innertoken <- innertokens) { 77 | if(innertoken.startsWith(CELL_LABEL_)) { 78 | return innertoken.substring(CELL_LABEL_.length).toInt 79 
| } 80 | } 81 | 82 | return -1 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/app/Preprocess.scala: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.app 17 | 18 | import java.io._ 19 | 20 | import opennlp.fieldspring.tr.topo.gaz._ 21 | import opennlp.fieldspring.tr.text._ 22 | import opennlp.fieldspring.tr.text.io._ 23 | import opennlp.fieldspring.tr.text.prep._ 24 | import opennlp.fieldspring.tr.util.Constants 25 | 26 | object Preprocess extends App { 27 | override def main(args: Array[String]) { 28 | val divider = new OpenNLPSentenceDivider 29 | val tokenizer = new OpenNLPTokenizer 30 | val recognizer = new OpenNLPRecognizer 31 | val gazetteer = new InMemoryGazetteer 32 | 33 | gazetteer.load(new WorldReader(new File( 34 | Constants.getGazetteersDir() + File.separator + "dataen-fixed.txt.gz" 35 | ))) 36 | 37 | val corpus = Corpus.createStreamCorpus 38 | 39 | val in = new BufferedReader(new FileReader(args(0))) 40 | corpus.addSource( 41 | new ToponymAnnotator(new PlainTextSource(in, divider, tokenizer, args(0)), 42 | recognizer, gazetteer 43 | )) 44 | 45 | val writer = new CorpusXMLWriter(corpus) 46 | writer.write(new File(args(1))) 47 | } 48 | } 49 | 50 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/app/ReprocessTrApp.scala: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.app 17 | 18 | import java.io._ 19 | 20 | import opennlp.fieldspring.tr.eval._ 21 | import opennlp.fieldspring.tr.resolver._ 22 | import opennlp.fieldspring.tr.topo.gaz._ 23 | import opennlp.fieldspring.tr.text._ 24 | import opennlp.fieldspring.tr.text.io._ 25 | import opennlp.fieldspring.tr.text.prep._ 26 | import opennlp.fieldspring.tr.util.Constants 27 | 28 | object ReprocessTrApp { 29 | def main(args: Array[String]) { 30 | val tokenizer = new OpenNLPTokenizer 31 | val recognizer = new OpenNLPRecognizer 32 | 33 | val gazetteer = new InMemoryGazetteer 34 | gazetteer.load(new WorldReader(new File( 35 | Constants.getGazetteersDir() + File.separator + "dataen-fixed.txt.gz" 36 | ))) 37 | 38 | val corpus = Corpus.createStreamCorpus 39 | val source = new TrXMLDirSource(new File(args(0)), tokenizer) 40 | val stripped = new ToponymRemover(source) 41 | corpus.addSource(new ToponymAnnotator(stripped, recognizer, gazetteer)) 42 | 43 | val writer = new CorpusXMLWriter(corpus) 44 | writer.write(new File(args(1))) 45 | } 46 | } 47 | 48 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/app/SplitDevTest.scala: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.app 2 | 3 | import java.io._ 4 | 5 | object SplitDevTest extends App { 6 | val dir = new File(args(0)) 7 | 8 | val 
devDir = new File(dir.getCanonicalPath+"dev")
  // NOTE(review): no File.separator before "dev"/"test" — this creates SIBLING
  // directories named "<dir>dev" and "<dir>test", not subdirectories. Looks
  // intentional (splits alongside the input dir), but confirm.
  val testDir = new File(dir.getCanonicalPath+"test")
  devDir.mkdir
  testDir.mkdir

  val files = dir.listFiles

  // Every third file goes to the test split; the rest go to dev.
  var i = 1
  for(file <- files) {
    if(i % 3 == 0)
      file.renameTo(new File(testDir, file.getName))
    else
      file.renameTo(new File(devDir, file.getName))
    i += 1
  }
}
--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/app/TrainingDirectoriesCombiner.scala:
--------------------------------------------------------------------------------
package opennlp.fieldspring.tr.app

import java.io._

/**
 * Combines two directories of .txt training files into a third: every file in
 * args(0) and args(1) is appended, line by line, onto the file of the same
 * name under args(2). Blank and single-character lines are dropped.
 *
 * Usage: TrainingDirectoriesCombiner <inDir1> <inDir2> <outDir>
 */
object TrainingDirectoriesCombiner extends App {
  val inDir1 = new File(args(0))
  val inDir2 = new File(args(1))
  val outDir = new File(args(2))

  if(!outDir.exists)
    outDir.mkdir

  // First clear the OUTPUT directory (the original comment said "source
  // directory", which was wrong) so the append-mode writes below start from
  // empty files:
  for(file <- outDir.listFiles)
    file.delete

  lineByLineCopy(inDir1, outDir)
  lineByLineCopy(inDir2, outDir)

  /**
   * Appends each .txt file in inDir onto the same-named file in outDir,
   * skipping undecodable lines and lines of length <= 1.
   */
  def lineByLineCopy(inDir:File, outDir:File) {
    for(file <- inDir.listFiles.filter(_.getName.endsWith(".txt"))) {
      val in = new BufferedReader(new FileReader(file))
      // FileWriter is opened in append mode so both input dirs accumulate.
      val out = new BufferedWriter(new FileWriter(outDir.getCanonicalPath+"/"+file.getName, true))
      println(inDir.getCanonicalPath+"/"+file.getName+" >> "+outDir.getCanonicalPath+"/"+file.getName)
      try {
        var line = "i" // non-null sentinel so the loop body runs at least once
        while(line != null) {
          try {
            line = in.readLine
          } catch {
            // Skip lines the charset decoder rejects instead of aborting the
            // whole copy; "E" is a dummy non-null value that keeps looping.
            case e: java.nio.charset.MalformedInputException => line = "E"
          }
          if(line != null && line.size > 1)
            out.write(line+"\n")
        }
      } finally {
        // Close streams even if readLine/write throws; the original leaked
        // both handles on any uncaught exception.
        out.close
        in.close
      }
    }
  }
}
--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/model/AltBasicMinDistModel.scala:
--------------------------------------------------------------------------------
/////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.resolver 17 | 18 | import scala.collection.JavaConversions._ 19 | import opennlp.fieldspring.tr.text._ 20 | 21 | 22 | class AltBasicMinDistResolver extends Resolver { 23 | def disambiguate(corpus: StoredCorpus): StoredCorpus = { 24 | 25 | /* Iterate over documents. */ 26 | corpus.foreach { document => 27 | 28 | /* Collect a list of toponyms with candidates for each document. */ 29 | val toponyms = document.flatMap(_.getToponyms).filter(_.getAmbiguity > 0).toList 30 | 31 | /* For each toponym, pick the best candidate. */ 32 | toponyms.foreach { toponym => 33 | 34 | /* Compute all valid totals with indices. 
*/
        toponym.zipWithIndex.flatMap { case (candidate, idx) =>
          toponyms.filterNot(_ == toponym) match {
            case Nil => None
            case ts => Some(ts.map(_.map(_.distance(candidate)).min).sum, idx)
          }
        } match {
          case Nil => ()
          case xs => toponym.setSelectedIdx(xs.min._2)
        }
      }
    }

    return corpus
  }
}
--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/resolver/DocDistResolver.scala:
--------------------------------------------------------------------------------
package opennlp.fieldspring.tr.resolver

import opennlp.fieldspring.tr.text._
import opennlp.fieldspring.tr.topo._
import opennlp.fieldspring.tr.util._

import scala.collection.JavaConversions._

/**
 * Resolves each ambiguous toponym to the candidate whose region is closest to
 * the document-level location predicted in the log file at logFilePath.
 */
class DocDistResolver(val logFilePath:String) extends Resolver {

  def disambiguate(corpus:StoredCorpus): StoredCorpus = {

    // Document name -> predicted document-level coordinate, parsed once.
    val predDocLocations = LogUtil.parseLogFile(logFilePath)
      .map(pe => (pe.docName, pe.predCoord))
      .toMap

    for {
      doc <- corpus
      sent <- doc
      toponym <- sent.getToponyms
      if toponym.getAmbiguity > 0
      // overwriteSelecteds comes from the Resolver base class; an existing
      // selection is kept unless that flag is set.
      if overwriteSelecteds || !toponym.hasSelected
    } {
      predDocLocations.get(doc.getId) match {
        case Some(predDocLocation) =>
          // Choose the candidate minimizing distance to the predicted
          // document coordinate. (minBy's index is always >= 0, so the
          // original's "!= -1" guard was dead and is omitted.)
          val (_, bestIdx) = toponym.getCandidates.zipWithIndex.minBy {
            case (cand, _) => cand.getRegion.distance(predDocLocation)
          }
          toponym.setSelectedIdx(bestIdx)
        case None => () // no prediction for this document; leave unresolved
      }
    }

    corpus
  }
}
--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/resolver/PopulationResolver.scala:
--------------------------------------------------------------------------------
package opennlp.fieldspring.tr.resolver

import opennlp.fieldspring.tr.text._
import
opennlp.fieldspring.tr.topo._ 5 | import opennlp.fieldspring.tr.util._ 6 | 7 | import scala.collection.JavaConversions._ 8 | 9 | class PopulationResolver extends Resolver { 10 | 11 | def disambiguate(corpus:StoredCorpus): StoredCorpus = { 12 | 13 | val rand = new scala.util.Random 14 | 15 | for(doc <- corpus) { 16 | for(sent <- doc) { 17 | for(toponym <- sent.getToponyms.filter(_.getAmbiguity > 0)) { 18 | val maxPopLocPair = toponym.getCandidates.zipWithIndex.maxBy(_._1.getPopulation) 19 | if(maxPopLocPair._1.getPopulation > 0) 20 | toponym.setSelectedIdx(maxPopLocPair._2) 21 | else 22 | toponym.setSelectedIdx(rand.nextInt(toponym.getAmbiguity)) // back off to random 23 | } 24 | } 25 | } 26 | 27 | corpus 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/resolver/TPPResolver.scala: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.resolver 2 | 3 | import opennlp.fieldspring.tr.tpp._ 4 | 5 | abstract class TPPResolver(val tppInstance:TPPInstance) extends Resolver { 6 | 7 | } 8 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/resolver/ToponymAsDocDistResolver.scala: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.resolver 2 | 3 | import opennlp.fieldspring.tr.text._ 4 | import opennlp.fieldspring.tr.topo._ 5 | import opennlp.fieldspring.tr.util._ 6 | 7 | import scala.collection.JavaConversions._ 8 | 9 | class ToponymAsDocDistResolver(val logFilePath:String) extends Resolver { 10 | 11 | val docTokRE = """(.+)_([0-9]+)""".r 12 | val alphanumRE = """^[a-zA-Z0-9]+$""".r 13 | 14 | def disambiguate(corpus:StoredCorpus): StoredCorpus = { 15 | 16 | val predLocations = (for(pe <- LogUtil.parseLogFile(logFilePath)) yield { 17 | val docTokRE(docName, tokenIndex) = pe.docName 18 | 
((docName, tokenIndex.toInt), pe.predCoord) 19 | }).toMap 20 | 21 | for(doc <- corpus) { 22 | var tokenIndex = 0 23 | for(sent <- doc) { 24 | for(token <- sent.filter(t => alphanumRE.findFirstIn(t.getForm) != None)) { 25 | if(token.isToponym && token.asInstanceOf[Toponym].getAmbiguity > 0) { 26 | val toponym = token.asInstanceOf[Toponym] 27 | val predLocation = predLocations.getOrElse((doc.getId, tokenIndex), null) 28 | if(predLocation != null) { 29 | val indexToSelect = toponym.getCandidates.zipWithIndex.minBy(p => p._1.getRegion.distance(predLocation))._2 30 | if(indexToSelect != -1) { 31 | toponym.setSelectedIdx(indexToSelect) 32 | } 33 | } 34 | } 35 | tokenIndex += 1 36 | } 37 | } 38 | } 39 | 40 | corpus 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/text/io/DynamicKMLWriter.scala: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.text.io 2 | 3 | import java.io._ 4 | import java.util._ 5 | import javax.xml.datatype._ 6 | import javax.xml.stream._ 7 | 8 | import opennlp.fieldspring.tr.text._ 9 | import opennlp.fieldspring.tr.topo._ 10 | import opennlp.fieldspring.tr.util._ 11 | 12 | import scala.collection.JavaConversions._ 13 | 14 | class DynamicKMLWriter(val corpus:StoredCorpus/*, 15 | val outputGoldLocations:Boolean = false*/) { 16 | 17 | lazy val factory = XMLOutputFactory.newInstance 18 | 19 | val CONTEXT_SIZE = 20 20 | 21 | def write(out:XMLStreamWriter) { 22 | KMLUtil.writeHeader(out, "corpus") 23 | 24 | var globalTokIndex = 0 25 | var globalTopIndex = 1 26 | for(doc <- corpus) { 27 | val docArray = TextUtil.getDocAsArray(doc) 28 | var tokIndex = 0 29 | for(token <- docArray) { 30 | if(token.isToponym) { 31 | val toponym = token.asInstanceOf[Toponym] 32 | if(toponym.getAmbiguity > 0 && toponym.hasSelected) { 33 | val coord = toponym.getSelected.getRegion.getCenter 34 | val context = 
TextUtil.getContext(docArray, tokIndex, CONTEXT_SIZE) 35 | KMLUtil.writePinTimeStampPlacemark(out, toponym.getOrigForm, coord, context, globalTopIndex) 36 | globalTopIndex += 1 37 | } 38 | } 39 | tokIndex += 1 40 | globalTokIndex += 1 41 | } 42 | } 43 | 44 | KMLUtil.writeFooter(out) 45 | out.close 46 | } 47 | 48 | def write(file:File) { 49 | val stream = new BufferedOutputStream(new FileOutputStream(file)) 50 | this.write(this.factory.createXMLStreamWriter(stream, "UTF-8")) 51 | stream.close() 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/text/io/GigawordSource.scala: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
///////////////////////////////////////////////////////////////////////////////
package opennlp.fieldspring.tr.text.io

import java.io.BufferedReader
import java.io.File
import java.io.FileReader
import java.util.ArrayList
import java.util.List
import scala.collection.JavaConversions._
import scala.collection.mutable.Buffer

import opennlp.fieldspring.tr.text._
import opennlp.fieldspring.tr.text.prep._

/**
 * Reads a Gigaword-style token stream (one token per line, sentences
 * separated by blank lines) and groups the sentences into documents of
 * sentencesPerDocument sentences each, up to numberOfDocuments documents.
 */
class GigawordSource(
  reader: BufferedReader,
  private val sentencesPerDocument: Int,
  private val numberOfDocuments: Int)
  extends TextSource(reader) {

  def this(reader: BufferedReader, sentencesPerDocument: Int) =
    this(reader, sentencesPerDocument, Int.MaxValue)
  def this(reader: BufferedReader) = this(reader, 50)

  // Lazy iterator over sentences, grouped into fixed-size documents.
  val sentences = new Iterator[Sentence[Token]] {
    var current = GigawordSource.this.readLine
    def hasNext: Boolean = current != null
    def next: Sentence[Token] = new Sentence[Token](null) {
      val buffer = Buffer(new SimpleToken(current))
      current = GigawordSource.this.readLine
      // Guard against EOF: readLine returns null when the stream is
      // exhausted, so input that does not end with a blank line would have
      // made the original `current.trim` throw a NullPointerException here.
      while (current != null && current.trim.length > 0) {
        buffer += new SimpleToken(current)
        current = GigawordSource.this.readLine
      }
      // Skip the blank separator line (a no-op null at EOF).
      current = GigawordSource.this.readLine

      def tokens: java.util.Iterator[Token] = buffer.toIterator
    }
  }.grouped(sentencesPerDocument).take(numberOfDocuments)

  def hasNext: Boolean = sentences.hasNext

  def next: Document[Token] = new Document[Token](null) {
    def iterator: java.util.Iterator[Sentence[Token]] =
      sentences.next.toIterator
  }
}
--------------------------------------------------------------------------------
/src/main/scala/opennlp/fieldspring/tr/text/io/WikiTextSource.scala:
--------------------------------------------------------------------------------
package opennlp.fieldspring.tr.text.io

import java.io._

import
scala.collection.JavaConversions._ 6 | import scala.collection.mutable.Buffer 7 | import opennlp.fieldspring.tr.text._ 8 | import opennlp.fieldspring.tr.text.prep._ 9 | 10 | class WikiTextSource( 11 | reader: BufferedReader 12 | ) extends TextSource(reader) { 13 | 14 | val TITLE_PREFIX = "Article title: " 15 | val TITLE_INDEX = TITLE_PREFIX.length 16 | val ID_INDEX = "Article ID: ".length 17 | 18 | var id = "-1" 19 | var title = "" 20 | 21 | val sentences = new Iterator[Sentence[Token]] { 22 | var current = WikiTextSource.this.readLine 23 | 24 | def hasNext: Boolean = current != null 25 | def next: Sentence[Token] = new Sentence[Token](null) { 26 | if(current != null) { 27 | title = current.drop(TITLE_INDEX).trim 28 | current = WikiTextSource.this.readLine 29 | id = current.drop(ID_INDEX).trim 30 | current = WikiTextSource.this.readLine 31 | } 32 | val buffer = Buffer(new SimpleToken(current)) 33 | current = WikiTextSource.this.readLine 34 | while (current != null && !current.trim.startsWith(TITLE_PREFIX)) { 35 | buffer += new SimpleToken(current) 36 | current = WikiTextSource.this.readLine 37 | } 38 | 39 | def tokens: java.util.Iterator[Token] = buffer.toIterator 40 | } 41 | }.grouped(1) // assume each document is a whole sentence, since we don't have sentence boundaries 42 | 43 | def hasNext: Boolean = sentences.hasNext 44 | 45 | def next: Document[Token] = { 46 | new Document[Token](id, title) { 47 | def iterator: java.util.Iterator[Sentence[Token]] = { 48 | sentences.next.toIterator 49 | } 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/topo/SphericalGeometry.scala: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.topo 17 | 18 | import scala.io._ 19 | 20 | import scala.collection.JavaConversions._ 21 | 22 | import opennlp.fieldspring.tr.util.cluster._ 23 | 24 | object SphericalGeometry { 25 | implicit def g: Geometry[Coordinate] = new Geometry[Coordinate] { 26 | def distance(x: Coordinate)(y: Coordinate): Double = x.distance(y) 27 | def centroid(ps: Seq[Coordinate]): Coordinate = Coordinate.centroid(ps) 28 | } 29 | 30 | def main(args: Array[String]) { 31 | val max = args(1).toInt 32 | val k = args(2).toInt 33 | val style = args(3) 34 | 35 | val cs = Source.fromFile(args(0)).getLines.map { line => 36 | val Array(lat, lng) = line.split("\t").map(_.toDouble) 37 | Coordinate.fromDegrees(lat, lng) 38 | }.toIndexedSeq 39 | println("Loaded...") 40 | 41 | val xs = scala.util.Random.shuffle(cs).take(max) 42 | 43 | println(Coordinate.centroid(xs)) 44 | 45 | val clusterer = new KMeans 46 | val clusters = clusterer.cluster(xs, k) 47 | clusters.foreach { 48 | case c => println("" + 49 | style + "" + 50 | c.getLngDegrees + "," + c.getLatDegrees + 51 | "") 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/topo/gaz/CorpusGazetteerReader.scala: -------------------------------------------------------------------------------- 1 | 
/////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.topo.gaz 17 | 18 | import java.util.Iterator 19 | import scala.collection.JavaConversions._ 20 | 21 | import opennlp.fieldspring.tr.text.Corpus 22 | import opennlp.fieldspring.tr.text.Token 23 | import opennlp.fieldspring.tr.topo.Location 24 | 25 | class CorpusGazetteerReader(private val corpus: Corpus[_ <: Token]) 26 | extends GazetteerReader { 27 | 28 | private val it = corpus.flatMap(_.flatMap { 29 | _.getToponyms.flatMap(_.getCandidates) 30 | }).toIterator 31 | 32 | def hasNext: Boolean = it.hasNext 33 | def next: Location = it.next 34 | 35 | def close() { 36 | corpus.close() 37 | } 38 | } 39 | 40 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/topo/gaz/geonames/GeoNamesParser.scala: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this 
file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.topo.gaz.geonames 17 | 18 | import java.io._ 19 | import scala.collection.JavaConversions._ 20 | import scala.io._ 21 | 22 | import opennlp.fieldspring.tr.text.Corpus 23 | import opennlp.fieldspring.tr.text.Token 24 | import opennlp.fieldspring.tr.topo.Location 25 | 26 | class GeoNamesParser(private val file: File) { 27 | val locs = scala.collection.mutable.Map[String, List[(Double, Double)]]() 28 | 29 | Source.fromFile(file).getLines.foreach { line => 30 | val Array(lat, lng) = line.split("\t").map(_.toDouble) 31 | 32 | } 33 | } 34 | 35 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/topo/util/CodeConverter.scala: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.topo.util 17 | 18 | import java.io._ 19 | import java.io.InputStream 20 | import scala.collection.mutable.Map 21 | 22 | class CodeConverter(in: InputStream) { 23 | def this() = this { 24 | getClass.getResourceAsStream("/data/geo/country-codes.txt") 25 | } 26 | 27 | case class Country( 28 | name: String, 29 | fips: Option[String], 30 | iso: Option[(String, String, Int)], 31 | stanag: Option[String], 32 | tld: Option[String]) 33 | 34 | private val countriesF = Map[String, Country]() 35 | private val countriesI = Map[String, Country]() 36 | private val reader = new BufferedReader(new InputStreamReader(in)) 37 | 38 | private var line = reader.readLine 39 | while (line != null) { 40 | val fs = line.split("\t") 41 | val country = Country( 42 | fs(0), 43 | if (fs(1) == "-") None else Some(fs(1)), 44 | if (fs(2) == "-") None else Some(fs(2), fs(3), fs(4).toInt), 45 | if (fs(5) == "-") None else Some(fs(5)), 46 | if (fs(6) == "-") None else Some(fs(6)) 47 | ) 48 | country.fips match { 49 | case Some(fips) => countriesF(fips) = country 50 | case _ => 51 | } 52 | country.iso match { 53 | case Some((iso2, _, _)) => countriesI(iso2) = country 54 | case _ => 55 | } 56 | line = reader.readLine 57 | } 58 | reader.close() 59 | 60 | def convertFipsToIso2(code: String): Option[String] = 61 | countriesF.get(code).flatMap(_.iso.map(_._1)) 62 | 63 | def convertIso2ToFips(code: String): Option[String] = 64 | 
countriesI.get(code).flatMap(_.fips) 65 | } 66 | 67 | object CodeConverter { 68 | def main(args: Array[String]) { 69 | val converter = new CodeConverter() 70 | println(args(0) match { 71 | case "f2i" => converter.convertIso2ToFips(args(1)) 72 | case "i2f" => converter.convertFipsToIso2(args(1)) 73 | }) 74 | } 75 | } 76 | 77 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/tpp/FileTravelCoster.scala: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.tpp 2 | 3 | import opennlp.fieldspring.tr.util._ 4 | import opennlp.fieldspring.tr.text._ 5 | 6 | import java.io._ 7 | import java.util.ArrayList 8 | import scala.collection.JavaConversions._ 9 | 10 | class FileTravelCoster(val inputFile:String, val corpus:StoredCorpus, val dpc:Double) extends TravelCoster { 11 | 12 | val gaussianTC = new GaussianTravelCoster 13 | 14 | val relevantMarkets = new scala.collection.mutable.HashSet[Int] 15 | 16 | for(doc <- corpus) { 17 | for(sent <- doc) { 18 | for(toponym <- sent.getToponyms) { 19 | for(loc <- toponym.getCandidates) { 20 | //for(coord <- loc.getRegion.getRepresentatives) { 21 | relevantMarkets.add(TopoUtil.getCellNumber(loc.getRegion.getCenter, dpc)) 22 | //} 23 | } 24 | } 25 | } 26 | } 27 | 28 | val probs = new scala.collection.mutable.HashMap[Int, scala.collection.mutable.HashMap[Int, Double]] 29 | val costs = new scala.collection.mutable.HashMap[(Int, Int), Double] 30 | 31 | val in = new DataInputStream(new FileInputStream(inputFile)) 32 | 33 | try { 34 | while(true) { 35 | val id1 = in.readInt 36 | val id2 = in.readInt 37 | val prob = in.readDouble 38 | 39 | if(relevantMarkets.contains(id1) && relevantMarkets.contains(id2)) { 40 | val destinations = probs.getOrElse(id1, new scala.collection.mutable.HashMap[Int, Double]) 41 | destinations.put(id2, prob) 42 | probs.put(id1, destinations) 43 | } 44 | } 45 | } catch { 46 | case 
e:Exception => 47 | } 48 | 49 | in.close 50 | 51 | for((id1, destinations) <- probs) { 52 | val total = destinations.map(_._2).sum 53 | for((id2, cost) <- destinations) { 54 | costs.put((id1, id2), 1.0-cost/total) 55 | } 56 | } 57 | 58 | probs.clear 59 | 60 | println("Read "+costs.size+" relevant probabilities.") 61 | 62 | def apply(m1:Market, m2:Market): Double = { 63 | //if(costs.contains((m1.id, m2.id))) println("Returned cost of "+costs((m1.id, m2.id))+" from file.") 64 | //else println("Return default cost of "+gaussianTC(m1, m2)) 65 | costs.getOrElse((m1.id, m2.id), gaussianTC(m1, m2)) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/tpp/GaussianPurchaseCoster.scala: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.tpp 2 | 3 | import opennlp.fieldspring.tr.topo._ 4 | 5 | class GaussianPurchaseCoster extends PurchaseCoster { 6 | 7 | val VARIANCE_KM = 161.0 8 | val variance = VARIANCE_KM / 6372.8 9 | 10 | def g(x:Double, y:Double) = GaussianUtil.g(x,y) 11 | 12 | //val maxHeight = g(0.0,0.0) 13 | 14 | val storedCosts = new scala.collection.mutable.HashMap[(Int, Int), Double] // (location.id, market.id) => distance 15 | def cost(l:Location, m:Market): Double = { 16 | val key = (l.getId, m.id) 17 | if(storedCosts.contains(key)) 18 | storedCosts(key) 19 | else { 20 | val cost = 1.0-g(l.getRegion.distance(m.centroid)/variance, 0)///max 21 | //val cost = (maxHeight-g(l.getRegion.distance(m.centroid)/variance, 0))/maxHeight///max 22 | storedCosts.put(key, cost) 23 | cost 24 | } 25 | } 26 | 27 | def apply(m:Market, potLoc:PotentialLocation): Double = { 28 | cost(potLoc.loc, m) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/tpp/GaussianTravelCoster.scala: 
/** Travel coster scoring inter-market distance with a Gaussian kernel. */
class GaussianTravelCoster extends TravelCoster {

  // Kernel scale: 1610 km expressed in Earth radii (~6372.8 km).
  val VARIANCE_KM = 1610.0
  val variance = VARIANCE_KM / 6372.8

  def g(x: Double, y: Double) = GaussianUtil.g(x, y)

  val maxHeight = g(0.0, 0.0)

  /** Normalized cost in [0, 1): 0 for coincident markets, approaching 1 as
    * markets grow far apart. */
  def apply(m1: Market, m2: Market): Double = {
    val scaled = m1.centroid.distance(m2.centroid) / variance
    (maxHeight - g(scaled, 0)) / maxHeight
  }

  /* old implementation:
  def apply(m1:Market, m2:Market): Double = {
    1.0-g(m1.centroid.distance(m2.centroid)/variance, 0)
  }*/
}

/** Ad-hoc driver printing the normalized cost at a few sample distances. */
object GaussianTravelCoster extends App {
  val gtc = new GaussianTravelCoster
  for (d <- Seq(0.5, 1.0, 2.0))
    println((gtc.maxHeight - gtc.g(d, 0)) / gtc.maxHeight)
}

/** Bivariate Gaussian density helpers. */
object GaussianUtil {
  // Normalizing constant of the bivariate normal density.
  def left(sig_x: Double, sig_y: Double, rho: Double) =
    1.0 / (2 * math.Pi * sig_x * sig_y * math.pow(1 - rho * rho, .5))

  // Exponential part of the bivariate normal density.
  def right(x: Double, y: Double, mu_x: Double, mu_y: Double,
            sig_x: Double, sig_y: Double, rho: Double) =
    math.exp(-1.0 / (2 * (1 - rho * rho)) * (
      math.pow(x - mu_x, 2) / math.pow(sig_x, 2) +
      math.pow(y - mu_y, 2) / math.pow(sig_y, 2) -
      (2 * rho * (x - mu_x) * (y - mu_y)) / (sig_x * sig_y)))

  /** Full bivariate normal density. */
  def f(x: Double, y: Double, mu_x: Double, mu_y: Double,
        sig_x: Double, sig_y: Double, rho: Double) =
    left(sig_x, sig_y, rho) * right(x, y, mu_x, mu_y, sig_x, sig_y, rho)

  /** Standard bivariate normal density (zero means, unit sigmas, rho = 0). */
  def g(x: Double, y: Double) = f(x, y, 0, 0, 1, 1, 0)
}
/** Base class for strategies that partition a document's potential toponym
  * locations into markets. */
abstract class MarketCreator(val doc: Document[StoredToken]) {
  def apply: List[Market]
}

/** Purchase coster backed by per-toponym maxent context models read from
  * `modelDirPath` (one binary "<toponym>.mxm" file per toponym form, with
  * underscores standing in for spaces in the file name). */
class MaxentPurchaseCoster(corpus: StoredCorpus, modelDirPath: String) extends PurchaseCoster {

  // Number of context tokens considered around a toponym occurrence.
  val windowSize = 20

  val modelDir = new File(modelDirPath)

  // toponym surface form (underscores restored to spaces) -> maxent model
  val toponymsToModels: Map[String, AbstractModel] =
    (for (file <- modelDir.listFiles.filter(_.getName.endsWith(".mxm"))) yield {
      val dataInputStream = new DataInputStream(new FileInputStream(file))
      val reader = new BinaryGISModelReader(dataInputStream)
      val model = reader.getModel
      (file.getName.dropRight(4).replaceAll("_", " "), model)
    }).toMap

  // (doc, token, gaz candidate) -> 1 - model probability mass
  val potLocsToCosts = new scala.collection.mutable.HashMap[PotentialLocation, Double]

  for (doc <- corpus) {
    val docAsArray = TextUtil.getDocAsArrayNoFilter(doc)
    var tokIndex = 0
    for (token <- docAsArray) {
      // FIX: use Scala Map's `contains` rather than `containsKey`, which only
      // resolved through an implicit JavaConversions wrapper of the Map.
      if (token.isToponym && token.asInstanceOf[Toponym].getAmbiguity > 0
          && toponymsToModels.contains(token.getForm)) {
        val toponym = token.asInstanceOf[Toponym]
        val contextFeatures = TextUtil.getContextFeatures(docAsArray, tokIndex, windowSize, Set[String]())

        val indexToWeightMap = MaxentResolver.getIndexToWeightMap(toponymsToModels(token.getForm), contextFeatures)
        for ((gazIndex, weight) <- indexToWeightMap.toList.sortBy(_._1)) {
          val loc = toponym.getCandidates.get(gazIndex)
          val potLoc = new PotentialLocation(doc.getId, tokIndex, gazIndex, loc)
          // Cost is the probability mass NOT assigned to this candidate.
          potLocsToCosts.put(potLoc, 1.0 - weight)
        }
      }
      tokIndex += 1
    }
  }

  def apply(m: Market, potLoc: PotentialLocation): Double = {
    // NOTE(review): this ignores potLocsToCosts and returns a flat 1.0; the
    // commented-out lookup suggests a disabled experiment. Preserved as-is —
    // TODO confirm intent.
    1.0 //potLocsToCosts.getOrElse(potLoc, 1.0)
  }
}

/** Purchase coster combining several costers by multiplying their costs. */
class MultiPurchaseCoster(val purchaseCosters: List[PurchaseCoster]) extends PurchaseCoster {

  def apply(m: Market, potLoc: PotentialLocation): Double =
    purchaseCosters.map(pc => pc(m, potLoc)).reduce(_ * _)
}
/src/main/scala/opennlp/fieldspring/tr/tpp/PurchaseCoster.scala: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.tpp 2 | 3 | abstract class PurchaseCoster { 4 | 5 | def apply(m:Market, potLoc:PotentialLocation): Double 6 | } 7 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/tpp/SimpleContainmentPurchaseCoster.scala: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.tpp 2 | 3 | class SimpleContainmentPurchaseCoster extends PurchaseCoster { 4 | 5 | def apply(m:Market, potLoc:PotentialLocation): Double = { 6 | if(m.locations.map(_._2).toSet.contains(potLoc)) 7 | 1.0 8 | else 9 | Double.PositiveInfinity 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/tpp/SimpleDistanceTravelCoster.scala: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.tpp 2 | 3 | import opennlp.fieldspring.tr.util._ 4 | 5 | class SimpleDistanceTravelCoster extends TravelCoster { 6 | 7 | val storedDistances = new scala.collection.mutable.HashMap[(Int, Int), Double] 8 | val distanceTable = new DistanceTable 9 | 10 | def apply(m1:Market, m2:Market): Double = { 11 | 12 | if(storedDistances.contains((m1.id, m2.id))) { 13 | //println(storedDistances((m1.id, m2.id))) 14 | storedDistances((m1.id, m2.id)) 15 | } 16 | 17 | else { 18 | var minDist = Double.PositiveInfinity 19 | for(loc1 <- m1.locations.map(_._2).map(_.loc)) { 20 | for(loc2 <- m2.locations.map(_._2).map(_.loc)) { 21 | val dist = distanceTable.distance(loc1, loc2) 22 | if(dist < minDist) 23 | minDist = dist 24 | } 25 | } 26 | 27 | storedDistances.put((m1.id, m2.id), minDist) 28 | //println(minDist) 29 | minDist 30 | } 31 | } 32 | } 33 | 
/** A travelling-purchaser-problem instance: a purchase coster, a travel
  * coster, and (once assigned) the markets available to visit. */
class TPPInstance(val purchaseCoster: PurchaseCoster,
                  val travelCoster: TravelCoster) {

  var markets: List[Market] = null

  /** Total cost of a tour: travel between consecutive market visits plus the
    * purchase cost of everything bought at each visit; infinite for null. */
  def computeTourCost(tour: List[MarketVisit]): Double = {
    if (tour == null) return Double.PositiveInfinity
    var total = 0.0
    var previous: MarketVisit = null
    for (visit <- tour) {
      if (previous != null)
        total += travelCoster(previous.market, visit.market)

      for ((_, potLoc) <- visit.purchasedLocations)
        total += purchaseCoster(visit.market, potLoc)

      previous = visit
    }
    total
  }
}

/** A market: a cell id plus the potential location it offers for each
  * toponym mention. */
class Market(val id: Int,
             val locations: Map[ToponymMention, PotentialLocation]) {

  def size = locations.size

  // Arithmetic mean of the member locations' region centers.
  // NOTE(review): lat/lng are averaged as raw numbers and re-read as
  // radians; relies on getLat/getLng being radian-valued — TODO confirm.
  lazy val centroid: Coordinate = {
    val lat: Double = locations.valuesIterator.map(_.loc.getRegion.getCenter.getLat).sum / locations.size
    val lng: Double = locations.valuesIterator.map(_.loc.getRegion.getCenter.getLng).sum / locations.size
    Coordinate.fromRadians(lat, lng)
  }
}

/** One candidate gazetteer entry for a toponym token in a document. */
class PotentialLocation(val docId: String,
                        val tokenIndex: Int,
                        val gazIndex: Int,
                        val loc: Location) {

  override def toString: String = docId + ":" + tokenIndex + ":" + gazIndex

  override def equals(other: Any): Boolean = other match {
    case o: PotentialLocation =>
      docId.equals(o.docId) && tokenIndex == o.tokenIndex &&
        gazIndex == o.gazIndex && loc.equals(o.loc)
    case _ => false
  }

  val S = 41 * 41
  val C = S * 41

  override def hashCode: Int =
    C * (C + tokenIndex) + S * (S + docId.hashCode) + 41 * (41 * gazIndex) + loc.getId
}

/** Base class assigning a travel cost between two markets. */
abstract class TravelCoster {

  // Set via setDoc when a coster needs to know which document it is scoring.
  var doc: Document[StoredToken] = null

  def setDoc(doc: Document[StoredToken]) {
    this.doc = doc
  }

  def apply(m1: Market, m2: Market): Double
}

/** Prints the arithmetic mean of its numeric command-line arguments. */
object Average extends App {
  println(args.map(_.toDouble).sum / args.length)
}

/** Memoized location-to-location distances keyed by the id-ordered pair;
  * single-representative pairs are cheap and skip the cache. */
class DistanceTable {

  val storedDistances = new scala.collection.mutable.HashMap[(Int, Int), Double]

  def distance(l1: Location, l2: Location): Double = {
    // Order the pair by id so (a, b) and (b, a) share one cache entry.
    val (leftLoc, rightLoc) = if (l1.getId > l2.getId) (l2, l1) else (l1, l2)

    if (leftLoc.getRegion.getRepresentatives.size == 1 &&
        rightLoc.getRegion.getRepresentatives.size == 1) {
      leftLoc.distance(rightLoc)
    } else {
      val key = (leftLoc.getId, rightLoc.getId)
      storedDistances.get(key) match {
        case Some(d) => d
        case None =>
          val d = leftLoc.distance(rightLoc)
          storedDistances.put(key, d)
          d
      }
    }
  }
}
| } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/util/StopwordUtil.scala: -------------------------------------------------------------------------------- 1 | package opennlp.fieldspring.tr.util 2 | 3 | import java.io._ 4 | 5 | object StopwordUtil { 6 | 7 | def populateStoplist(filename: String): Set[String] = { 8 | var stoplist:Set[String] = Set() 9 | io.Source.fromFile(filename).getLines.foreach(line => stoplist += line) 10 | stoplist.toSet() 11 | stoplist 12 | } 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/util/cluster/KMeans.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package opennlp.fieldspring.tr.util.cluster 17 | 18 | 19 | import java.io._ 20 | import scala.math._ 21 | import scala.collection.immutable.Vector 22 | import scala.collection.mutable.Buffer 23 | import scala.collection.JavaConversions._ 24 | import scala.util.Random 25 | 26 | trait Geometry[A] { 27 | def distance(x: A)(y: A): Double 28 | def centroid(ps: Seq[A]): A 29 | 30 | def nearest(cs: Seq[A], p: A): Int = 31 | cs.map(distance(p)(_)).zipWithIndex.min._2 32 | } 33 | 34 | trait Clusterer { 35 | def clusterList[A](ps: java.util.List[A], k: Int)(implicit g: Geometry[A]): java.util.List[A] 36 | def cluster[A](ps: Seq[A], k: Int)(implicit g: Geometry[A]): Seq[A] 37 | } 38 | 39 | class KMeans extends Clusterer { 40 | def clusterList[A](ps: java.util.List[A], k: Int)(implicit g: Geometry[A]): java.util.List[A] = { 41 | cluster(ps.toIndexedSeq, k)(g) 42 | } 43 | 44 | def cluster[A](ps: Seq[A], k: Int)(implicit g: Geometry[A]): Seq[A] = { 45 | var ips = ps.toIndexedSeq 46 | var cs = init(ips, k) 47 | var as = ps.map(g.nearest(cs, _)) 48 | var done = false 49 | val clusters = IndexedSeq.fill(k)(Buffer[A]()) 50 | while (!done) { 51 | clusters.foreach(_.clear) 52 | 53 | as.zipWithIndex.foreach { case (i, j) => 54 | clusters(i) += ips(j) 55 | } 56 | 57 | cs = clusters.map(g.centroid(_)) 58 | 59 | val bs = ips.map(g.nearest(cs, _)) 60 | done = as == bs 61 | as = bs 62 | } 63 | cs 64 | } 65 | 66 | def init[A](ps: Seq[A], k: Int): IndexedSeq[A] = { 67 | (1 to k).map(_ => ps(Random.nextInt(ps.size))) 68 | } 69 | } 70 | 71 | object EuclideanGeometry { 72 | type Point = (Double, Double) 73 | 74 | implicit def g = new Geometry[Point] { 75 | def distance(x: Point)(y: Point): Double = 76 | sqrt(pow(x._1 - y._1, 2) + pow(x._2 - y._2, 2)) 77 | 78 | def centroid(ps: Seq[Point]): Point = { 79 | def pointPlus(x: Point, y: Point) = (x._1 + y._1, x._2 + y._2) 80 | ps.reduceLeft(pointPlus) match { 81 | case (a, b) => (a / ps.size, b / ps.size) 82 | } 83 | } 84 | } 85 | } 86 | 
87 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/tr/util/sanity/CandidateCheck.scala: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (C) 2010 Travis Brown, The University of Texas at Austin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | /////////////////////////////////////////////////////////////////////////////// 16 | package opennlp.fieldspring.tr.util.sanity 17 | 18 | import java.io._ 19 | import scala.collection.JavaConversions._ 20 | 21 | import opennlp.fieldspring.tr.text.Corpus 22 | import opennlp.fieldspring.tr.text.Toponym 23 | import opennlp.fieldspring.tr.text.io.TrXMLDirSource 24 | import opennlp.fieldspring.tr.text.prep.OpenNLPTokenizer 25 | import opennlp.fieldspring.tr.topo.Location 26 | 27 | object CandidateCheck extends App { 28 | override def main(args: Array[String]) { 29 | val tokenizer = new OpenNLPTokenizer 30 | val corpus = Corpus.createStreamCorpus 31 | val cands = scala.collection.mutable.Map[java.lang.String, java.util.List[Location]]() 32 | 33 | corpus.addSource(new TrXMLDirSource(new File(args(0)), tokenizer)) 34 | corpus.foreach { _.foreach { _.getToponyms.foreach { 35 | case toponym: Toponym => { 36 | if (!cands.contains(toponym.getForm)) { 37 | //println("Doesn't contain: " + toponym.getForm) 38 | cands(toponym.getForm) = toponym.getCandidates 39 | } else { 40 | val prev = cands(toponym.getForm) 41 | val here = toponym.getCandidates 42 | //println("Contains: " + toponym.getForm) 43 | if (prev.size != here.size) { 44 | println("=====Size error for " + toponym.getForm + ": " + prev.size + " " + here.size) 45 | } else { 46 | prev.zip(here).foreach { case (p, h) => 47 | println(p.getRegion.getCenter + " " + h.getRegion.getCenter) 48 | //case (p, h) if p != h => println("=====Mismatch for " + toponym.getForm) 49 | //case _ => () 50 | } 51 | } 52 | } 53 | } 54 | }}} 55 | } 56 | } 57 | 58 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/util/Serializer.scala: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // Serializer.scala 3 | // 4 | // Copyright (C) 2011 Ben Wing, The 
/** A type class for converting to and from values in serialized form. */
@annotation.implicitNotFound(msg = "No implicit Serializer defined for ${T}.")
trait Serializer[T] {
  def deserialize(foo: String): T
  def serialize(foo: T): String
  /**
   * Validate the serialized form of the string. Return true if valid,
   * false otherwise. Can be overridden for efficiency. By default,
   * simply tries to deserialize, and checks whether an error was thrown.
   */
  def validate_serialized_form(foo: String): Boolean = {
    try {
      deserialize(foo)
      true
    } catch {
      // FIX: the original `case _ =>` matched every Throwable, silently
      // swallowing fatal JVM errors and control-flow throwables (breaking,
      // e.g., non-local returns from an enclosing closure). Only ordinary
      // exceptions indicate an invalid serialized form.
      case _: Exception => false
    }
  }
}
///////////////////////////////////////////////////////////////////////////////

package opennlp.fieldspring.util

/**
 * Compare two "word = count" dump files: a geographically-relevant Wikipedia
 * word-count file (args(0)) and a full-corpus word-count file (args(1)).
 * Prints every word of the full corpus with the ratio of its geo relative
 * frequency to its overall relative frequency, sorted by descending ratio.
 */
object WikiRelFreqs extends App {

  val geoFreqs = getFreqs(args(0))
  val allFreqs = getFreqs(args(1))

  // Ratio of geo frequency to overall frequency; words absent from the geo
  // corpus get 0. Ties in the ratio are broken alphabetically.
  val relFreqs = allFreqs.map(p => (p._1, geoFreqs.getOrElse(p._1, 0.0) / p._2))
    .toList.sortWith((x, y) => if (x._2 != y._2) x._2 > y._2 else x._1 < y._1)

  relFreqs.foreach(println)

  /**
   * Read lines of the form "word = count" from `filename` and return a map
   * from lowercased word to relative frequency (count / total count). Lines
   * not matching the pattern are skipped. Progress is printed every 10M
   * lines (including line 0, as a startup marker).
   */
  def getFreqs(filename: String): Map[String, Double] = {
    val wordCountRE = """^(\w+)\s=\s(\d+)$""".r
    val source = scala.io.Source.fromFile(filename)
    val freqs = new scala.collection.mutable.HashMap[String, Long]
    var total = 0L
    var lineCount = 0

    try {
      for (line <- source.getLines) {
        line match {
          case wordCountRE(word, count) =>
            val lowerWord = word.toLowerCase
            // BUG FIX: accumulate with toLong, not toInt. Counts in a
            // full-Wikipedia dump can exceed Int.MaxValue, in which case
            // toInt throws NumberFormatException — and summing Ints into a
            // Long-valued map defeated the point of using Long at all.
            val c = count.toLong
            freqs.put(lowerWord, freqs.getOrElse(lowerWord, 0L) + c)
            total += c
          case _ => ()
        }
        if (lineCount % 10000000 == 0)
          println(filename + " " + lineCount)
        lineCount += 1
      }
    } finally {
      // BUG FIX: the original never closed the Source, leaking a file handle
      // per call.
      source.close()
    }

    freqs.map(p => (p._1, p._2.toDouble / total)).toMap
  }
}
// (Apache License 2.0 header continues; see LICENSE.txt)
///////////////////////////////////////////////////////////////////////////////

package opennlp.fieldspring.worddist

/**
 * Factory for unigram distributions smoothed with Dirichlet discounting,
 * where the discount factor depends on the size of the document.
 */
class DirichletUnigramWordDistFactory(
  interpolate_string: String,
  val dirichlet_factor: Double
) extends DiscountedUnigramWordDistFactory(interpolate_string != "no") {
  // Each distribution created here shares this factory's dirichlet_factor.
  def create_word_dist(note_globally: Boolean) =
    new DirichletUnigramWordDist(this, note_globally)
}

/**
 * Unigram distribution using Dirichlet discounting: the probability mass
 * reserved for unseen words shrinks as the document grows.
 */
class DirichletUnigramWordDist(
  factory: WordDistFactory,
  note_globally: Boolean
) extends DiscountedUnigramWordDist(
  factory, note_globally
) {
  override protected def imp_finish_after_global() {
    val mu =
      factory.asInstanceOf[DirichletUnigramWordDistFactory].dirichlet_factor
    val tokens = model.num_tokens.toDouble
    // Unseen mass is 1 - N/(N + mu); larger documents reserve less.
    unseen_mass = 1.0 - (tokens / (tokens + mu))
    super.imp_finish_after_global()
  }
}
// ----------------------------------------------------------------------------
// src/main/scala/opennlp/fieldspring/worddist/JelinekMercerUnigramWordDist.scala
// ----------------------------------------------------------------------------

package opennlp.fieldspring.worddist

/**
 * This class implements Jelinek-Mercer discounting, the simplest type of
 * discounting where we just use a constant discount factor.
24 | */ 25 | class JelinekMercerUnigramWordDistFactory( 26 | interpolate_string: String, 27 | val jelinek_factor: Double 28 | ) extends DiscountedUnigramWordDistFactory(interpolate_string != "no") { 29 | def create_word_dist(note_globally: Boolean) = 30 | new JelinekMercerUnigramWordDist(this, note_globally) 31 | } 32 | 33 | class JelinekMercerUnigramWordDist( 34 | factory: WordDistFactory, 35 | note_globally: Boolean 36 | ) extends DiscountedUnigramWordDist( 37 | factory, note_globally 38 | ) { 39 | override protected def imp_finish_after_global() { 40 | unseen_mass = (factory.asInstanceOf[JelinekMercerUnigramWordDistFactory]. 41 | jelinek_factor) 42 | super.imp_finish_after_global() 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/opennlp/fieldspring/worddist/UnsmoothedNgramWordDist.scala: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////// 2 | // UnsmoothedNgramWordDist.scala 3 | // 4 | // Copyright (C) 2010, 2011, 2012 Ben Wing, The University of Texas at Austin 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 
///////////////////////////////////////////////////////////////////////////////

package opennlp.fieldspring.worddist

// Factory for unsmoothed (maximum-likelihood) n-gram distributions.
class UnsmoothedNgramWordDistFactory extends NgramWordDistFactory {
  def create_word_dist(note_globally: Boolean) =
    new UnsmoothedNgramWordDist(this, note_globally)

  // Nothing to aggregate globally when no smoothing is performed.
  def finish_global_distribution() {
  }
}

/**
 * N-gram distribution with no smoothing: an n-gram's probability is its raw
 * relative frequency, so unseen n-grams get probability zero. The various
 * divergence/similarity computations are unimplemented and assert-fail if
 * called.
 */
class UnsmoothedNgramWordDist(
  gen_factory: WordDistFactory,
  note_globally: Boolean
) extends NgramWordDist(gen_factory, note_globally) {
  import NgramStorage.Ngram

  type TThis = UnsmoothedNgramWordDist

  def innerToString = ""

  // For some reason, retrieving this value from the model is fantastically slow
  // so it is cached here once in imp_finish_after_global.
  var num_tokens = 0.0

  protected def imp_finish_after_global() {
    num_tokens = model.num_tokens
  }

  // Not implemented for unsmoothed distributions; asserts if called.
  def fast_kl_divergence(cache: KLDivergenceCache, other: WordDist,
    partial: Boolean = false) = {
    assert(false, "Not implemented")
    0.0
  }

  // Not implemented for unsmoothed distributions; asserts if called.
  def cosine_similarity(other: WordDist, partial: Boolean = false,
    smoothed: Boolean = false) = {
    assert(false, "Not implemented")
    0.0
  }

  // Not implemented for unsmoothed distributions; asserts if called.
  def kl_divergence_34(other: NgramWordDist) = {
    assert(false, "Not implemented")
    0.0
  }

  /**
   * Actual implementation of steps 3 and 4 of KL-divergence computation, given
   * a value that we may want to compute as part of step 2.
   * Not implemented; asserts if called.
   */
  def inner_kl_divergence_34(other: TThis,
    overall_probs_diff_words: Double) = {
    assert(false, "Not implemented")
    0.0
  }

  // Maximum-likelihood estimate: raw count divided by cached total tokens.
  def lookup_ngram(ngram: Ngram) =
    model.get_item(ngram).toDouble / num_tokens
}
// ----------------------------------------------------------------------------
// src/test/scala/opennlp/fieldspring/topo/Coordinate.scala
// ----------------------------------------------------------------------------
// (Apache License 2.0 header; see LICENSE.txt)
///////////////////////////////////////////////////////////////////////////////
package opennlp.fieldspring.topo

import org.specs._
import org.specs.runner._

// JUnit4 adapter so the specification runs under a JUnit test runner.
class CoordinateTest extends JUnit4(CoordinateSpec)

/** Specification of Coordinate construction, equality, and distance. */
object CoordinateSpec extends Specification {

  "A degree-constructed coordinate" should {
    // 45°N, 45°W — both components should convert to ±pi/4 radians.
    val coord = Coordinate.fromDegrees(45, -45)
    "have the correct radian value for latitude" in {
      coord.getLat must_== math.Pi / 4
    }

    "have the correct radian value for longitude" in {
      coord.getLng must_== -math.Pi / 4
    }

    "be equal to its radian-constructed equivalent" in {
      coord must_== Coordinate.fromRadians(math.Pi / 4, -math.Pi / 4)
    }
  }

  "A coordinate at the origin" should {
    val origin = Coordinate.fromDegrees(0, 0)
    "have the correct angular distance from a coordinate 1 radian away horizontally" in {
      origin.distance(Coordinate.fromRadians(0, 1)) must_== 1
    }

    "have the correct distance from a coordinate 1 radian away vertically" in {
      origin.distance(Coordinate.fromRadians(1, 0)) must_== 1
    }
  }
}