├── .classpath ├── .gitignore ├── .project ├── AwsCredentials.properties ├── LICENSE.txt ├── README ├── build.xml ├── example ├── counts │ ├── corpus.a │ ├── corpus.en │ ├── corpus.es │ └── thrax-phrase.conf ├── europarl.unified.1 ├── hiero.conf ├── nist09.unified.1 └── samt.conf ├── lib ├── aws-java-sdk-1.1.3.jar ├── commons-lang3-3.1.jar ├── hadoop-common-2.5.2.jar ├── hadoop-mapreduce-client-core-2.5.2.jar ├── jerboa.jar └── testng-5.8-jdk15.jar ├── scripts ├── berant_to_reference.py ├── create_glue_grammar.sh ├── filter_rules.sh └── run_on_amazon.sh ├── src └── edu │ └── jhu │ └── thrax │ ├── Thrax.java │ ├── datatypes │ ├── AlignedSentencePair.java │ ├── Alignment.java │ ├── ArrayAlignment.java │ ├── HierarchicalRule.java │ ├── IntPair.java │ └── PhrasePair.java │ ├── distributional │ ├── ContextPhrase.java │ ├── ContextPhraseExtractor.java │ ├── FeatureClass.java │ ├── FeatureEncoder.java │ ├── FeatureSet.java │ └── FeatureTypes.java │ ├── extraction │ ├── HierarchicalRuleExtractor.java │ ├── HieroLabeler.java │ ├── LabelCache.java │ ├── Labeling.java │ ├── ManualSpanLabeler.java │ ├── SAMTLabeler.java │ └── SpanLabeler.java │ ├── hadoop │ ├── comparators │ │ ├── FieldComparator.java │ │ ├── PrimitiveArrayMarginalComparator.java │ │ └── TextMarginalComparator.java │ ├── datatypes │ │ ├── AlignedRuleWritable.java │ │ ├── AlignmentWritable.java │ │ ├── Annotation.java │ │ ├── FeatureMap.java │ │ ├── FeaturePair.java │ │ ├── FeatureValue.java │ │ ├── IntPair.java │ │ ├── PrimitiveUtils.java │ │ ├── RuleWritable.java │ │ └── TextPair.java │ ├── distributional │ │ ├── CommonLSH.java │ │ ├── ContextWritable.java │ │ ├── DistributionalContextCombiner.java │ │ ├── DistributionalContextMapper.java │ │ ├── DistributionalContextReducer.java │ │ └── SignatureWritable.java │ ├── extraction │ │ ├── ExtractionCombiner.java │ │ ├── ExtractionMapper.java │ │ ├── ExtractionReducer.java │ │ ├── HierarchicalRuleWritableExtractor.java │ │ ├── RuleWritableExtractor.java │ │ └── RuleWritableExtractorFactory.java │ ├── features │ │ ├── AbstractnessFeature.java │ │ ├── AdjacentNonTerminalsFeature.java │ │ ├── CharacterCompressionRatioFeature.java │ │ ├── CharacterCountDifferenceFeature.java │ │ ├── ConsumeSourceTerminalsFeature.java │ │ ├── Feature.java │ │ ├── GlueRuleFeature.java │ │ ├── IdentityFeature.java │ │ ├── LexicalityFeature.java │ │ ├── MonotonicFeature.java │ │ ├── PhrasePenaltyFeature.java │ │ ├── ProduceTargetTerminalsFeature.java │ │ ├── SimpleFeature.java │ │ ├── SimpleFeatureFactory.java │ │ ├── SourceWordCounterFeature.java │ │ ├── TargetWordCounterFeature.java │ │ ├── WordCompressionRatioFeature.java │ │ ├── WordCountDifferenceFeature.java │ │ ├── WordLengthDifferenceFeature.java │ │ ├── WordLexicalProbabilityCalculator.java │ │ ├── XRuleFeature.java │ │ ├── annotation │ │ │ ├── AlignmentFeature.java │ │ │ ├── AnnotationFeature.java │ │ │ ├── AnnotationFeatureFactory.java │ │ │ ├── AnnotationFeatureJob.java │ │ │ ├── AnnotationPassthroughFeature.java │ │ │ ├── AnnotationReducer.java │ │ │ ├── CountFeature.java │ │ │ ├── LogCountFeature.java │ │ │ ├── RarityPenaltyFeature.java │ │ │ ├── SourceGivenTargetLexicalProbabilityFeature.java │ │ │ ├── TargetGivenSourceLexicalProbabilityFeature.java │ │ │ ├── UnalignedSourceCounterFeature.java │ │ │ └── UnalignedTargetCounterFeature.java │ │ ├── mapred │ │ │ ├── CountOfRuleCountsEstimationJob.java │ │ │ ├── GoodTuringSmoothedSourcePhraseGivenTargetFeature.java │ │ │ ├── GoodTuringSmoothedTargetPhraseGivenSourceFeature.java │ │ │ ├── 
LhsGivenSourcePhraseFeature.java │ │ │ ├── LhsGivenTargetPhraseFeature.java │ │ │ ├── MapReduceFeature.java │ │ │ ├── MapReduceFeatureFactory.java │ │ │ ├── SourceCountFeature.java │ │ │ ├── SourcePhraseGivenLHSFeature.java │ │ │ ├── SourcePhraseGivenTargetFeature.java │ │ │ ├── SourcePhraseGivenTargetandLHSFeature.java │ │ │ ├── TargetCountFeature.java │ │ │ ├── TargetPhraseGivenLHSFeature.java │ │ │ ├── TargetPhraseGivenSourceFeature.java │ │ │ ├── TargetPhraseGivenSourceandLHSFeature.java │ │ │ └── coc │ │ │ │ ├── CountOfCountsEstimator.java │ │ │ │ └── GoodTuringSmoother.java │ │ └── pivot │ │ │ ├── NonAggregatingPivotedFeature.java │ │ │ ├── PivotedAnnotationFeature.java │ │ │ ├── PivotedFeature.java │ │ │ ├── PivotedFeatureFactory.java │ │ │ ├── PivotedLexicalSourceGivenTargetFeature.java │ │ │ ├── PivotedLexicalTargetGivenSourceFeature.java │ │ │ ├── PivotedLhsGivenSourcePhraseFeature.java │ │ │ ├── PivotedLhsGivenTargetPhraseFeature.java │ │ │ ├── PivotedNegLogProbFeature.java │ │ │ ├── PivotedRarityPenaltyFeature.java │ │ │ ├── PivotedSourcePhraseGivenLHSFeature.java │ │ │ ├── PivotedSourcePhraseGivenTargetAndLHSFeature.java │ │ │ ├── PivotedSourcePhraseGivenTargetFeature.java │ │ │ ├── PivotedTargetPhraseGivenLHSFeature.java │ │ │ ├── PivotedTargetPhraseGivenSourceAndLHSFeature.java │ │ │ └── PivotedTargetPhraseGivenSourceFeature.java │ ├── jobs │ │ ├── DefaultValues.java │ │ ├── DistributionalContextExtractionJob.java │ │ ├── DistributionalContextSortingJob.java │ │ ├── ExtractionJob.java │ │ ├── FeatureCollectionJob.java │ │ ├── JobState.java │ │ ├── OutputJob.java │ │ ├── ParaphraseAggregationJob.java │ │ ├── ParaphrasePivotingJob.java │ │ ├── Scheduler.java │ │ ├── SchedulerException.java │ │ ├── SourceWordGivenTargetWordProbabilityJob.java │ │ ├── TargetWordGivenSourceWordProbabilityJob.java │ │ ├── ThraxJob.java │ │ ├── VocabularyJob.java │ │ └── WordLexprobJob.java │ ├── output │ │ └── OutputReducer.java │ ├── paraphrasing │ │ ├── AggregationCombiner.java │ │ ├── AggregationMapper.java │ │ ├── AggregationReducer.java │ │ ├── FeatureCollectionReducer.java │ │ ├── PivotingMapper.java │ │ └── PivotingReducer.java │ └── tools │ │ ├── ExtractionTool.java │ │ ├── FeatureTool.java │ │ ├── OutputTool.java │ │ ├── SourceWordGivenTargetWordProbabilityTool.java │ │ └── TargetWordGivenSourceWordProbabilityTool.java │ ├── lexprob │ ├── HashMapLexprobTable.java │ ├── LexicalProbabilityTable.java │ ├── LexprobTest.java │ ├── SequenceFileLexprobTable.java │ ├── TableEntry.java │ └── TrieLexprobTable.java │ ├── syntax │ ├── LatticeArray.java │ ├── ParseLattice.java │ └── ParseTree.java │ ├── tools │ ├── ExtractPropbankRules.java │ ├── JudgeParaphrases.java │ ├── ParaphraseCoverage.java │ ├── ParaphraseIntersect.java │ ├── ParaphraseOverlap.java │ ├── ParaphraseScore.java │ ├── ParaphraseWordNet.java │ ├── SequenceToGrammar.java │ ├── SequenceToSignatures.java │ └── SplitAndFilter.java │ └── util │ ├── BackwardsCompatibility.java │ ├── ConfFileParser.java │ ├── CreateGlueGrammar.java │ ├── DefaultConfigFileLoader.java │ ├── ExternalizableToUtf8.java │ ├── FormatUtils.java │ ├── GrammarComparison.java │ ├── HdfsUtils.java │ ├── Intersect.java │ ├── MalformedInput.java │ ├── MalformedInput.properties │ ├── MurmurHash.java │ ├── NegLogMath.java │ ├── SequenceFileCreator.java │ ├── TestSetFilter.java │ ├── Vocabulary.java │ ├── amazon │ └── AmazonConfigFileLoader.java │ ├── exceptions │ ├── ConfigurationException.java │ ├── EmptyAlignmentException.java │ ├── EmptySentenceException.java │ ├── 
InconsistentAlignmentException.java │ ├── MalformedInputException.java │ ├── MalformedParseException.java │ └── NotEnoughFieldsException.java │ └── io │ ├── InputUtilities.java │ ├── LineReader.java │ └── Reader.java ├── test └── edu │ └── jhu │ └── thrax │ ├── datatypes │ └── ArrayAlignmentTest.java │ ├── extraction │ └── SAMTLabelerTest.java │ ├── hadoop │ └── features │ │ └── mapred │ │ └── coc │ │ └── CountOfCountsEstimatorTest.java │ ├── syntax │ └── ParseTreeTest.java │ └── util │ └── io │ └── InputUtilitiesTest.java └── testng.xml /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | test-output/ 3 | doc/ 4 | AwsCredentials.properties 5 | 6 | .DS_Store -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | Thrax 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /AwsCredentials.properties: -------------------------------------------------------------------------------- 1 | accessKey= 2 | secretKey= 3 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-13 the Thrax team 2 | Jonny Weese 3 | Juri Ganitkevitch 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Thrax uses Apache hadoop (an open-source implementation of MapReduce) to 2 | efficiently extract a synchronous context-free grammar translation model 3 | for use in modern machine translation systems. 4 | 5 | Thrax currently has support for both Hiero-style grammars (with a single 6 | non-terminal symbol) and SAMT-style grammars (where non-terminal symbols are 7 | calculated by projecting onto the span from a target-side parse tree). 
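For orientation, a rule in a Hiero-style grammar uses the single nonterminal [X] and, in Joshua's grammar format, looks roughly like the following (an illustrative, made-up rule; the feature fields at the end depend on which features are selected in the configuration):

    [X] ||| [X,1] maison bleue ||| blue [X,1] house ||| <feature scores>

In a SAMT-style grammar the [X] labels are replaced by labels projected from the target-side parse, e.g. [NP], or composite labels such as [NP/NN] or [NP+VP] when no single constituent covers the span.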
8 | 9 | COMPILING: 10 | 11 | First, you need to set two environment variables: 12 | $HADOOP should point to the directory where Hadoop is installed. 13 | $AWS_SDK should point to the directory where the Amazon Web Services SDK 14 | is installed. 15 | 16 | To compile, type 17 | 18 | ant 19 | 20 | This will compile all classes and package them into a jar for use on a 21 | Hadoop cluster. 22 | 23 | At the end of the compilation, ant should report that the build was successful. 24 | 25 | RUNNING THRAX: 26 | Thrax can be invoked with 27 | 28 | hadoop jar $THRAX/bin/thrax.jar 29 | 30 | Some example configuration files have been included with this distribution: 31 | 32 | example/hiero.conf 33 | example/samt.conf 34 | 35 | COPYRIGHT AND LICENSE: 36 | Copyright (c) 2010-13 by the Thrax team: 37 | Jonny Weese 38 | Juri Ganitkevitch 39 | 40 | See LICENSE.txt (included with this distribution) for the complete terms. 41 | -------------------------------------------------------------------------------- /build.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /example/counts/thrax-phrase.conf: -------------------------------------------------------------------------------- 1 | # this is an example Thrax configuration file 2 | # <- this symbol indicates a comment 3 | # each line should be a key-value pair separated by whitespace 4 | 5 | ### 6 | ### GRAMMAR OPTIONS 7 | ### 8 | 9 | grammar hiero # or samt 10 | reverse false 11 | source-is-parsed false 12 | target-is-parsed false 13 | # default-nt X # X is the default anyway 14 | 15 | min-rule-count 1 16 | 17 | # the number of reducers 18 | reducers 16 19 | 20 | # Maximum length of initial phrase pairs. These are set to be shorter than 21 | # used by Hiero. 
22 | initial-phrase-length 5 23 | lex-source-words 5 24 | lex-target-words 5 25 | 26 | # maximum number of NTs in a rule 27 | arity 0 28 | 29 | # minimum number of aligned terminals in a rule 30 | lexicality 1 31 | 32 | # allow adjacent nonterminals on source side 33 | adjacent-nts false 34 | 35 | # allow unaligned words at boundaries of phrases 36 | loose true 37 | 38 | allow-abstract-rules false 39 | allow-nonlexical-x false 40 | allow-full-sentence-rules false 41 | 42 | nonlex-source-length 5 43 | nonlex-target-length 5 44 | nonlex-source-words 5 45 | nonlex-target-words 5 46 | 47 | allow-double-plus false 48 | 49 | rule-span-limit 12 50 | 51 | phrase-penalty 2.718 52 | 53 | # a whitespace seperated list of features 54 | # in this example, the features are phrase translation probability, 55 | # lexical probability, and phrase penalty 56 | # features phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count 57 | features e_given_f_phrase f_given_e_phrase e_given_f_lex f_given_e_lex rarity phrase-penalty alignment count 58 | 59 | # the only option and default later we will want to add formats for other decoders such as moses and 60 | # cdec, if they use other formats 61 | output-format joshua 62 | 63 | # label feature scores? each score will be output as name=score 64 | label-feature-scores false 65 | 66 | amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero 67 | amazon-jar s3://edu.jhu.cs.jonny/thrax.jar 68 | amazon-num-instances 15 69 | 70 | max-split-size 8388608 71 | 72 | # the format should be: 73 | # foreign sentence ||| english sentence ||| alignment 74 | # where the english is either parsed or not depending on whether you want 75 | # SAMT or you want Hiero. 76 | #input-file s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en 77 | input-file pipeline-es-en-phrase-_export_projects_mpost_language-packs_es-en_1.3/input-file 78 | -------------------------------------------------------------------------------- /example/europarl.unified.1: -------------------------------------------------------------------------------- 1 | declaro reanudado el período de sesiones del parlamento europeo , interrumpido el viernes 17 de diciembre pasado , y reitero a sus señorías mi deseo de que hayan tenido unas buenas vacaciones . ||| i declare resumed the session of the european parliament adjourned on friday 17 december 1999 , and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period . ||| 0-0 0-1 1-1 2-1 3-1 0-2 0-3 5-4 4-5 6-5 8-6 8-7 7-8 10-9 12-10 11-11 12-11 13-12 14-13 15-13 16-13 16-14 17-15 18-16 19-17 19-18 19-19 19-20 19-21 20-22 21-24 22-24 25-29 24-31 26-32 27-33 28-34 30-35 31-36 29-37 30-37 31-37 31-38 32-39 2 | -------------------------------------------------------------------------------- /example/nist09.unified.1: -------------------------------------------------------------------------------- 1 | اس ملک کا مغربی صحرائے راجھستان بھی مسلسل اپنے پانچ سال سے سخت خشک سالی کی لپیٹ میں ہے . ||| (TOP (S (NP (NP (NP (DT The) (NN country) (POS 's)) (JJ western) (NN desert) (NN state)) (PP (IN of) (NP (NNP Rajasthan)))) (VP (VBZ is) (ADVP (RB also)) (VP (VBG bracing) (PP (IN for) (NP (NP (PRP$ its) (JJ fifth) (JJ straight) (NN year)) (PP (IN of) (NP (NN drought))))))) (. 
.))) ||| 0-0 15-16 10-15 11-16 13-17 14-17 8-12 18-8 4-10 5-10 19-18 6-9 9-13 1-1 2-2 3-3 2 | -------------------------------------------------------------------------------- /lib/aws-java-sdk-1.1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/aws-java-sdk-1.1.3.jar -------------------------------------------------------------------------------- /lib/commons-lang3-3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/commons-lang3-3.1.jar -------------------------------------------------------------------------------- /lib/hadoop-common-2.5.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/hadoop-common-2.5.2.jar -------------------------------------------------------------------------------- /lib/hadoop-mapreduce-client-core-2.5.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/hadoop-mapreduce-client-core-2.5.2.jar -------------------------------------------------------------------------------- /lib/jerboa.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/jerboa.jar -------------------------------------------------------------------------------- /lib/testng-5.8-jdk15.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/testng-5.8-jdk15.jar -------------------------------------------------------------------------------- /scripts/berant_to_reference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, sys, codecs 4 | 5 | def main(): 6 | # 7 | # 8 | for line in sys.stdin: 9 | (source, target) = line.lstrip().rstrip().split("\t") 10 | (s_phr, s1, s2) = source[1:-1].split("::") 11 | (t_phr, t1, t2) = target[1:-1].split("::") 12 | if (s1[-2:] == t1[-2:]): 13 | t1 = "[1]" 14 | t2 = "[2]" 15 | else: 16 | t1 = "[2]" 17 | t2 = "[1]" 18 | s1 = "[1]" 19 | s2 = "[2]" 20 | print s1 + " " + s_phr + " " + s2 + " ||| " + t1 + " " + t_phr + " " + t2 21 | 22 | 23 | if __name__ == "__main__": 24 | main() 25 | 26 | 27 | -------------------------------------------------------------------------------- /scripts/create_glue_grammar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # this script just wraps a java call 3 | 4 | if [[ -z "$THRAX" ]] 5 | then 6 | THRAX="`basename $0`/.." 7 | fi 8 | 9 | java -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $1 10 | 11 | -------------------------------------------------------------------------------- /scripts/filter_rules.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if (($# < 1)) 4 | then 5 | cat << END_USAGE 6 | usage: filter_rules.sh [-v|-p|-f] [test set ...] 
7 | -v verbose mode 8 | -p parallel compatibility: print blank lines, don't buffer output 9 | -f fast mode: not as aggressive 10 | END_USAGE 11 | exit 1 12 | fi 13 | 14 | java -Dfile.encoding=utf8 -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.TestSetFilter $* 15 | 16 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/datatypes/AlignedSentencePair.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.datatypes; 2 | 3 | import java.util.Arrays; 4 | 5 | public class AlignedSentencePair { 6 | public final int[] source; 7 | public final int[] target; 8 | public final Alignment alignment; 9 | 10 | public AlignedSentencePair(int[] ss, int[] ts, Alignment a) { 11 | source = ss; 12 | target = ts; 13 | alignment = a; 14 | } 15 | 16 | public boolean equals(Object o) { 17 | if (o == this) return true; 18 | if (!(o instanceof AlignedSentencePair)) return false; 19 | AlignedSentencePair other = (AlignedSentencePair) o; 20 | return Arrays.equals(source, other.source) && Arrays.equals(target, other.target) 21 | && alignment.equals(other.alignment); 22 | } 23 | 24 | public int hashCode() { 25 | int result = 137; 26 | result = result * 67 + Arrays.hashCode(source); 27 | result = result * 67 + Arrays.hashCode(target); 28 | result = result * 67 + alignment.hashCode(); 29 | return result; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/datatypes/Alignment.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.datatypes; 2 | 3 | import java.util.Iterator; 4 | 5 | /** 6 | * This interface represents a word-level alignment of a sentence pair. 7 | */ 8 | public interface Alignment { 9 | 10 | public boolean sourceIndexIsAligned(int i); 11 | 12 | public boolean targetIndexIsAligned(int i); 13 | 14 | public int numTargetWordsAlignedTo(int i); 15 | 16 | public int numSourceWordsAlignedTo(int i); 17 | 18 | public Iterator targetIndicesAlignedTo(int i); 19 | 20 | public Iterator sourceIndicesAlignedTo(int i); 21 | 22 | public boolean consistentWith(int sourceLength, int targetLength); 23 | } 24 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/datatypes/IntPair.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.datatypes; 2 | 3 | import edu.jhu.thrax.util.FormatUtils; 4 | 5 | /** 6 | * A class that represents a pair of integers. 7 | */ 8 | public class IntPair implements Comparable { 9 | 10 | /** 11 | * The first integer of the pair ("car"). 12 | */ 13 | public final int fst; 14 | 15 | /** 16 | * The second integer of the pair ("cdr"). 17 | */ 18 | public final int snd; 19 | 20 | /** 21 | * Constructor that sets the two ints of the pair. 22 | * 23 | * @param a the first int of the pair 24 | * @param b the second int of the pair 25 | */ 26 | public IntPair(int a, int b) 27 | { 28 | fst = a; 29 | snd = b; 30 | } 31 | 32 | /** 33 | * Create a new IntPair that is the reverse of this pair; that is, puts 34 | * the second int first and the first int second. 35 | */ 36 | public IntPair reverse() 37 | { 38 | return new IntPair(snd, fst); 39 | } 40 | 41 | /** 42 | * Builds a pair from the type of String that you would see in Berkeley 43 | * aligner output. For example, the String "3-4" would yield the pair 44 | * (3,4). 
45 | * 46 | * @param s a string in Berkeley aligner format 47 | * @return a new IntPair representing that string 48 | */ 49 | public static IntPair fromHyphenatedString(String s) 50 | { 51 | String [] nums = FormatUtils.P_DASH.split(s); 52 | if (nums.length != 2) { 53 | return null; 54 | } 55 | return new IntPair(Integer.parseInt(nums[0]), Integer.parseInt(nums[1])); 56 | } 57 | 58 | public String toString() 59 | { 60 | return String.format("(%d,%d)", fst, snd); 61 | } 62 | 63 | public boolean equals(Object o) 64 | { 65 | if (o instanceof IntPair) { 66 | IntPair ip = (IntPair) o; 67 | return this.fst == ip.fst && this.snd == ip.snd; 68 | } 69 | return false; 70 | } 71 | 72 | public int compareTo(IntPair ip) 73 | { 74 | if (this.fst == ip.fst) { 75 | return this.snd - ip.snd; 76 | } 77 | return this.fst - ip.fst; 78 | } 79 | 80 | public int hashCode() 81 | { 82 | return fst * 37 + snd; 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/distributional/ContextPhrase.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.distributional; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.MapWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | public class ContextPhrase { 9 | 10 | private final Text phrase; 11 | 12 | private MapWritable features; 13 | 14 | public ContextPhrase(String phrase) { 15 | this.phrase = new Text(phrase); 16 | this.features = new MapWritable(); 17 | } 18 | 19 | public void addFeature(String feature_name) { 20 | addFeature(feature_name, 1); 21 | } 22 | 23 | public void addFeature(String feature_name, int feature_value) { 24 | Text feature_text = new Text(feature_name); 25 | Writable current_value = features.get(feature_text); 26 | if (current_value != null) 27 | features.put(feature_text, new IntWritable(((IntWritable) current_value).get() 28 | + feature_value)); 29 | else 30 | features.put(feature_text, new IntWritable(feature_value)); 31 | } 32 | 33 | public Text getPhrase() { 34 | return phrase; 35 | } 36 | 37 | public MapWritable getFeatures() { 38 | return features; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/distributional/FeatureClass.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.distributional; 2 | 3 | import edu.jhu.thrax.distributional.FeatureTypes.Label; 4 | import edu.jhu.thrax.distributional.FeatureTypes.Type; 5 | 6 | public class FeatureClass { 7 | public final Type type; 8 | public final Label label; 9 | public final int max_context; 10 | public final int max_gram; 11 | 12 | public FeatureClass(Type type, Label label) { 13 | this(type, label, -1, -1); 14 | } 15 | 16 | public FeatureClass(Type type, Label label, int max_context, int max_gram) { 17 | this.type = type; 18 | this.label = label; 19 | this.max_context = max_context; 20 | this.max_gram = max_gram; 21 | } 22 | } -------------------------------------------------------------------------------- /src/edu/jhu/thrax/distributional/FeatureEncoder.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.distributional; 2 | 3 | import edu.jhu.thrax.distributional.FeatureTypes.Directionality; 4 | import edu.jhu.thrax.distributional.FeatureTypes.Flavor; 5 | import edu.jhu.thrax.distributional.FeatureTypes.Label; 
6 | import edu.jhu.thrax.distributional.FeatureTypes.Type; 7 | 8 | public class FeatureEncoder { 9 | 10 | public static long encode(Type type, Label label, Flavor flavor, Directionality directionality) { 11 | return 0; 12 | } 13 | 14 | public static String type(long coded) { 15 | int feature_code = (int) (coded >> 32); 16 | 17 | return new Integer(feature_code).toString(); 18 | } 19 | 20 | public static int feature(long coded) { 21 | return (int) (coded & 0x00000000FFFFFFFF); 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/distributional/FeatureSet.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.distributional; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import edu.jhu.thrax.distributional.FeatureTypes.Label; 7 | import edu.jhu.thrax.distributional.FeatureTypes.Type; 8 | import edu.jhu.thrax.util.FormatUtils; 9 | 10 | 11 | public class FeatureSet { 12 | 13 | private Set features; 14 | 15 | private boolean active[][]; 16 | 17 | public FeatureSet() { 18 | features = new HashSet(); 19 | active = new boolean[Type.values().length][Label.values().length]; 20 | } 21 | 22 | public void addFeatureClass(String entry) { 23 | String[] fields = FormatUtils.P_DASH.split(entry); 24 | for (String f : fields) { 25 | System.err.println(f); 26 | } 27 | } 28 | 29 | public void addFeatureSet(FeatureSet set) { 30 | for (FeatureClass fc : set.features) 31 | this.features.add(fc); 32 | 33 | for (int i = 0; i < active.length; ++i) 34 | for (int j = 0; j < active[i].length; ++j) 35 | active[i][j] = active[i][j] || set.active[i][j]; 36 | } 37 | 38 | public boolean active(Type type, Label label) { 39 | return active[type.code][label.code]; 40 | } 41 | 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/distributional/FeatureTypes.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.distributional; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | public class FeatureTypes { 7 | 8 | public enum Type { 9 | NGRAM(0, "ngram"), SYN(1, "syn"), DEP(2, "dep"), CDEP(3, "cdep"), CPDEP(4, "cpdep"); 10 | 11 | private static Map map; 12 | 13 | static { 14 | map = new HashMap(); 15 | for (Type t : Type.values()) 16 | map.put(t.code, t); 17 | } 18 | 19 | public static Type get(int code) { 20 | return map.get(code); 21 | } 22 | 23 | public final int code; 24 | public final String name; 25 | 26 | Type(int code, String name) { 27 | this.code = code; 28 | this.name = name; 29 | } 30 | } 31 | 32 | public enum Label { 33 | NONE(0, "none"), LEX(1, "lex"), LEM(2, "lem"), POS(3, "pos"), NER(4, "ner"); 34 | 35 | public final int code; 36 | public final String name; 37 | 38 | Label(int code, String name) { 39 | this.code = code; 40 | this.name = name; 41 | } 42 | } 43 | 44 | public enum Directionality { 45 | NONE(0, "none"), LEFT(1, "left"), RIGHT(2, "right"), CENTER(3, "center"); 46 | 47 | public final int code; 48 | public final String name; 49 | 50 | Directionality(int code, String name) { 51 | this.code = code; 52 | this.name = name; 53 | } 54 | } 55 | 56 | public enum Flavor { 57 | NONE(0, "none"), GOV(1, "gov"), DEP(2, "dep"), HEAD(3, "head"); 58 | 59 | public final int code; 60 | public final String name; 61 | 62 | Flavor(int code, String name) { 63 | this.code = code; 64 | this.name = name; 65 | } 66 | } 67 | } 68 | 
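A note on FeatureEncoder above: encode() is currently a stub that always returns 0, while type() and feature() decode the high and low 32 bits of the packed long. The following is a minimal sketch of a packing that would be consistent with those decoders; the bit layout and field widths are assumptions for illustration only, not the project's actual scheme.

    // Hypothetical sketch only: pack the FeatureTypes enum codes into the high 32 bits,
    // leaving the low 32 bits free for a per-type feature id, so that type() (coded >> 32)
    // and feature() (coded & 0xFFFFFFFF) can recover them.
    public static long encode(Type type, Label label, Flavor flavor, Directionality dir) {
      int code = (type.code << 12) | (label.code << 8) | (flavor.code << 4) | dir.code;
      return ((long) code) << 32;
    }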
-------------------------------------------------------------------------------- /src/edu/jhu/thrax/extraction/HieroLabeler.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.extraction; 2 | 3 | public class HieroLabeler implements SpanLabeler 4 | { 5 | private final int label; 6 | 7 | public HieroLabeler(int s) 8 | { 9 | label = s; 10 | } 11 | 12 | public int getLabel(int start, int end) 13 | { 14 | return label; 15 | } 16 | } 17 | 18 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/extraction/LabelCache.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.extraction; 2 | 3 | import java.util.HashMap; 4 | 5 | import edu.jhu.thrax.util.Vocabulary; 6 | 7 | public enum LabelCache { 8 | SLASH("/"), BACKSLASH("\\"), PLUS("+"); 9 | 10 | private HashMap cache = new HashMap(); 11 | private String glue; 12 | 13 | private LabelCache(String g) { 14 | glue = g; 15 | } 16 | 17 | public final int get(int left, int right) { 18 | long key = ((long) left << 32) | ((long) right & 0x00000000FFFFFFFFL); 19 | Integer val = cache.get(key); 20 | if (val == null) { 21 | val = join(left, right, glue); 22 | cache.put(key, val); 23 | } 24 | return val; 25 | } 26 | 27 | private static final int join(int a, int b, String glue) { 28 | String word_a = Vocabulary.word(a); 29 | String word_b = Vocabulary.word(b); 30 | return Vocabulary.id(word_a.substring(0, word_a.length() - 1) + glue 31 | + word_b.substring(1)); 32 | } 33 | } -------------------------------------------------------------------------------- /src/edu/jhu/thrax/extraction/Labeling.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.extraction; 2 | 3 | public enum Labeling { 4 | HIERO, SYNTAX, MANUAL; 5 | } 6 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/extraction/ManualSpanLabeler.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.extraction; 2 | 3 | public class ManualSpanLabeler implements SpanLabeler 4 | { 5 | private final int [] labels; 6 | private final int defaultLabel; 7 | private final int sentenceLength; 8 | 9 | public ManualSpanLabeler(int[] ls, int def) 10 | { 11 | labels = ls; 12 | defaultLabel = def; 13 | sentenceLength = getSentenceLength(labels.length); 14 | } 15 | 16 | public int getLabel(int from, int to) 17 | { 18 | int idx = getLabelIndex(from, to, sentenceLength); 19 | if (idx >= labels.length || idx < 0) { 20 | return defaultLabel; 21 | } 22 | else { 23 | return labels[idx]; 24 | } 25 | } 26 | 27 | private static int getSentenceLength(int numLabels) 28 | { 29 | if (numLabels < 0) 30 | return 0; 31 | // 0 labels => sentence length 0 32 | // 1 label => 1 33 | // 3 labels => 2 34 | // T_n labels => n, where T_n is the nth triangle number 35 | int result = 0; 36 | int triangle = 0; 37 | while (triangle != numLabels) { 38 | result++; 39 | triangle += result; 40 | } 41 | return result; 42 | } 43 | 44 | private static int getLabelIndex(int from, int to, int length) 45 | { 46 | // let the length of the target sentence be L 47 | // the first L labels are for spans (0,1) ... (0,L) 48 | // the next L - 1 are for (1,2) ... 
(1,L) 49 | // and so on 50 | int result = 0; 51 | int offset = length; 52 | for (int i = 0; i < from; i++) { 53 | result += offset; 54 | offset--; 55 | } 56 | int difference = to - from - 1; 57 | result += difference; 58 | return result; 59 | } 60 | } 61 | 62 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/extraction/SpanLabeler.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.extraction; 2 | 3 | public interface SpanLabeler 4 | { 5 | public int getLabel(int start, int end); 6 | } 7 | 8 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/comparators/FieldComparator.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.comparators; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.WritableComparator; 6 | import org.apache.hadoop.io.WritableUtils; 7 | 8 | public class FieldComparator { 9 | private final int fieldNumber; 10 | private final WritableComparator comparator; 11 | 12 | public int offset; 13 | 14 | public FieldComparator(int field, WritableComparator comparator) { 15 | if (field < 0) 16 | throw new IllegalArgumentException("TextFieldComparator: cannot compare field of index " 17 | + field); 18 | fieldNumber = field; 19 | this.comparator = comparator; 20 | } 21 | 22 | public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) throws IOException { 23 | int start1 = getFieldStart(fieldNumber, b1, s1); 24 | int start2 = getFieldStart(fieldNumber, b2, s2); 25 | 26 | int length1 = getFieldLength(b1, start1); 27 | int length2 = getFieldLength(b2, start2); 28 | 29 | // TODO: l1 and l2 may need to be adjusted to reflect offset. 30 | return comparator.compare(b1, start1, length1, b2, start2, length2); 31 | } 32 | 33 | private final int getFieldStart(int field, byte[] bytes, int start) throws IOException { 34 | // if we want the first field, just return current start 35 | if (field == 0) return start; 36 | // otherwise, find out how long this field is ... 37 | int fieldLength = getFieldLength(bytes, start); 38 | // then decrement the field number and find the next start 39 | return getFieldStart(field - 1, bytes, start + fieldLength); 40 | } 41 | 42 | private static final int getFieldLength(byte[] bytes, int start) throws IOException { 43 | // Text is serialized as vInt (the length) plus that many bytes 44 | int vint_size = WritableUtils.decodeVIntSize(bytes[start]); 45 | int field_length = WritableComparator.readVInt(bytes, start); 46 | return vint_size + field_length; 47 | } 48 | 49 | public int fieldEndIndex(byte[] bytes, int start) throws IOException { 50 | int fieldStart = getFieldStart(fieldNumber, bytes, start); 51 | int fieldLength = getFieldLength(bytes, fieldStart); 52 | return fieldStart + fieldLength; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/comparators/PrimitiveArrayMarginalComparator.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.comparators; 2 | 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.io.WritableComparator; 5 | import org.apache.hadoop.io.WritableUtils; 6 | 7 | /** 8 | * Compares two primitive array objects lexicographically, except the zero-length array should be 9 | * sorted before any other. 
10 | */ 11 | public class PrimitiveArrayMarginalComparator extends WritableComparator { 12 | 13 | public static final int[] MARGINAL = new int[0]; 14 | 15 | public PrimitiveArrayMarginalComparator() { 16 | super(Text.class); 17 | } 18 | 19 | public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { 20 | int h1 = WritableUtils.decodeVIntSize(b1[s1]); 21 | int length1 = (h1 == 1 ? b1[s1] : -1); 22 | 23 | int h2 = WritableUtils.decodeVIntSize(b2[s2]); 24 | int length2 = (h2 == 1 ? b2[s2] : -1); 25 | 26 | if (length1 == 0 && length2 == 0) return 0; 27 | if (length1 == 0) return -1; 28 | if (length2 == 0) return 1; 29 | return WritableComparator.compareBytes(b1, s1 + h1, l1 - h1, b2, s2 + h2, l2 - h2); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/comparators/TextMarginalComparator.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.comparators; 2 | 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.io.WritableComparator; 5 | import org.apache.hadoop.io.WritableUtils; 6 | 7 | /** 8 | * Compares two Text objects lexicographically, except the Text "/MARGINAL/" 9 | * should be sorted before any other string. 10 | */ 11 | public class TextMarginalComparator extends WritableComparator 12 | { 13 | private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator(); 14 | 15 | public static final Text MARGINAL = new Text("/MARGINAL/"); 16 | private static final byte [] MARGINAL_BYTES = MARGINAL.getBytes(); 17 | private static final int MARGINAL_LENGTH = MARGINAL.getLength(); 18 | 19 | public TextMarginalComparator() 20 | { 21 | super(Text.class); 22 | } 23 | 24 | public int compare(byte [] b1, int s1, int l1, 25 | byte [] b2, int s2, int l2) 26 | { 27 | // if they're equal, return zero 28 | int cmp = TEXT_COMPARATOR.compare(b1, s1, l1, b2, s2, l2); 29 | if (cmp == 0) { 30 | return 0; 31 | } 32 | // else if the first string is "/MARGINAL/", return -1 33 | int vIntSize = WritableUtils.decodeVIntSize(b1[s1]); 34 | int cmpMarginal = compareBytes(b1, s1 + vIntSize, l1 - vIntSize, 35 | MARGINAL_BYTES, 0, MARGINAL_LENGTH); 36 | if (cmpMarginal == 0) 37 | return -1; 38 | // else if the second is "/MARGINAL/", return 1 39 | vIntSize = WritableUtils.decodeVIntSize(b2[s2]); 40 | cmpMarginal = compareBytes(b2, s2 + vIntSize, l2 - vIntSize, 41 | MARGINAL_BYTES, 0, MARGINAL_LENGTH); 42 | if (cmpMarginal == 0) 43 | return 1; 44 | // else, just return the result of the comparison 45 | return cmp; 46 | } 47 | } 48 | 49 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/datatypes/Annotation.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.datatypes; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | import org.apache.hadoop.io.WritableUtils; 9 | 10 | public class Annotation implements Writable { 11 | 12 | // Source-to-target alignment. 13 | private AlignmentWritable f2e = null; 14 | 15 | // Rule occurrence count. 
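// Serialization note: write() below stores the count negated when an alignment is attached,
// and readFields() uses that sign as the has-alignment flag before restoring the absolute value.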
16 | private int count; 17 | 18 | public Annotation() { 19 | count = 0; 20 | } 21 | 22 | public Annotation(int c) { 23 | count = c; 24 | } 25 | 26 | public Annotation(Annotation a) { 27 | count = a.count; 28 | this.f2e = new AlignmentWritable(a.f2e); 29 | } 30 | 31 | public Annotation(AlignmentWritable f2e) { 32 | count = 1; 33 | this.f2e = f2e; 34 | } 35 | 36 | public void merge(Annotation that) { 37 | this.count += that.count; 38 | } 39 | 40 | @Override 41 | public void readFields(DataInput in) throws IOException { 42 | boolean has_alignments = false; 43 | count = WritableUtils.readVInt(in); 44 | if (count < 0) { 45 | count = -count; 46 | has_alignments = true; 47 | } 48 | if (has_alignments) { 49 | f2e = new AlignmentWritable(); 50 | f2e.readFields(in); 51 | } 52 | } 53 | 54 | @Override 55 | public void write(DataOutput out) throws IOException { 56 | WritableUtils.writeVInt(out, (f2e != null ? -count : count)); 57 | if (f2e != null) f2e.write(out); 58 | } 59 | 60 | public AlignmentWritable e2f() { 61 | return f2e.flip(); 62 | } 63 | 64 | public AlignmentWritable f2e() { 65 | return f2e; 66 | } 67 | 68 | public void setAlignment(AlignmentWritable a) { 69 | f2e = a; 70 | } 71 | 72 | public int count() { 73 | return count; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/datatypes/FeatureMap.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.datatypes; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | import java.util.Set; 9 | 10 | import org.apache.hadoop.io.FloatWritable; 11 | import org.apache.hadoop.io.Writable; 12 | import org.apache.hadoop.io.WritableUtils; 13 | 14 | import edu.jhu.thrax.hadoop.features.annotation.AnnotationPassthroughFeature; 15 | import edu.jhu.thrax.util.Vocabulary; 16 | 17 | public class FeatureMap implements Writable { 18 | 19 | private Map map; 20 | 21 | public FeatureMap() { 22 | map = new HashMap(); 23 | } 24 | 25 | public FeatureMap(FeatureMap fm) { 26 | this(); 27 | for (int key : fm.map.keySet()) 28 | this.map.put(key, fm.map.get(key)); 29 | } 30 | 31 | public Writable get(int key) { 32 | return map.get(key); 33 | } 34 | 35 | public Writable get(String key) { 36 | return map.get(Vocabulary.id(key)); 37 | } 38 | 39 | public void put(int key, Writable val) { 40 | map.put(key, val); 41 | } 42 | 43 | public void put(String key, Writable val) { 44 | map.put(Vocabulary.id(key), val); 45 | } 46 | 47 | public boolean containsKey(int key) { 48 | return map.containsKey(key); 49 | } 50 | 51 | public Set keySet() { 52 | return map.keySet(); 53 | } 54 | 55 | @Override 56 | public void readFields(DataInput in) throws IOException { 57 | map.clear(); 58 | int size = WritableUtils.readVInt(in); 59 | for (int i = 0; i < size; ++i) { 60 | int key = 0; 61 | Writable val = null; 62 | key = WritableUtils.readVInt(in); 63 | if (key == Vocabulary.id(AnnotationPassthroughFeature.NAME)) { 64 | val = new Annotation(); 65 | val.readFields(in); 66 | } else { 67 | val = new FloatWritable(); 68 | val.readFields(in); 69 | } 70 | map.put(key, val); 71 | } 72 | } 73 | 74 | @Override 75 | public void write(DataOutput out) throws IOException { 76 | WritableUtils.writeVInt(out, map.size()); 77 | for (int key : map.keySet()) { 78 | WritableUtils.writeVInt(out, key); 79 | if (key == Vocabulary.id(AnnotationPassthroughFeature.NAME)) { 80 | 
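// The annotation pseudo-feature knows how to serialize itself; every other feature value
// is written as a plain FloatWritable (see the else branch below).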
((Annotation) this.get(key)).write(out); 81 | } else { 82 | ((FloatWritable) this.get(key)).write(out); 83 | } 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/datatypes/FeaturePair.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.datatypes; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | import org.apache.hadoop.io.WritableUtils; 9 | 10 | import edu.jhu.thrax.util.Vocabulary; 11 | 12 | public class FeaturePair implements Writable { 13 | public int key; 14 | public FeatureValue val; 15 | 16 | public FeaturePair() { 17 | key = 0; 18 | val = new FeatureValue(); 19 | } 20 | 21 | public FeaturePair(int k, Writable v) { 22 | key = k; 23 | val = new FeatureValue(v); 24 | } 25 | 26 | public void write(DataOutput out) throws IOException { 27 | WritableUtils.writeVInt(out, key); 28 | val.write(out); 29 | } 30 | 31 | public void readFields(DataInput in) throws IOException { 32 | key = WritableUtils.readVInt(in); 33 | val.readFields(in); 34 | } 35 | 36 | public int hashCode() { 37 | return key * 163 + val.hashCode(); 38 | } 39 | 40 | public boolean equals(Object o) { 41 | if (o instanceof FeaturePair) { 42 | FeaturePair that = (FeaturePair) o; 43 | return key == that.key && val.equals(that.val); 44 | } 45 | return false; 46 | } 47 | 48 | public String toString() { 49 | return Vocabulary.word(key) + "=" + val.toString(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/datatypes/FeatureValue.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.datatypes; 2 | 3 | import org.apache.hadoop.io.FloatWritable; 4 | import org.apache.hadoop.io.GenericWritable; 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.io.Writable; 8 | 9 | public class FeatureValue extends GenericWritable { 10 | 11 | @SuppressWarnings("rawtypes") 12 | private static Class[] TYPES = {FloatWritable.class, IntWritable.class, Text.class, 13 | Annotation.class, AlignmentWritable.class}; 14 | 15 | FeatureValue() {} 16 | 17 | FeatureValue(Writable val) { 18 | this.set(val); 19 | } 20 | 21 | @SuppressWarnings("unchecked") 22 | @Override 23 | protected Class[] getTypes() { 24 | return TYPES; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/datatypes/IntPair.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.datatypes; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.WritableComparable; 8 | import org.apache.hadoop.io.WritableComparator; 9 | 10 | public class IntPair implements WritableComparable { 11 | public int fst; 12 | public int snd; 13 | 14 | public IntPair() { 15 | // do nothing 16 | } 17 | 18 | public IntPair(int car, int cdr) { 19 | fst = car; 20 | snd = cdr; 21 | } 22 | 23 | public void reverse() { 24 | int tmp = fst; 25 | fst = snd; 26 | snd = tmp; 27 | } 28 | 29 | public void write(DataOutput out) throws IOException { 30 | out.writeInt(fst); 31 | out.writeInt(snd); 32 | } 33 | 34 | public void readFields(DataInput in) throws IOException { 35 | fst = 
in.readInt(); 36 | snd = in.readInt(); 37 | } 38 | 39 | public int hashCode() { 40 | return fst * 163 + snd; 41 | } 42 | 43 | public boolean equals(Object o) { 44 | if (o instanceof IntPair) { 45 | IntPair ip = (IntPair) o; 46 | return fst == ip.fst && snd == ip.snd; 47 | } 48 | return false; 49 | } 50 | 51 | public String toString() { 52 | return fst + "\t" + snd; 53 | } 54 | 55 | public int compareTo(IntPair ip) { 56 | int cmp = ip.fst - fst; 57 | if (cmp != 0) { 58 | return cmp; 59 | } 60 | return ip.snd - snd; 61 | } 62 | 63 | public static class Comparator extends WritableComparator { 64 | public Comparator() { 65 | super(IntPair.class); 66 | } 67 | 68 | public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { 69 | int fst1 = readInt(b1, s1); 70 | int fst2 = readInt(b2, s2); 71 | if (fst1 != fst2) { 72 | return fst2 - fst1; 73 | } 74 | int snd1 = readInt(b1, s1 + 4); 75 | int snd2 = readInt(b2, s2 + 4); 76 | return snd2 - snd1; 77 | } 78 | } 79 | 80 | static { 81 | WritableComparator.define(IntPair.class, new Comparator()); 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/distributional/CommonLSH.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.distributional; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | 5 | import edu.jhu.jerboa.sim.SLSH; 6 | 7 | public class CommonLSH { 8 | 9 | public static SLSH getSLSH(Configuration conf) { 10 | SLSH slsh = null; 11 | try { 12 | slsh = new SLSH(); 13 | slsh.initialize(conf.getInt("thrax.lsh-num-bits", 256), 14 | conf.getInt("thrax.lsh-pool-size", 100000), conf.getInt("thrax.lsh-random-seed", 42)); 15 | } catch (Exception e) { 16 | e.printStackTrace(); 17 | System.exit(1); 18 | } 19 | return slsh; 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/distributional/DistributionalContextCombiner.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.distributional; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | import edu.jhu.jerboa.sim.SLSH; 10 | 11 | public class DistributionalContextCombiner 12 | extends Reducer { 13 | 14 | private SLSH slsh; 15 | 16 | public void setup(Context context) throws IOException, InterruptedException { 17 | Configuration conf = context.getConfiguration(); 18 | slsh = CommonLSH.getSLSH(conf); 19 | } 20 | 21 | protected void reduce(Text key, Iterable values, Context context) 22 | throws IOException, InterruptedException { 23 | ContextWritable combined = new ContextWritable(); 24 | for (ContextWritable input : values) { 25 | combined.merge(input, slsh); 26 | } 27 | if (!combined.compacted.get()) combined.compact(slsh); 28 | context.write(key, combined); 29 | return; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/distributional/DistributionalContextMapper.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.distributional; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | 
import org.apache.hadoop.mapreduce.Mapper; 10 | 11 | import edu.jhu.thrax.distributional.ContextPhrase; 12 | import edu.jhu.thrax.distributional.ContextPhraseExtractor; 13 | import edu.jhu.thrax.util.MalformedInput; 14 | import edu.jhu.thrax.util.exceptions.EmptySentenceException; 15 | import edu.jhu.thrax.util.exceptions.MalformedInputException; 16 | import edu.jhu.thrax.util.exceptions.MalformedParseException; 17 | import edu.jhu.thrax.util.exceptions.NotEnoughFieldsException; 18 | 19 | public class DistributionalContextMapper extends Mapper { 20 | 21 | private ContextPhraseExtractor extractor; 22 | 23 | protected void setup(Context context) throws IOException, InterruptedException { 24 | Configuration conf = context.getConfiguration(); 25 | extractor = new ContextPhraseExtractor(conf); 26 | } 27 | 28 | protected void map(LongWritable key, Text value, Context context) throws IOException, 29 | InterruptedException { 30 | if (extractor == null) return; 31 | String line = value.toString(); 32 | try { 33 | List phrases = extractor.extract(line); 34 | for (ContextPhrase cp : phrases) { 35 | context.write(cp.getPhrase(), new ContextWritable(1, cp.getFeatures())); 36 | } 37 | } catch (NotEnoughFieldsException e) { 38 | context.getCounter(MalformedInput.NOT_ENOUGH_FIELDS).increment(1); 39 | } catch (EmptySentenceException e) { 40 | context.getCounter(MalformedInput.EMPTY_SENTENCE).increment(1); 41 | } catch (MalformedParseException e) { 42 | context.getCounter(MalformedInput.MALFORMED_PARSE).increment(1); 43 | } catch (MalformedInputException e) { 44 | context.getCounter(MalformedInput.UNKNOWN).increment(1); 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/distributional/DistributionalContextReducer.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.distributional; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Reducer; 9 | 10 | import edu.jhu.jerboa.sim.SLSH; 11 | import edu.jhu.jerboa.sim.Signature; 12 | 13 | public class DistributionalContextReducer 14 | extends Reducer { 15 | 16 | private int minCount; 17 | private SLSH slsh; 18 | 19 | public void setup(Context context) throws IOException, InterruptedException { 20 | Configuration conf = context.getConfiguration(); 21 | minCount = conf.getInt("thrax.min-phrase-count", 3); 22 | slsh = CommonLSH.getSLSH(conf); 23 | } 24 | 25 | protected void reduce(Text key, Iterable values, Context context) 26 | throws IOException, InterruptedException { 27 | ContextWritable reduced = new ContextWritable(); 28 | for (ContextWritable input : values) { 29 | reduced.merge(input, slsh); 30 | } 31 | if (!reduced.compacted.get()) reduced.compact(slsh); 32 | if (reduced.strength.get() >= minCount) { 33 | Signature reduced_signature = new Signature(); 34 | // TODO: double-check need for deep copy? 
35 | reduced_signature.sums = reduced.sums; 36 | slsh.buildSignature(reduced_signature, false); 37 | context.write(new SignatureWritable(key, reduced_signature, reduced.strength.get()), 38 | NullWritable.get()); 39 | } 40 | return; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/distributional/SignatureWritable.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.distributional; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.io.Writable; 10 | import org.apache.hadoop.io.WritableComparable; 11 | import org.apache.hadoop.mapreduce.Partitioner; 12 | 13 | import edu.jhu.jerboa.sim.Signature; 14 | import edu.jhu.thrax.hadoop.datatypes.PrimitiveUtils; 15 | 16 | public class SignatureWritable implements WritableComparable { 17 | public Text key; 18 | public byte[] bytes; 19 | public IntWritable strength; 20 | 21 | public SignatureWritable() { 22 | this.key = new Text(); 23 | this.bytes = null; 24 | this.strength = new IntWritable(); 25 | } 26 | 27 | public SignatureWritable(Text key, Signature signature, int strength) { 28 | this.key = new Text(key); 29 | // TODO: deep copy? 30 | this.bytes = signature.bytes; 31 | this.strength = new IntWritable(strength); 32 | } 33 | 34 | @Override 35 | public void readFields(DataInput in) throws IOException { 36 | key.readFields(in); 37 | bytes = PrimitiveUtils.readByteArray(in); 38 | strength.readFields(in); 39 | } 40 | 41 | @Override 42 | public void write(DataOutput out) throws IOException { 43 | key.write(out); 44 | PrimitiveUtils.writeByteArray(out, bytes); 45 | strength.write(out); 46 | } 47 | 48 | @Override 49 | public int compareTo(SignatureWritable that) { 50 | int cmp = strength.compareTo(that.strength); 51 | // Flip sign for descending sort order. 
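// Higher-strength (more frequent) signatures sort first; ties fall back to ascending key order.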
52 | if (cmp != 0) return -cmp; 53 | return key.compareTo(that.key); 54 | } 55 | 56 | public static class SignaturePartitioner extends Partitioner { 57 | public int getPartition(SignatureWritable signature, Writable value, int num_partitions) { 58 | int hash = 163; 59 | hash = 37 * hash + signature.key.hashCode(); 60 | hash = 37 * hash + signature.bytes.hashCode(); 61 | hash = 37 * hash + signature.strength.hashCode(); 62 | return (hash & Integer.MAX_VALUE) % num_partitions; 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/extraction/ExtractionCombiner.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.extraction; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable; 8 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 9 | 10 | public class ExtractionCombiner extends Reducer { 11 | 12 | protected void reduce(AlignedRuleWritable key, Iterable values, Context context) 13 | throws IOException, InterruptedException { 14 | context.progress(); 15 | Annotation merged = new Annotation(); 16 | for (Annotation a : values) merged.merge(a); 17 | context.write(key, merged); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/extraction/ExtractionMapper.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.extraction; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable; 11 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 12 | import edu.jhu.thrax.util.Vocabulary; 13 | 14 | public class ExtractionMapper extends Mapper { 15 | private RuleWritableExtractor extractor; 16 | 17 | protected void setup(Context context) throws IOException, InterruptedException { 18 | Configuration conf = context.getConfiguration(); 19 | String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*"; 20 | Vocabulary.initialize(conf, vocabulary_path); 21 | 22 | // TODO: static initializer call for what Annotation actually carries would go here. 
23 |     extractor = RuleWritableExtractorFactory.create(context);
24 |     if (extractor == null) {
25 |       System.err.println("WARNING: could not create rule extractor as configured!");
26 |     }
27 |   }
28 | 
29 |   protected void map(LongWritable key, Text value, Context context) throws IOException,
30 |       InterruptedException {
31 |     if (extractor == null) return;
32 |     for (AnnotatedRule ar : extractor.extract(value))
33 |       context.write(new AlignedRuleWritable(ar.rule, ar.f2e), ar.annotation);
34 |     context.progress();
35 |   }
36 | }
37 | 
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/extraction/ExtractionReducer.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.extraction;
2 | 
3 | import java.io.IOException;
4 | 
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.mapreduce.Reducer;
7 | 
8 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable;
9 | import edu.jhu.thrax.hadoop.datatypes.AlignmentWritable;
10 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
12 | import edu.jhu.thrax.util.Vocabulary;
13 | 
14 | public class ExtractionReducer
15 |     extends Reducer<AlignedRuleWritable, Annotation, RuleWritable, Annotation> {
16 | 
17 |   private RuleWritable currentRule = null;
18 |   private Annotation currentAnnotation = null;
19 |   private AlignmentWritable maxAlignment = null;
20 |   private int alignmentCount;
21 | 
22 |   private int minCount;
23 | 
24 |   protected void setup(Context context) throws IOException, InterruptedException {
25 |     Configuration conf = context.getConfiguration();
26 |     String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*";
27 |     Vocabulary.initialize(conf, vocabulary_path);
28 |     minCount = conf.getInt("thrax.min-rule-count", 1);
29 |   }
30 | 
31 |   protected void reduce(AlignedRuleWritable key, Iterable<Annotation> values, Context context)
32 |       throws IOException, InterruptedException {
33 |     RuleWritable rule = key.getRule();
34 |     AlignmentWritable alignment = key.getAlignment();
35 | 
36 |     Annotation merged = new Annotation();
37 |     for (Annotation a : values)
38 |       merged.merge(a);
39 | 
40 |     if (!rule.equals(currentRule)) {
41 |       if (currentRule != null
42 |           && (currentAnnotation.count() >= minCount || isUnigramRule(currentRule))) {
43 |         currentAnnotation.setAlignment(maxAlignment);
44 |         context.write(currentRule, currentAnnotation);
45 |         context.progress();
46 |       }
47 |       currentRule = new RuleWritable(rule);
48 |       currentAnnotation = new Annotation();
49 |       alignmentCount = 0;
50 |       maxAlignment = null;
51 |     }
52 |     currentAnnotation.merge(merged);
53 |     if (alignmentCount < merged.count()) { // remember the most frequently observed alignment for this rule
54 |       maxAlignment = new AlignmentWritable(alignment);
55 |       alignmentCount = merged.count();
56 |     }
57 |   }
58 | 
59 |   protected void cleanup(Context context) throws IOException, InterruptedException {
60 |     if (currentRule != null) {
61 |       if (currentAnnotation.count() >= minCount || isUnigramRule(currentRule)) {
62 |         currentAnnotation.setAlignment(maxAlignment);
63 |         context.write(currentRule, currentAnnotation);
64 |         context.progress();
65 |       }
66 |     }
67 |   }
68 | 
69 |   private static boolean isUnigramRule(RuleWritable rule) {
70 |     if (rule.source.length == 1) return !Vocabulary.nt(rule.source[0]);
71 |     return rule.target.length == 1 && !Vocabulary.nt(rule.target[0]);
72 |   }
73 | }
74 | 
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/extraction/RuleWritableExtractor.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.extraction;
2 | 
3 | import org.apache.hadoop.io.Text;
4 | 
5 | import edu.jhu.thrax.hadoop.datatypes.AlignmentWritable;
6 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
7 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
8 | 
9 | public interface RuleWritableExtractor {
10 |   public Iterable<AnnotatedRule> extract(Text line);
11 | }
12 | 
13 | 
14 | class AnnotatedRule {
15 |   public RuleWritable rule = null;
16 |   public AlignmentWritable f2e = null;
17 |   public Annotation annotation = null;
18 | 
19 |   public AnnotatedRule(RuleWritable r) {
20 |     rule = r;
21 |   }
22 | 
23 |   public AnnotatedRule(RuleWritable r, AlignmentWritable f2e, Annotation a) {
24 |     this.rule = r;
25 |     this.f2e = f2e;
26 |     this.annotation = a;
27 |   }
28 | }
29 | 
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/extraction/RuleWritableExtractorFactory.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.extraction;
2 | 
3 | import org.apache.hadoop.io.LongWritable;
4 | import org.apache.hadoop.io.Text;
5 | import org.apache.hadoop.mapreduce.Mapper;
6 | 
7 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable;
8 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
9 | 
10 | public class RuleWritableExtractorFactory {
11 |   public static RuleWritableExtractor create(
12 |       Mapper<LongWritable, Text, AlignedRuleWritable, Annotation>.Context context) {
13 |     return new HierarchicalRuleWritableExtractor(context);
14 |   }
15 | }
16 | 
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/AbstractnessFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 | 
3 | import java.util.Map;
4 | 
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 | 
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 | 
11 | public class AbstractnessFeature implements SimpleFeature {
12 | 
13 |   public static final String NAME = "abstract";
14 | 
15 |   private static final IntWritable ZERO = new IntWritable(0);
16 |   private static final IntWritable ONE = new IntWritable(1);
17 | 
18 |   public Writable score(RuleWritable r) {
19 |     for (int word : r.source) {
20 |       if (!Vocabulary.nt(word)) {
21 |         return ZERO;
22 |       }
23 |     }
24 |     for (int word : r.target) {
25 |       if (!Vocabulary.nt(word)) {
26 |         return ZERO;
27 |       }
28 |     }
29 |     return ONE;
30 |   }
31 | 
32 |   public String getName() {
33 |     return NAME;
34 |   }
35 | 
36 |   public void unaryGlueRuleScore(int nt, Map map) {
37 |     map.put(Vocabulary.id(NAME), ONE);
38 |   }
39 | 
40 |   public void binaryGlueRuleScore(int nt, Map map) {
41 |     map.put(Vocabulary.id(NAME), ONE);
42 |   }
43 | }
44 | 
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/AdjacentNonTerminalsFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 | 
3 | import java.util.Map;
4 | 
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 | 
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 | 
11 | public class AdjacentNonTerminalsFeature implements SimpleFeature {
12 | 
13 |   public static final String NAME = "adjacent";
14 | 
15 |   private static
final IntWritable ZERO = new IntWritable(0); 16 | private static final IntWritable ONE = new IntWritable(1); 17 | 18 | public Writable score(RuleWritable r) { 19 | for (int i = 0; i < r.source.length - 1; ++i) 20 | if (Vocabulary.nt(r.source[i])) { 21 | if (Vocabulary.nt(r.source[i + 1])) { 22 | return ONE; 23 | } else { 24 | i += 2; 25 | continue; 26 | } 27 | } 28 | return ZERO; 29 | } 30 | 31 | public String getName() { 32 | return NAME; 33 | } 34 | 35 | public void unaryGlueRuleScore(int nt, Map map) { 36 | map.put(Vocabulary.id(NAME), ZERO); 37 | } 38 | 39 | public void binaryGlueRuleScore(int nt, Map map) { 40 | map.put(Vocabulary.id(NAME), ONE); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/CharacterCompressionRatioFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.FloatWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class CharacterCompressionRatioFeature implements SimpleFeature { 12 | 13 | private static final FloatWritable ZERO = new FloatWritable(0f); 14 | 15 | public static final String NAME = "char_cr"; 16 | 17 | public Writable score(RuleWritable r) { 18 | int src_length = 0; 19 | for (int tok : r.source) { 20 | if (!Vocabulary.nt(tok)) { 21 | src_length += Vocabulary.word(tok).length(); 22 | } 23 | } 24 | src_length += r.source.length - 1; 25 | 26 | int tgt_length = 0; 27 | for (int tok : r.target) { 28 | if (!Vocabulary.nt(tok)) { 29 | tgt_length += Vocabulary.word(tok).length(); 30 | } 31 | } 32 | tgt_length += r.target.length - 1; 33 | 34 | if (src_length == 0 || tgt_length == 0) 35 | return ZERO; 36 | else 37 | return new FloatWritable((float) Math.log((float) tgt_length / src_length)); 38 | } 39 | 40 | public String getName() { 41 | return NAME; 42 | } 43 | 44 | public void unaryGlueRuleScore(int nt, Map map) { 45 | map.put(Vocabulary.id(NAME), ZERO); 46 | } 47 | 48 | public void binaryGlueRuleScore(int nt, Map map) { 49 | map.put(Vocabulary.id(NAME), ZERO); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/CharacterCountDifferenceFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class CharacterCountDifferenceFeature implements SimpleFeature { 12 | 13 | private static final IntWritable ZERO = new IntWritable(0); 14 | 15 | public static final String NAME = "char_count_difference"; 16 | 17 | public Writable score(RuleWritable r) { 18 | int char_difference = 0; 19 | for (int tok : r.source) { 20 | if (!Vocabulary.nt(tok)) { 21 | char_difference -= Vocabulary.word(tok).length(); 22 | } 23 | } 24 | char_difference -= r.source.length - 1; 25 | 26 | for (int tok : r.target) { 27 | if (!Vocabulary.nt(tok)) { 28 | char_difference += Vocabulary.word(tok).length(); 29 | } 30 | } 31 | char_difference += r.target.length - 1; 32 | return new IntWritable(char_difference); 33 | } 34 | 35 | public String getName() { 36 | 
return NAME; 37 | } 38 | 39 | public void unaryGlueRuleScore(int nt, Map map) { 40 | map.put(Vocabulary.id(NAME), ZERO); 41 | } 42 | 43 | public void binaryGlueRuleScore(int nt, Map map) { 44 | map.put(Vocabulary.id(NAME), ZERO); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/ConsumeSourceTerminalsFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class ConsumeSourceTerminalsFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "source_terminals_without_target"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | private static final IntWritable ONE = new IntWritable(1); 17 | 18 | public Writable score(RuleWritable r) { 19 | for (int tok : r.target) { 20 | if (!Vocabulary.nt(tok)) { 21 | return ZERO; 22 | } 23 | } 24 | for (int tok : r.source) { 25 | if (!Vocabulary.nt(tok)) { 26 | return ONE; 27 | } 28 | } 29 | return ZERO; 30 | } 31 | 32 | public String getName() { 33 | return NAME; 34 | } 35 | 36 | public void unaryGlueRuleScore(int nt, Map map) { 37 | map.put(Vocabulary.id(NAME), ZERO); 38 | } 39 | 40 | public void binaryGlueRuleScore(int nt, Map map) { 41 | map.put(Vocabulary.id(NAME), ZERO); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/Feature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.Writable; 6 | 7 | public interface Feature { 8 | 9 | public String getName(); 10 | 11 | public void unaryGlueRuleScore(int nt, Map map); 12 | 13 | public void binaryGlueRuleScore(int nt, Map map); 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/GlueRuleFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class GlueRuleFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "glue_rule"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | private static final IntWritable ONE = new IntWritable(1); 17 | 18 | public Writable score(RuleWritable r) { 19 | return ZERO; 20 | } 21 | 22 | public String getName() { 23 | return NAME; 24 | } 25 | 26 | public void unaryGlueRuleScore(int nt, Map map) { 27 | map.put(Vocabulary.id(NAME), ONE); 28 | } 29 | 30 | public void binaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ONE); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/IdentityFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Arrays; 4 | import 
java.util.Map; 5 | 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.Writable; 8 | 9 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 10 | import edu.jhu.thrax.util.Vocabulary; 11 | 12 | public class IdentityFeature implements SimpleFeature { 13 | 14 | public static final String NAME = "identity"; 15 | 16 | private static final IntWritable ZERO = new IntWritable(0); 17 | private static final IntWritable ONE = new IntWritable(1); 18 | 19 | public Writable score(RuleWritable r) { 20 | if (r.monotone && Arrays.equals(r.target, r.source)) 21 | return ONE; 22 | else 23 | return ZERO; 24 | } 25 | 26 | public String getName() { 27 | return NAME; 28 | } 29 | 30 | public void unaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ZERO); 32 | } 33 | 34 | public void binaryGlueRuleScore(int nt, Map map) { 35 | map.put(Vocabulary.id(NAME), ZERO); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/LexicalityFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class LexicalityFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "lexical"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | private static final IntWritable ONE = new IntWritable(1); 17 | 18 | public Writable score(RuleWritable r) { 19 | for (int tok : r.source) 20 | if (Vocabulary.nt(tok)) return ZERO; 21 | for (int tok : r.target) 22 | if (Vocabulary.nt(tok)) return ZERO; 23 | return ONE; 24 | } 25 | 26 | public String getName() { 27 | return NAME; 28 | } 29 | 30 | public void unaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ZERO); 32 | } 33 | 34 | public void binaryGlueRuleScore(int nt, Map map) { 35 | map.put(Vocabulary.id(NAME), ZERO); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/MonotonicFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class MonotonicFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "monotonic"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | private static final IntWritable ONE = new IntWritable(1); 17 | 18 | public Writable score(RuleWritable r) { 19 | return (r.monotone ? 
ONE : ZERO); 20 | } 21 | 22 | public String getName() { 23 | return NAME; 24 | } 25 | 26 | public void unaryGlueRuleScore(int nt, Map map) { 27 | map.put(Vocabulary.id(NAME), ONE); 28 | } 29 | 30 | public void binaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ONE); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/PhrasePenaltyFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class PhrasePenaltyFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "phrase_penalty"; 14 | 15 | private static final IntWritable ONE = new IntWritable(1); 16 | 17 | public Writable score(RuleWritable r) { 18 | return ONE; 19 | } 20 | 21 | public String getName() { 22 | return NAME; 23 | } 24 | 25 | public void unaryGlueRuleScore(int nt, Map map) { 26 | map.put(Vocabulary.id(NAME), ONE); 27 | } 28 | 29 | public void binaryGlueRuleScore(int nt, Map map) { 30 | map.put(Vocabulary.id(NAME), ONE); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/ProduceTargetTerminalsFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class ProduceTargetTerminalsFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "target_terminals_without_source"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | private static final IntWritable ONE = new IntWritable(1); 17 | 18 | public Writable score(RuleWritable r) { 19 | for (int tok : r.source) 20 | if (!Vocabulary.nt(tok)) return ZERO; 21 | for (int tok : r.target) 22 | if (!Vocabulary.nt(tok)) return ONE; 23 | return ZERO; 24 | } 25 | 26 | public String getName() { 27 | return NAME; 28 | } 29 | 30 | public void unaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ZERO); 32 | } 33 | 34 | public void binaryGlueRuleScore(int nt, Map map) { 35 | map.put(Vocabulary.id(NAME), ZERO); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/SimpleFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import org.apache.hadoop.io.Writable; 4 | 5 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 6 | 7 | public interface SimpleFeature extends Feature { 8 | 9 | public Writable score(RuleWritable r); 10 | 11 | } 12 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/SimpleFeatureFactory.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import edu.jhu.thrax.util.FormatUtils; 7 | 8 | public class 
SimpleFeatureFactory { 9 | 10 | public static SimpleFeature get(String name) { 11 | if (name.equals(AbstractnessFeature.NAME)) 12 | return new AbstractnessFeature(); 13 | else if (name.equals(AdjacentNonTerminalsFeature.NAME)) 14 | return new AdjacentNonTerminalsFeature(); 15 | else if (name.equals(LexicalityFeature.NAME)) 16 | return new LexicalityFeature(); 17 | else if (name.equals(XRuleFeature.NAME)) 18 | return new XRuleFeature(); 19 | else if (name.equals(MonotonicFeature.NAME)) 20 | return new MonotonicFeature(); 21 | else if (name.equals(PhrasePenaltyFeature.NAME)) 22 | return new PhrasePenaltyFeature(); 23 | else if (name.equals(SourceWordCounterFeature.NAME)) 24 | return new SourceWordCounterFeature(); 25 | else if (name.equals(TargetWordCounterFeature.NAME)) 26 | return new TargetWordCounterFeature(); 27 | else if (name.equals(ConsumeSourceTerminalsFeature.NAME)) 28 | return new ConsumeSourceTerminalsFeature(); 29 | else if (name.equals(ProduceTargetTerminalsFeature.NAME)) 30 | return new ProduceTargetTerminalsFeature(); 31 | else if (name.equals(IdentityFeature.NAME)) 32 | return new IdentityFeature(); 33 | else if (name.equals(WordCountDifferenceFeature.NAME)) 34 | return new WordCountDifferenceFeature(); 35 | else if (name.equals(WordLengthDifferenceFeature.NAME)) 36 | return new WordLengthDifferenceFeature(); 37 | else if (name.equals(WordCompressionRatioFeature.NAME)) 38 | return new WordCompressionRatioFeature(); 39 | else if (name.equals(CharacterCountDifferenceFeature.NAME)) 40 | return new CharacterCountDifferenceFeature(); 41 | else if (name.equals(CharacterCompressionRatioFeature.NAME)) 42 | return new CharacterCompressionRatioFeature(); 43 | else if (name.equals(GlueRuleFeature.NAME)) return new GlueRuleFeature(); 44 | 45 | return null; 46 | } 47 | 48 | public static List getAll(String names) { 49 | String[] feature_names = FormatUtils.P_COMMA_OR_SPACE.split(names); 50 | List features = new ArrayList(); 51 | 52 | for (String feature_name : feature_names) { 53 | SimpleFeature feature = get(feature_name); 54 | if (feature != null) features.add(feature); 55 | } 56 | return features; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/SourceWordCounterFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class SourceWordCounterFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "source_word_count"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | 17 | public Writable score(RuleWritable r) { 18 | int words = 0; 19 | for (int word : r.source) 20 | if (!Vocabulary.nt(word)) words++; 21 | return new IntWritable(words); 22 | } 23 | 24 | public String getName() { 25 | return NAME; 26 | } 27 | 28 | public void unaryGlueRuleScore(int nt, Map map) { 29 | map.put(Vocabulary.id(NAME), ZERO); 30 | } 31 | 32 | public void binaryGlueRuleScore(int nt, Map map) { 33 | map.put(Vocabulary.id(NAME), ZERO); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/TargetWordCounterFeature.java: 
-------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class TargetWordCounterFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "target_word_count"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | 17 | public Writable score(RuleWritable r) { 18 | int words = 0; 19 | for (int tok : r.target) 20 | if (!Vocabulary.nt(tok)) words++; 21 | return new IntWritable(words); 22 | } 23 | 24 | public String getName() { 25 | return NAME; 26 | } 27 | 28 | public void unaryGlueRuleScore(int nt, Map map) { 29 | map.put(Vocabulary.id(NAME), ZERO); 30 | } 31 | 32 | public void binaryGlueRuleScore(int nt, Map map) { 33 | map.put(Vocabulary.id(NAME), ZERO); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/WordCompressionRatioFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.FloatWritable; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.Writable; 8 | 9 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 10 | import edu.jhu.thrax.util.Vocabulary; 11 | 12 | public class WordCompressionRatioFeature implements SimpleFeature { 13 | 14 | public static final String NAME = "word_cr"; 15 | 16 | private static final IntWritable ZERO = new IntWritable(0); 17 | 18 | public Writable score(RuleWritable r) { 19 | int src_count = 0; 20 | for (int tok : r.source) 21 | if (!Vocabulary.nt(tok)) src_count++; 22 | int tgt_count = 0; 23 | for (int tok : r.target) 24 | if (!Vocabulary.nt(tok)) tgt_count++; 25 | if (src_count == 0 || tgt_count == 0) { 26 | return ZERO; 27 | } else { 28 | return new FloatWritable((float) Math.log((float) tgt_count / src_count)); 29 | } 30 | } 31 | 32 | public String getName() { 33 | return NAME; 34 | } 35 | 36 | public void unaryGlueRuleScore(int nt, Map map) { 37 | map.put(Vocabulary.id(NAME), ZERO); 38 | } 39 | 40 | public void binaryGlueRuleScore(int nt, Map map) { 41 | map.put(Vocabulary.id(NAME), ZERO); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/WordCountDifferenceFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class WordCountDifferenceFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "word_count_difference"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | 17 | public Writable score(RuleWritable r) { 18 | int word_difference = 0; 19 | for (int tok : r.source) 20 | if (!Vocabulary.nt(tok)) word_difference--; 21 | for (int tok : r.target) 22 | if (!Vocabulary.nt(tok)) word_difference++; 23 | return new IntWritable(word_difference); 24 | } 25 | 26 | public String getName() { 27 | return NAME; 28 | } 29 | 30 
| public void unaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ZERO); 32 | } 33 | 34 | public void binaryGlueRuleScore(int nt, Map map) { 35 | map.put(Vocabulary.id(NAME), ZERO); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/WordLengthDifferenceFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.FloatWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class WordLengthDifferenceFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "word_length_difference"; 14 | 15 | private static final FloatWritable ZERO = new FloatWritable(0); 16 | 17 | public Writable score(RuleWritable r) { 18 | int src_length = 0; 19 | int src_count = 0; 20 | for (int tok : r.source) { 21 | if (!Vocabulary.nt(tok)) { 22 | src_length += Vocabulary.word(tok).length(); 23 | src_count++; 24 | } 25 | } 26 | int tgt_length = 0; 27 | int tgt_count = 0; 28 | for (int tok : r.target) { 29 | if (!Vocabulary.nt(tok)) { 30 | tgt_length += Vocabulary.word(tok).length(); 31 | tgt_count++; 32 | } 33 | } 34 | if (src_count == 0 || tgt_count == 0) { 35 | return ZERO; 36 | } else { 37 | float avg_src_length = (float) src_length / src_count; 38 | float avg_tgt_length = (float) tgt_length / tgt_count; 39 | return new FloatWritable(avg_tgt_length - avg_src_length); 40 | } 41 | } 42 | 43 | public String getName() { 44 | return NAME; 45 | } 46 | 47 | public void unaryGlueRuleScore(int nt, Map map) { 48 | map.put(Vocabulary.id(NAME), ZERO); 49 | } 50 | 51 | public void binaryGlueRuleScore(int nt, Map map) { 52 | map.put(Vocabulary.id(NAME), ZERO); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/XRuleFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class XRuleFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "x_rule"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | private static final IntWritable ONE = new IntWritable(1); 17 | 18 | // TODO: should be default nonterminal and not explicitly X. 19 | private final int PATTERN = Vocabulary.id("[X]"); 20 | 21 | public Writable score(RuleWritable r) { 22 | return (r.lhs == PATTERN ? 
ONE : ZERO); 23 | } 24 | 25 | public String getName() { 26 | return NAME; 27 | } 28 | 29 | public void unaryGlueRuleScore(int nt, Map map) { 30 | map.put(Vocabulary.id(NAME), ZERO); 31 | } 32 | 33 | public void binaryGlueRuleScore(int nt, Map map) { 34 | map.put(Vocabulary.id(NAME), ZERO); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/AlignmentFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Writable; 9 | import org.apache.hadoop.mapreduce.Reducer.Context; 10 | 11 | import edu.jhu.thrax.hadoop.datatypes.AlignmentWritable; 12 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 13 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 14 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 15 | import edu.jhu.thrax.util.Vocabulary; 16 | 17 | @SuppressWarnings("rawtypes") 18 | public class AlignmentFeature implements AnnotationFeature { 19 | 20 | public static final String NAME = "alignment"; 21 | 22 | private static final IntWritable ZERO = new IntWritable(0); 23 | 24 | public String getName() { 25 | return NAME; 26 | } 27 | 28 | public AlignmentWritable score(RuleWritable r, Annotation annotation) { 29 | return annotation.f2e(); 30 | } 31 | 32 | public void unaryGlueRuleScore(int nt, Map map) { 33 | map.put(Vocabulary.id(NAME), ZERO); 34 | } 35 | 36 | public void binaryGlueRuleScore(int nt, Map map) { 37 | map.put(Vocabulary.id(NAME), ZERO); 38 | } 39 | 40 | @Override 41 | public void init(Context context) throws IOException, InterruptedException {} 42 | 43 | @Override 44 | public Set> getPrerequisites() { 45 | return null; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/AnnotationFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.io.IOException; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.Writable; 7 | import org.apache.hadoop.mapreduce.Reducer.Context; 8 | 9 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 10 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 11 | import edu.jhu.thrax.hadoop.features.Feature; 12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 13 | 14 | public interface AnnotationFeature extends Feature { 15 | 16 | @SuppressWarnings("rawtypes") 17 | public void init(Context context) throws IOException, InterruptedException; 18 | 19 | public Writable score(RuleWritable r, Annotation annotation); 20 | 21 | // TODO: move this into its own interface, have AF extend it. 
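  // Prerequisites are the ThraxJob classes that must have completed before this feature can be
  // scored in AnnotationReducer; features that need no upstream job may return null (see
  // CountFeature and LogCountFeature).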
22 | public Set> getPrerequisites(); 23 | } 24 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/AnnotationFeatureFactory.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import edu.jhu.thrax.util.FormatUtils; 7 | 8 | public class AnnotationFeatureFactory { 9 | 10 | public static AnnotationFeature get(String name) { 11 | if (name.equals(UnalignedSourceCounterFeature.NAME)) 12 | return new UnalignedSourceCounterFeature(); 13 | else if (name.equals(UnalignedTargetCounterFeature.NAME)) 14 | return new UnalignedTargetCounterFeature(); 15 | else if (name.equals(RarityPenaltyFeature.NAME)) 16 | return new RarityPenaltyFeature(); 17 | else if (name.equals(CountFeature.NAME)) 18 | return new CountFeature(); 19 | else if (name.equals(LogCountFeature.NAME)) 20 | return new LogCountFeature(); 21 | else if (name.equals(SourceGivenTargetLexicalProbabilityFeature.NAME)) 22 | return new SourceGivenTargetLexicalProbabilityFeature(); 23 | else if (name.equals(TargetGivenSourceLexicalProbabilityFeature.NAME)) 24 | return new TargetGivenSourceLexicalProbabilityFeature(); 25 | else if (name.equals(AlignmentFeature.NAME)) 26 | return new AlignmentFeature(); 27 | 28 | return null; 29 | } 30 | 31 | public static List getAll(String names) { 32 | String[] feature_names = FormatUtils.P_COMMA_OR_SPACE.split(names); 33 | List features = new ArrayList(); 34 | 35 | for (String feature_name : feature_names) { 36 | AnnotationFeature feature = get(feature_name); 37 | if (feature != null) features.add(feature); 38 | } 39 | return features; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/AnnotationFeatureJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 15 | 16 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 17 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair; 18 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 19 | import edu.jhu.thrax.hadoop.jobs.DefaultValues; 20 | import edu.jhu.thrax.hadoop.jobs.ExtractionJob; 21 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 22 | 23 | public class AnnotationFeatureJob implements ThraxJob { 24 | 25 | public AnnotationFeatureJob() {} 26 | 27 | protected static HashSet> prereqs = 28 | new HashSet>(); 29 | 30 | public Set> getPrerequisites() { 31 | prereqs.add(ExtractionJob.class); 32 | return prereqs; 33 | } 34 | 35 | public static void addPrerequisites(Iterable> cs) { 36 | if (cs != null) 37 | for (Class c : cs) 38 | prereqs.add(c); 39 | } 40 | 41 | public static void addPrerequisite(Class c) { 42 | prereqs.add(c); 43 | } 44 | 45 | public String getOutputSuffix() { 46 | return getName(); 47 | } 48 
| 49 | public Job getJob(Configuration conf) throws IOException { 50 | String name = getName(); 51 | Job job = new Job(conf, name); 52 | job.setJarByClass(this.getClass()); 53 | 54 | job.setMapperClass(Mapper.class); 55 | job.setPartitionerClass(RuleWritable.YieldPartitioner.class); 56 | job.setReducerClass(AnnotationReducer.class); 57 | 58 | job.setInputFormatClass(SequenceFileInputFormat.class); 59 | job.setMapOutputKeyClass(RuleWritable.class); 60 | job.setMapOutputValueClass(Annotation.class); 61 | job.setOutputKeyClass(RuleWritable.class); 62 | job.setOutputValueClass(FeaturePair.class); 63 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 64 | 65 | int num_reducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS)); 66 | job.setNumReduceTasks(num_reducers); 67 | 68 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "rules")); 69 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "annotation")); 70 | return job; 71 | } 72 | 73 | @Override 74 | public String getName() { 75 | return "annotation"; 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/AnnotationPassthroughFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | import org.apache.hadoop.mapreduce.Reducer.Context; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 13 | 14 | @SuppressWarnings("rawtypes") 15 | public class AnnotationPassthroughFeature implements AnnotationFeature { 16 | 17 | public static final String NAME = "annotation"; 18 | 19 | public String getName() { 20 | return NAME; 21 | } 22 | 23 | public Annotation score(RuleWritable r, Annotation annotation) { 24 | return annotation; 25 | } 26 | 27 | public void unaryGlueRuleScore(int nt, Map map) { 28 | } 29 | 30 | public void binaryGlueRuleScore(int nt, Map map) { 31 | } 32 | 33 | @Override 34 | public void init(Context context) throws IOException, InterruptedException {} 35 | 36 | @Override 37 | public Set> getPrerequisites() { 38 | return null; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/AnnotationReducer.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.mapreduce.Reducer; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 11 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair; 12 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 13 | import edu.jhu.thrax.util.BackwardsCompatibility; 14 | import edu.jhu.thrax.util.Vocabulary; 15 | 16 | public class AnnotationReducer extends Reducer { 17 | 18 | private List annotationFeatures; 19 | 20 | public AnnotationReducer() {} 21 | 22 | protected void setup(Context context) throws IOException, InterruptedException { 23 | Configuration conf = context.getConfiguration(); 24 | String vocabulary_path = 
conf.getRaw("thrax.work-dir") + "vocabulary/part-*"; 25 | Vocabulary.initialize(conf, vocabulary_path); 26 | 27 | String features = BackwardsCompatibility.equivalent(conf.get("thrax.features", "")); 28 | 29 | // Paraphrasing only needs the annotation to be passed through. 30 | String type = conf.get("thrax.type", "translation"); 31 | if ("paraphrasing".equals(type)) { 32 | annotationFeatures = new ArrayList(); 33 | annotationFeatures.add(new AnnotationPassthroughFeature()); 34 | } else { 35 | annotationFeatures = AnnotationFeatureFactory.getAll(features); 36 | } 37 | 38 | for (AnnotationFeature af : annotationFeatures) 39 | af.init(context); 40 | } 41 | 42 | protected void reduce(RuleWritable key, Iterable values, Context context) 43 | throws IOException, InterruptedException { 44 | for (Annotation annotation : values) { 45 | for (AnnotationFeature f : annotationFeatures) { 46 | context.write(key, new FeaturePair(Vocabulary.id(f.getName()), f.score(key, annotation))); 47 | } 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/CountFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.util.Map; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.Writable; 8 | import org.apache.hadoop.mapreduce.Reducer.Context; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 13 | import edu.jhu.thrax.util.Vocabulary; 14 | 15 | @SuppressWarnings("rawtypes") 16 | public class CountFeature implements AnnotationFeature { 17 | 18 | public static final String NAME = "count"; 19 | 20 | private static final IntWritable ZERO = new IntWritable(0); 21 | 22 | public String getName() { 23 | return NAME; 24 | } 25 | 26 | public void unaryGlueRuleScore(int nt, Map map) { 27 | map.put(Vocabulary.id(NAME), ZERO); 28 | } 29 | 30 | public void binaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ZERO); 32 | } 33 | 34 | @Override 35 | public Writable score(RuleWritable r, Annotation annotation) { 36 | return new IntWritable(annotation.count()); 37 | } 38 | 39 | @Override 40 | public void init(Context context) {} 41 | 42 | @Override 43 | public Set> getPrerequisites() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/LogCountFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.util.Map; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | import org.apache.hadoop.io.Writable; 8 | import org.apache.hadoop.mapreduce.Reducer.Context; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 13 | import edu.jhu.thrax.util.Vocabulary; 14 | 15 | @SuppressWarnings("rawtypes") 16 | public class LogCountFeature implements AnnotationFeature { 17 | 18 | public static final String NAME = "logcount"; 19 | 20 | private static final FloatWritable ZERO = new FloatWritable(0); 21 | 22 | public String getName() { 23 | return NAME; 24 | } 25 | 26 | public void unaryGlueRuleScore(int nt, 
Map map) { 27 | map.put(Vocabulary.id(NAME), ZERO); 28 | } 29 | 30 | public void binaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ZERO); 32 | } 33 | 34 | @Override 35 | public Writable score(RuleWritable r, Annotation annotation) { 36 | return new FloatWritable((float) Math.log(annotation.count())); 37 | } 38 | 39 | @Override 40 | public void init(Context context) {} 41 | 42 | @Override 43 | public Set> getPrerequisites() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/RarityPenaltyFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.util.Map; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | import org.apache.hadoop.io.Writable; 8 | import org.apache.hadoop.mapreduce.Reducer.Context; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 13 | import edu.jhu.thrax.util.Vocabulary; 14 | 15 | @SuppressWarnings("rawtypes") 16 | public class RarityPenaltyFeature implements AnnotationFeature { 17 | 18 | public static final String NAME = "rarity"; 19 | 20 | private static final FloatWritable ZERO = new FloatWritable(0.0f); 21 | 22 | public String getName() { 23 | return NAME; 24 | } 25 | 26 | public void unaryGlueRuleScore(int nt, Map map) { 27 | map.put(Vocabulary.id(NAME), ZERO); 28 | } 29 | 30 | public void binaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ZERO); 32 | } 33 | 34 | @Override 35 | public Writable score(RuleWritable r, Annotation annotation) { 36 | return new FloatWritable((float) Math.exp(1 - annotation.count())); 37 | } 38 | 39 | @Override 40 | public void init(Context context) {} 41 | 42 | @Override 43 | public Set> getPrerequisites() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/UnalignedSourceCounterFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Writable; 9 | import org.apache.hadoop.mapreduce.Reducer.Context; 10 | 11 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 12 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 13 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 14 | import edu.jhu.thrax.util.Vocabulary; 15 | 16 | @SuppressWarnings("rawtypes") 17 | public class UnalignedSourceCounterFeature implements AnnotationFeature { 18 | 19 | public static final String NAME = "unaligned_source"; 20 | 21 | private static final IntWritable ZERO = new IntWritable(0); 22 | 23 | public String getName() { 24 | return NAME; 25 | } 26 | 27 | public IntWritable score(RuleWritable r, Annotation annotation) { 28 | byte[] f2e = annotation.f2e().points; 29 | int[] src = r.source; 30 | 31 | int count = 0; 32 | int i = 0, j = 0; 33 | for (i = 0; i < src.length; ++i) { 34 | if (Vocabulary.nt(src[i])) continue; 35 | if (j >= f2e.length || i != f2e[j]) count++; 36 | while (j < f2e.length && f2e[j] <= i) 37 | j += 2; 38 | } 39 | return new IntWritable(count); 40 | } 41 | 42 | public void 
unaryGlueRuleScore(int nt, Map map) { 43 | map.put(Vocabulary.id(NAME), ZERO); 44 | } 45 | 46 | public void binaryGlueRuleScore(int nt, Map map) { 47 | map.put(Vocabulary.id(NAME), ZERO); 48 | } 49 | 50 | @Override 51 | public void init(Context context) throws IOException, InterruptedException {} 52 | 53 | @Override 54 | public Set> getPrerequisites() { 55 | return null; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/UnalignedTargetCounterFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Writable; 9 | import org.apache.hadoop.mapreduce.Reducer.Context; 10 | 11 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 12 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 13 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 14 | import edu.jhu.thrax.util.Vocabulary; 15 | 16 | @SuppressWarnings("rawtypes") 17 | public class UnalignedTargetCounterFeature implements AnnotationFeature { 18 | 19 | public static final String NAME = "unaligned_target"; 20 | 21 | private static final IntWritable ZERO = new IntWritable(0); 22 | 23 | public String getName() { 24 | return NAME; 25 | } 26 | 27 | public IntWritable score(RuleWritable r, Annotation annotation) { 28 | byte[] e2f = annotation.e2f().points; 29 | int[] tgt = r.target; 30 | 31 | int count = 0; 32 | int i = 0, j = 0; 33 | for (i = 0; i < tgt.length; ++i) { 34 | if (Vocabulary.nt(tgt[i])) continue; 35 | if (j >= e2f.length || i != e2f[j]) count++; 36 | while (j < e2f.length && e2f[j] <= i) 37 | j += 2; 38 | } 39 | return new IntWritable(count); 40 | } 41 | 42 | public void unaryGlueRuleScore(int nt, Map map) { 43 | map.put(Vocabulary.id(NAME), ZERO); 44 | } 45 | 46 | public void binaryGlueRuleScore(int nt, Map map) { 47 | map.put(Vocabulary.id(NAME), ZERO); 48 | } 49 | 50 | @Override 51 | public void init(Context context) throws IOException, InterruptedException {} 52 | 53 | @Override 54 | public Set> getPrerequisites() { 55 | return null; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/mapred/MapReduceFeatureFactory.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.mapred; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import edu.jhu.thrax.hadoop.features.annotation.CountFeature; 7 | import edu.jhu.thrax.util.FormatUtils; 8 | 9 | public class MapReduceFeatureFactory { 10 | 11 | public static MapReduceFeature get(String name) { 12 | if (name.equals(SourcePhraseGivenTargetFeature.NAME)) 13 | return new SourcePhraseGivenTargetFeature(); 14 | else if (name.equals(TargetPhraseGivenSourceFeature.NAME)) 15 | return new TargetPhraseGivenSourceFeature(); 16 | else if (name.equals(GoodTuringSmoothedTargetPhraseGivenSourceFeature.NAME)) 17 | return new GoodTuringSmoothedTargetPhraseGivenSourceFeature(); 18 | else if (name.equals(GoodTuringSmoothedSourcePhraseGivenTargetFeature.NAME)) 19 | return new GoodTuringSmoothedSourcePhraseGivenTargetFeature(); 20 | else if (name.equals(SourcePhraseGivenLHSFeature.NAME)) 21 | return new SourcePhraseGivenLHSFeature(); 22 | else if (name.equals(LhsGivenSourcePhraseFeature.NAME)) 
23 | return new LhsGivenSourcePhraseFeature(); 24 | else if (name.equals(SourcePhraseGivenTargetandLHSFeature.NAME)) 25 | return new SourcePhraseGivenTargetandLHSFeature(); 26 | else if (name.equals(TargetPhraseGivenSourceandLHSFeature.NAME)) 27 | return new TargetPhraseGivenSourceandLHSFeature(); 28 | else if (name.equals(TargetPhraseGivenLHSFeature.NAME)) 29 | return new TargetPhraseGivenLHSFeature(); 30 | else if (name.equals(LhsGivenTargetPhraseFeature.NAME)) 31 | return new LhsGivenTargetPhraseFeature(); 32 | else if (name.equals(SourceCountFeature.NAME)) 33 | return new SourceCountFeature(); 34 | else if (name.equals(TargetCountFeature.NAME)) 35 | return new TargetCountFeature(); 36 | 37 | return null; 38 | } 39 | 40 | public static List getAll(String names) { 41 | String[] feature_names = FormatUtils.P_COMMA_OR_SPACE.split(names); 42 | List features = new ArrayList(); 43 | 44 | for (String feature_name : feature_names) { 45 | MapReduceFeature feature = get(feature_name); 46 | if (feature != null) features.add(feature); 47 | } 48 | return features; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/mapred/coc/GoodTuringSmoother.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | package edu.jhu.thrax.hadoop.features.mapred.coc; 3 | 4 | public class GoodTuringSmoother { 5 | private CountOfCountsEstimator estimator; 6 | 7 | public GoodTuringSmoother(CountOfCountsEstimator estimator) { 8 | this.estimator = estimator; 9 | } 10 | 11 | public double smoothedCount(int count) { 12 | double turingFraction = estimator.getEstimatedCountOfCount(count + 1) / estimator.getEstimatedCountOfCount(count); 13 | return (count + 1) * turingFraction; 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/NonAggregatingPivotedFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.FloatWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public abstract class NonAggregatingPivotedFeature implements PivotedFeature { 12 | 13 | private static final FloatWritable ZERO = new FloatWritable(0.0f); 14 | 15 | private float value; 16 | 17 | public void initializeAggregation() { 18 | value = Float.MAX_VALUE; 19 | } 20 | 21 | public void aggregate(FeatureMap features) { 22 | FloatWritable val = (FloatWritable) features.get(getName()); 23 | if (value == Float.MAX_VALUE) { 24 | value = val.get(); 25 | } else { 26 | if (value != val.get()) { 27 | throw new RuntimeException("Diverging values in pseudo-aggregation: " + value + " versus " 28 | + val.get()); 29 | } 30 | } 31 | } 32 | 33 | public FloatWritable finalizeAggregation() { 34 | return new FloatWritable(value); 35 | } 36 | 37 | public void unaryGlueRuleScore(int nt, Map map) { 38 | map.put(Vocabulary.id(getName()), ZERO); 39 | } 40 | 41 | public void binaryGlueRuleScore(int nt, Map map) { 42 | map.put(Vocabulary.id(getName()), ZERO); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedAnnotationFeature.java: 
-------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | 9 | import edu.jhu.thrax.hadoop.datatypes.AlignmentWritable; 10 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 11 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 12 | import edu.jhu.thrax.hadoop.features.annotation.AlignmentFeature; 13 | import edu.jhu.thrax.hadoop.features.annotation.AnnotationPassthroughFeature; 14 | 15 | public class PivotedAnnotationFeature implements PivotedFeature { 16 | 17 | public static final String NAME = "annotation"; 18 | 19 | private Annotation aggregated = null; 20 | 21 | public String getName() { 22 | return NAME; 23 | } 24 | 25 | public Set getPrerequisites() { 26 | Set prereqs = new HashSet(); 27 | prereqs.add(AlignmentFeature.NAME); 28 | return prereqs; 29 | } 30 | 31 | public Annotation pivot(FeatureMap src, FeatureMap tgt) { 32 | AlignmentWritable src_f2e = ((AlignmentWritable) src.get(AlignmentFeature.NAME)); 33 | AlignmentWritable tgt_f2e = ((AlignmentWritable) tgt.get(AlignmentFeature.NAME)); 34 | 35 | return new Annotation(src_f2e.join(tgt_f2e)); 36 | } 37 | 38 | public void unaryGlueRuleScore(int nt, Map map) {} 39 | 40 | public void binaryGlueRuleScore(int nt, Map map) {} 41 | 42 | public void initializeAggregation() { 43 | aggregated = null; 44 | } 45 | 46 | public void aggregate(FeatureMap a) { 47 | Annotation annotation = (Annotation) a.get(AnnotationPassthroughFeature.NAME); 48 | if (aggregated == null) { 49 | aggregated = new Annotation(annotation); 50 | } else { 51 | aggregated.setAlignment(aggregated.f2e().intersect(annotation.f2e())); 52 | aggregated.merge(annotation); 53 | } 54 | } 55 | 56 | public Annotation finalizeAggregation() { 57 | return aggregated; 58 | } 59 | 60 | @Override 61 | public Set getLowerBoundLabels() { 62 | return null; 63 | } 64 | 65 | @Override 66 | public Set getUpperBoundLabels() { 67 | return null; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.Set; 4 | 5 | import org.apache.hadoop.io.Writable; 6 | 7 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 8 | import edu.jhu.thrax.hadoop.features.Feature; 9 | 10 | public interface PivotedFeature extends Feature { 11 | 12 | public Set getPrerequisites(); 13 | 14 | public Writable pivot(FeatureMap src, FeatureMap tgt); 15 | 16 | public void initializeAggregation(); 17 | 18 | public void aggregate(FeatureMap a); 19 | 20 | public Writable finalizeAggregation(); 21 | 22 | public Set getLowerBoundLabels(); 23 | 24 | public Set getUpperBoundLabels(); 25 | } 26 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedFeatureFactory.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import edu.jhu.thrax.util.FormatUtils; 7 | 8 | public class PivotedFeatureFactory { 9 | 10 | public static PivotedFeature get(String name) { 11 | if (name.equals(PivotedTargetPhraseGivenSourceFeature.NAME)) 12 | return new 
PivotedTargetPhraseGivenSourceFeature(); 13 | else if (name.equals(PivotedSourcePhraseGivenTargetFeature.NAME)) 14 | return new PivotedSourcePhraseGivenTargetFeature(); 15 | else if (name.equals(PivotedRarityPenaltyFeature.NAME)) 16 | return new PivotedRarityPenaltyFeature(); 17 | else if (name.equals(PivotedLexicalSourceGivenTargetFeature.NAME)) 18 | return new PivotedLexicalSourceGivenTargetFeature(); 19 | else if (name.equals(PivotedLexicalTargetGivenSourceFeature.NAME)) 20 | return new PivotedLexicalTargetGivenSourceFeature(); 21 | else if (name.equals(PivotedSourcePhraseGivenLHSFeature.NAME)) 22 | return new PivotedSourcePhraseGivenLHSFeature(); 23 | else if (name.equals(PivotedLhsGivenSourcePhraseFeature.NAME)) 24 | return new PivotedLhsGivenSourcePhraseFeature(); 25 | else if (name.equals(PivotedSourcePhraseGivenTargetAndLHSFeature.NAME)) 26 | return new PivotedSourcePhraseGivenTargetAndLHSFeature(); 27 | else if (name.equals(PivotedTargetPhraseGivenLHSFeature.NAME)) 28 | return new PivotedTargetPhraseGivenLHSFeature(); 29 | else if (name.equals(PivotedLhsGivenTargetPhraseFeature.NAME)) 30 | return new PivotedLhsGivenTargetPhraseFeature(); 31 | else if (name.equals(PivotedTargetPhraseGivenSourceAndLHSFeature.NAME)) 32 | return new PivotedTargetPhraseGivenSourceAndLHSFeature(); 33 | 34 | return null; 35 | } 36 | 37 | public static List getAll(String names) { 38 | String[] feature_names = FormatUtils.P_COMMA_OR_SPACE.split(names); 39 | List features = new ArrayList(); 40 | 41 | for (String feature_name : feature_names) { 42 | PivotedFeature feature = get(feature_name); 43 | if (feature != null) features.add(feature); 44 | } 45 | return features; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedLexicalSourceGivenTargetFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.annotation.SourceGivenTargetLexicalProbabilityFeature; 10 | import edu.jhu.thrax.hadoop.features.annotation.TargetGivenSourceLexicalProbabilityFeature; 11 | 12 | public class PivotedLexicalSourceGivenTargetFeature extends PivotedNegLogProbFeature { 13 | 14 | public static final String NAME = SourceGivenTargetLexicalProbabilityFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(SourceGivenTargetLexicalProbabilityFeature.NAME); 23 | prereqs.add(TargetGivenSourceLexicalProbabilityFeature.NAME); 24 | return prereqs; 25 | } 26 | 27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 28 | float egf = ((FloatWritable) tgt.get(TargetGivenSourceLexicalProbabilityFeature.NAME)).get(); 29 | float fge = ((FloatWritable) src.get(SourceGivenTargetLexicalProbabilityFeature.NAME)).get(); 30 | 31 | return new FloatWritable(egf + fge); 32 | } 33 | 34 | @Override 35 | public Set getLowerBoundLabels() { 36 | Set lower_bound_labels = new HashSet(); 37 | lower_bound_labels.add(TargetGivenSourceLexicalProbabilityFeature.NAME); 38 | lower_bound_labels.add(SourceGivenTargetLexicalProbabilityFeature.NAME); 39 | return lower_bound_labels; 40 | } 41 | 42 | @Override 43 | public Set getUpperBoundLabels() { 44 | return null; 45 | } 
46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedLexicalTargetGivenSourceFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.annotation.SourceGivenTargetLexicalProbabilityFeature; 10 | import edu.jhu.thrax.hadoop.features.annotation.TargetGivenSourceLexicalProbabilityFeature; 11 | 12 | public class PivotedLexicalTargetGivenSourceFeature extends PivotedNegLogProbFeature { 13 | 14 | public static final String NAME = TargetGivenSourceLexicalProbabilityFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(TargetGivenSourceLexicalProbabilityFeature.NAME); 23 | prereqs.add(SourceGivenTargetLexicalProbabilityFeature.NAME); 24 | return prereqs; 25 | } 26 | 27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 28 | float egf = ((FloatWritable) src.get(TargetGivenSourceLexicalProbabilityFeature.NAME)).get(); 29 | float fge = ((FloatWritable) tgt.get(SourceGivenTargetLexicalProbabilityFeature.NAME)).get(); 30 | 31 | return new FloatWritable(egf + fge); 32 | } 33 | 34 | @Override 35 | public Set getLowerBoundLabels() { 36 | Set lower_bound_labels = new HashSet(); 37 | lower_bound_labels.add(TargetGivenSourceLexicalProbabilityFeature.NAME); 38 | lower_bound_labels.add(SourceGivenTargetLexicalProbabilityFeature.NAME); 39 | return lower_bound_labels; 40 | } 41 | 42 | @Override 43 | public Set getUpperBoundLabels() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedLhsGivenSourcePhraseFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.LhsGivenSourcePhraseFeature; 10 | import edu.jhu.thrax.hadoop.features.mapred.LhsGivenTargetPhraseFeature; 11 | 12 | public class PivotedLhsGivenSourcePhraseFeature extends NonAggregatingPivotedFeature { 13 | 14 | public static final String NAME = LhsGivenSourcePhraseFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(LhsGivenTargetPhraseFeature.NAME); 23 | return prereqs; 24 | } 25 | 26 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 27 | return new FloatWritable(((FloatWritable) src.get(LhsGivenTargetPhraseFeature.NAME)).get()); 28 | } 29 | 30 | @Override 31 | public Set getLowerBoundLabels() { 32 | Set lower_bound_labels = new HashSet(); 33 | lower_bound_labels.add(LhsGivenTargetPhraseFeature.NAME); 34 | return lower_bound_labels; 35 | } 36 | 37 | @Override 38 | public Set getUpperBoundLabels() { 39 | return null; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedLhsGivenTargetPhraseFeature.java: 
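All of the pivot() implementations in this package follow the same identity: a paraphrase rule e1 -> e2 is built from two translation rules f -> e1 (the src feature map) and f -> e2 (the tgt feature map) that share a source phrase f, and because the scores are negative log probabilities, pivoting through a single f reduces to adding the two relevant scores; summing over all shared f happens later, during aggregation. A small sketch with invented numbers, not repository code:

// Hypothetical numbers illustrating why pivot() adds two -log probabilities.
public class PivotIdentityExample {
  public static void main(String[] args) {
    double negLogE2GivenF = 0.7;  // tgt rule f -> e2: -log p(e2 | f)
    double negLogFGivenE1 = 1.1;  // src rule f -> e1: -log p(f | e1)
    // p(e2 | e1) ~= p(e2 | f) * p(f | e1) for one shared f, a sum in -log space.
    double pivoted = negLogE2GivenF + negLogFGivenE1;
    System.out.println(pivoted + "  (p ~ " + Math.exp(-pivoted) + ")");
  }
}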
-------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.LhsGivenTargetPhraseFeature; 10 | 11 | public class PivotedLhsGivenTargetPhraseFeature extends NonAggregatingPivotedFeature { 12 | 13 | public static final String NAME = LhsGivenTargetPhraseFeature.NAME; 14 | 15 | public String getName() { 16 | return NAME; 17 | } 18 | 19 | public Set getPrerequisites() { 20 | Set prereqs = new HashSet(); 21 | prereqs.add(LhsGivenTargetPhraseFeature.NAME); 22 | return prereqs; 23 | } 24 | 25 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 26 | return new FloatWritable(((FloatWritable) tgt.get(LhsGivenTargetPhraseFeature.NAME)).get()); 27 | } 28 | 29 | @Override 30 | public Set getLowerBoundLabels() { 31 | Set lower_bound_labels = new HashSet(); 32 | lower_bound_labels.add(LhsGivenTargetPhraseFeature.NAME); 33 | return lower_bound_labels; 34 | } 35 | 36 | @Override 37 | public Set getUpperBoundLabels() { 38 | return null; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedNegLogProbFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.FloatWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.util.NegLogMath; 10 | import edu.jhu.thrax.util.Vocabulary; 11 | 12 | public abstract class PivotedNegLogProbFeature implements PivotedFeature { 13 | 14 | private static final FloatWritable ONE_PROB = new FloatWritable(0.0f); 15 | 16 | private float aggregated; 17 | 18 | public void initializeAggregation() { 19 | aggregated = 64.0f; 20 | } 21 | 22 | public void aggregate(FeatureMap features) { 23 | FloatWritable val = (FloatWritable) features.get(getName()); 24 | aggregated = NegLogMath.logAdd(aggregated, val.get()); 25 | } 26 | 27 | public FloatWritable finalizeAggregation() { 28 | return new FloatWritable(aggregated); 29 | } 30 | 31 | public void unaryGlueRuleScore(int nt, Map map) { 32 | map.put(Vocabulary.id(getName()), ONE_PROB); 33 | } 34 | 35 | public void binaryGlueRuleScore(int nt, Map map) { 36 | map.put(Vocabulary.id(getName()), ONE_PROB); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedRarityPenaltyFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.io.FloatWritable; 8 | import org.apache.hadoop.io.Writable; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 11 | import edu.jhu.thrax.hadoop.features.annotation.RarityPenaltyFeature; 12 | import edu.jhu.thrax.util.Vocabulary; 13 | 14 | public class PivotedRarityPenaltyFeature implements PivotedFeature { 15 | 16 | public static final String NAME = RarityPenaltyFeature.NAME; 17 | 18 | private static final FloatWritable ZERO = new FloatWritable(0.0f); 19 | 20 | private static final float RENORMALIZE = (float) 
Math.exp(-1); 21 | 22 | private float aggregated_rp; 23 | 24 | public String getName() { 25 | return NAME; 26 | } 27 | 28 | public Set getPrerequisites() { 29 | Set prereqs = new HashSet(); 30 | prereqs.add(RarityPenaltyFeature.NAME); 31 | return prereqs; 32 | } 33 | 34 | public FloatWritable pivot(FeatureMap a, FeatureMap b) { 35 | float a_rp = ((FloatWritable) a.get(RarityPenaltyFeature.NAME)).get(); 36 | float b_rp = ((FloatWritable) b.get(RarityPenaltyFeature.NAME)).get(); 37 | return new FloatWritable(Math.max(a_rp, b_rp)); 38 | } 39 | 40 | public void unaryGlueRuleScore(int nt, Map map) { 41 | map.put(Vocabulary.id(NAME), ZERO); 42 | } 43 | 44 | public void binaryGlueRuleScore(int nt, Map map) { 45 | map.put(Vocabulary.id(NAME), ZERO); 46 | } 47 | 48 | public void initializeAggregation() { 49 | aggregated_rp = -1; 50 | } 51 | 52 | public void aggregate(FeatureMap a) { 53 | float rp = ((FloatWritable) a.get(NAME)).get(); 54 | if (aggregated_rp == -1) { 55 | aggregated_rp = rp; 56 | } else { 57 | // Rarity is exp(1 - count). To compute rarity over a sum of counts: 58 | // rarity_{1+2} = exp(1 - (count_1 + count_2)) = exp(1 - count_1) * exp(-count_2) = 59 | // = exp(1 - count_1) * exp(1 - count_2) * exp(-1) = rarity_1 * rarity_2 * exp(-1) 60 | aggregated_rp *= rp * RENORMALIZE; 61 | } 62 | } 63 | 64 | public FloatWritable finalizeAggregation() { 65 | return new FloatWritable(aggregated_rp); 66 | } 67 | 68 | @Override 69 | public Set getLowerBoundLabels() { 70 | Set lower_bound_labels = new HashSet(); 71 | lower_bound_labels.add(RarityPenaltyFeature.NAME); 72 | return lower_bound_labels; 73 | } 74 | 75 | @Override 76 | public Set getUpperBoundLabels() { 77 | return null; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedSourcePhraseGivenLHSFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenLHSFeature; 10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenLHSFeature; 11 | 12 | public class PivotedSourcePhraseGivenLHSFeature extends NonAggregatingPivotedFeature { 13 | 14 | public static final String NAME = SourcePhraseGivenLHSFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(TargetPhraseGivenLHSFeature.NAME); 23 | return prereqs; 24 | } 25 | 26 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 27 | return new FloatWritable(((FloatWritable) src.get(TargetPhraseGivenLHSFeature.NAME)).get()); 28 | } 29 | 30 | @Override 31 | public Set getLowerBoundLabels() { 32 | Set lower_bound_labels = new HashSet(); 33 | lower_bound_labels.add(TargetPhraseGivenLHSFeature.NAME); 34 | return lower_bound_labels; 35 | } 36 | 37 | @Override 38 | public Set getUpperBoundLabels() { 39 | return null; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedSourcePhraseGivenTargetAndLHSFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | 
import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenTargetandLHSFeature; 10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenSourceandLHSFeature; 11 | 12 | public class PivotedSourcePhraseGivenTargetAndLHSFeature extends PivotedNegLogProbFeature { 13 | 14 | public static final String NAME = SourcePhraseGivenTargetandLHSFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(SourcePhraseGivenTargetandLHSFeature.NAME); 23 | prereqs.add(TargetPhraseGivenSourceandLHSFeature.NAME); 24 | return prereqs; 25 | } 26 | 27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 28 | float fge = ((FloatWritable) src.get(TargetPhraseGivenSourceandLHSFeature.NAME)).get(); 29 | float egf = ((FloatWritable) tgt.get(SourcePhraseGivenTargetandLHSFeature.NAME)).get(); 30 | 31 | return new FloatWritable(egf + fge); 32 | } 33 | 34 | @Override 35 | public Set getLowerBoundLabels() { 36 | Set lower_bound_labels = new HashSet(); 37 | lower_bound_labels.add(TargetPhraseGivenSourceandLHSFeature.NAME); 38 | lower_bound_labels.add(SourcePhraseGivenTargetandLHSFeature.NAME); 39 | return lower_bound_labels; 40 | } 41 | 42 | @Override 43 | public Set getUpperBoundLabels() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedSourcePhraseGivenTargetFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenTargetFeature; 10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenSourceFeature; 11 | 12 | public class PivotedSourcePhraseGivenTargetFeature extends PivotedNegLogProbFeature { 13 | 14 | public static final String NAME = SourcePhraseGivenTargetFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(TargetPhraseGivenSourceFeature.NAME); 23 | prereqs.add(SourcePhraseGivenTargetFeature.NAME); 24 | return prereqs; 25 | } 26 | 27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 28 | float src_f = ((FloatWritable) src.get(TargetPhraseGivenSourceFeature.NAME)).get(); 29 | float f_tgt = ((FloatWritable) tgt.get(SourcePhraseGivenTargetFeature.NAME)).get(); 30 | 31 | return new FloatWritable(src_f + f_tgt); 32 | } 33 | 34 | @Override 35 | public Set getLowerBoundLabels() { 36 | Set lower_bound_labels = new HashSet(); 37 | lower_bound_labels.add(TargetPhraseGivenSourceFeature.NAME); 38 | lower_bound_labels.add(SourcePhraseGivenTargetFeature.NAME); 39 | return lower_bound_labels; 40 | } 41 | 42 | @Override 43 | public Set getUpperBoundLabels() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedTargetPhraseGivenLHSFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import 
java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenLHSFeature; 10 | 11 | public class PivotedTargetPhraseGivenLHSFeature extends NonAggregatingPivotedFeature { 12 | 13 | public static final String NAME = TargetPhraseGivenLHSFeature.NAME; 14 | 15 | public String getName() { 16 | return NAME; 17 | } 18 | 19 | public Set getPrerequisites() { 20 | Set prereqs = new HashSet(); 21 | prereqs.add(TargetPhraseGivenLHSFeature.NAME); 22 | return prereqs; 23 | } 24 | 25 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 26 | return new FloatWritable(((FloatWritable) tgt.get(TargetPhraseGivenLHSFeature.NAME)).get()); 27 | } 28 | 29 | @Override 30 | public Set getLowerBoundLabels() { 31 | Set lower_bound_labels = new HashSet(); 32 | lower_bound_labels.add(TargetPhraseGivenLHSFeature.NAME); 33 | return lower_bound_labels; 34 | } 35 | 36 | @Override 37 | public Set getUpperBoundLabels() { 38 | return null; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedTargetPhraseGivenSourceAndLHSFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenTargetandLHSFeature; 10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenSourceandLHSFeature; 11 | 12 | public class PivotedTargetPhraseGivenSourceAndLHSFeature extends PivotedNegLogProbFeature { 13 | 14 | public static final String NAME = TargetPhraseGivenSourceandLHSFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(TargetPhraseGivenSourceandLHSFeature.NAME); 23 | prereqs.add(SourcePhraseGivenTargetandLHSFeature.NAME); 24 | return prereqs; 25 | } 26 | 27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 28 | float fge = ((FloatWritable) tgt.get(TargetPhraseGivenSourceandLHSFeature.NAME)).get(); 29 | float egf = ((FloatWritable) src.get(SourcePhraseGivenTargetandLHSFeature.NAME)).get(); 30 | 31 | return new FloatWritable(egf + fge); 32 | } 33 | 34 | @Override 35 | public Set getLowerBoundLabels() { 36 | Set lower_bound_labels = new HashSet(); 37 | lower_bound_labels.add(TargetPhraseGivenSourceandLHSFeature.NAME); 38 | lower_bound_labels.add(SourcePhraseGivenTargetandLHSFeature.NAME); 39 | return lower_bound_labels; 40 | } 41 | 42 | @Override 43 | public Set getUpperBoundLabels() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedTargetPhraseGivenSourceFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenTargetFeature; 10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenSourceFeature; 11 | 12 | 
public class PivotedTargetPhraseGivenSourceFeature extends PivotedNegLogProbFeature { 13 | 14 | public static final String NAME = TargetPhraseGivenSourceFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(TargetPhraseGivenSourceFeature.NAME); 23 | prereqs.add(SourcePhraseGivenTargetFeature.NAME); 24 | return prereqs; 25 | } 26 | 27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 28 | float tgt_f = ((FloatWritable) tgt.get(TargetPhraseGivenSourceFeature.NAME)).get(); 29 | float f_src = ((FloatWritable) src.get(SourcePhraseGivenTargetFeature.NAME)).get(); 30 | 31 | return new FloatWritable(tgt_f + f_src); 32 | } 33 | 34 | @Override 35 | public Set getLowerBoundLabels() { 36 | Set lower_bound_labels = new HashSet(); 37 | lower_bound_labels.add(TargetPhraseGivenSourceFeature.NAME); 38 | lower_bound_labels.add(SourcePhraseGivenTargetFeature.NAME); 39 | return lower_bound_labels; 40 | } 41 | 42 | @Override 43 | public Set getUpperBoundLabels() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/DefaultValues.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | public class DefaultValues { 4 | public static int DEFAULT_NUM_REDUCERS = 4; 5 | 6 | private DefaultValues() {}; 7 | } 8 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/DistributionalContextExtractionJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 15 | 16 | import edu.jhu.thrax.hadoop.distributional.ContextWritable; 17 | import edu.jhu.thrax.hadoop.distributional.DistributionalContextCombiner; 18 | import edu.jhu.thrax.hadoop.distributional.DistributionalContextMapper; 19 | import edu.jhu.thrax.hadoop.distributional.DistributionalContextReducer; 20 | import edu.jhu.thrax.hadoop.distributional.SignatureWritable; 21 | 22 | public class DistributionalContextExtractionJob implements ThraxJob { 23 | 24 | public Job getJob(Configuration conf) throws IOException { 25 | Job job = new Job(conf, "distributional"); 26 | 27 | job.setJarByClass(DistributionalContextMapper.class); 28 | 29 | job.setMapperClass(DistributionalContextMapper.class); 30 | job.setCombinerClass(DistributionalContextCombiner.class); 31 | job.setReducerClass(DistributionalContextReducer.class); 32 | 33 | job.setMapOutputKeyClass(Text.class); 34 | job.setMapOutputValueClass(ContextWritable.class); 35 | 36 | job.setOutputKeyClass(SignatureWritable.class); 37 | job.setOutputValueClass(NullWritable.class); 38 | 39 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 40 | 41 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS)); 42 | 
job.setNumReduceTasks(numReducers); 43 | 44 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.input-file"))); 45 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "signatures")); 46 | 47 | int max_split_size = conf.getInt("thrax.max-split-size", 0); 48 | if (max_split_size != 0) FileInputFormat.setMaxInputSplitSize(job, max_split_size); 49 | 50 | return job; 51 | } 52 | 53 | public String getName() { 54 | return "distributional"; 55 | } 56 | 57 | public String getOutputSuffix() { 58 | return null; 59 | } 60 | 61 | @Override 62 | public Set> getPrerequisites() { 63 | return new HashSet>(); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/DistributionalContextSortingJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 17 | 18 | import edu.jhu.thrax.hadoop.distributional.DistributionalContextMapper; 19 | import edu.jhu.thrax.hadoop.distributional.SignatureWritable; 20 | 21 | public class DistributionalContextSortingJob implements ThraxJob { 22 | 23 | private static HashSet> prereqs = 24 | new HashSet>(); 25 | 26 | public Job getJob(Configuration conf) throws IOException { 27 | Job job = new Job(conf, "sorting"); 28 | 29 | job.setJarByClass(DistributionalContextMapper.class); 30 | 31 | job.setMapperClass(Mapper.class); 32 | job.setReducerClass(Reducer.class); 33 | 34 | job.setInputFormatClass(SequenceFileInputFormat.class); 35 | 36 | job.setOutputKeyClass(SignatureWritable.class); 37 | job.setOutputValueClass(NullWritable.class); 38 | 39 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 40 | 41 | // TODO: Figure out how to make this workable with multiple reducers. Currently -getmerge-ing 42 | // multiple sequence file outputs from several reducers yields a broken file. 
43 | job.setNumReduceTasks(1); 44 | 45 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "signatures")); 46 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.outputPath", ""))); 47 | 48 | int max_split_size = conf.getInt("thrax.max-split-size", 0); 49 | if (max_split_size != 0) FileInputFormat.setMaxInputSplitSize(job, max_split_size); 50 | 51 | return job; 52 | } 53 | 54 | public String getName() { 55 | return "sorting"; 56 | } 57 | 58 | public Set> getPrerequisites() { 59 | prereqs.add(DistributionalContextExtractionJob.class); 60 | return prereqs; 61 | } 62 | 63 | public String getOutputSuffix() { 64 | return null; 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/ExtractionJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 13 | 14 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable; 15 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 16 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 17 | import edu.jhu.thrax.hadoop.extraction.ExtractionCombiner; 18 | import edu.jhu.thrax.hadoop.extraction.ExtractionMapper; 19 | import edu.jhu.thrax.hadoop.extraction.ExtractionReducer; 20 | 21 | public class ExtractionJob implements ThraxJob { 22 | 23 | public Set> getPrerequisites() { 24 | Set> result = new HashSet>(); 25 | result.add(VocabularyJob.class); 26 | return result; 27 | } 28 | 29 | public Job getJob(Configuration conf) throws IOException { 30 | Job job = new Job(conf, "extraction"); 31 | job.setJarByClass(ExtractionMapper.class); 32 | 33 | job.setMapperClass(ExtractionMapper.class); 34 | job.setCombinerClass(ExtractionCombiner.class); 35 | job.setReducerClass(ExtractionReducer.class); 36 | 37 | job.setSortComparatorClass(AlignedRuleWritable.RuleYieldComparator.class); 38 | job.setPartitionerClass(AlignedRuleWritable.RuleYieldPartitioner.class); 39 | 40 | job.setMapOutputKeyClass(AlignedRuleWritable.class); 41 | job.setMapOutputValueClass(Annotation.class); 42 | job.setOutputKeyClass(RuleWritable.class); 43 | job.setOutputValueClass(Annotation.class); 44 | 45 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 46 | 47 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS)); 48 | job.setNumReduceTasks(numReducers); 49 | 50 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.input-file"))); 51 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0); 52 | if (maxSplitSize != 0) FileInputFormat.setMaxInputSplitSize(job, maxSplitSize); 53 | 54 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "rules")); 55 | 56 | return job; 57 | } 58 | 59 | // TODO: unify names of jobs and their output directories 60 | 61 | public String getName() { 62 | return "extraction"; 63 | } 64 | 65 | public String getOutputSuffix() { 66 | return "rules"; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- 
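A minimal hypothetical skeleton of the ThraxJob pattern that the job classes in this package follow: declare prerequisites, build a Hadoop Job rooted at thrax.work-dir, and honor the thrax.reducers / mapreduce.job.reduces settings. The class name, its output directory, and the identity map/reduce stages are invented for illustration; this is not part of the repository:

// Hypothetical example of the recurring ThraxJob pattern; not repository code.
package edu.jhu.thrax.hadoop.jobs;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import edu.jhu.thrax.hadoop.datatypes.Annotation;
import edu.jhu.thrax.hadoop.datatypes.RuleWritable;

public class ExampleJob implements ThraxJob {

  // Run only after rule extraction has produced the "rules" directory.
  public Set<Class<? extends ThraxJob>> getPrerequisites() {
    Set<Class<? extends ThraxJob>> result = new HashSet<Class<? extends ThraxJob>>();
    result.add(ExtractionJob.class);
    return result;
  }

  public Job getJob(Configuration conf) throws IOException {
    Job job = new Job(conf, getName());
    job.setJarByClass(ExampleJob.class);

    // Identity pass over the extracted rules, just to show the wiring.
    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(RuleWritable.class);
    job.setOutputValueClass(Annotation.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // Same reducer-count resolution used by the other jobs in this package.
    int numReducers = conf.getInt("thrax.reducers",
        conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS));
    job.setNumReduceTasks(numReducers);

    FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "rules"));
    FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + getOutputSuffix()));
    return job;
  }

  public String getName() {
    return "example";
  }

  public String getOutputSuffix() {
    return "example";
  }
}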
/src/edu/jhu/thrax/hadoop/jobs/FeatureCollectionJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 15 | 16 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 17 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair; 18 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 19 | import edu.jhu.thrax.hadoop.paraphrasing.FeatureCollectionReducer; 20 | 21 | public class FeatureCollectionJob implements ThraxJob { 22 | 23 | private static HashSet> prereqs = 24 | new HashSet>(); 25 | 26 | private static HashSet prereq_names = new HashSet(); 27 | 28 | public static void addPrerequisite(Class c) { 29 | prereqs.add(c); 30 | try { 31 | ThraxJob prereq; 32 | prereq = c.newInstance(); 33 | prereq_names.add(prereq.getOutputSuffix()); 34 | } catch (Exception e) { 35 | e.printStackTrace(); 36 | } 37 | } 38 | 39 | public Set> getPrerequisites() { 40 | prereqs.add(ExtractionJob.class); 41 | return prereqs; 42 | } 43 | 44 | public Job getJob(Configuration conf) throws IOException { 45 | Job job = new Job(conf, "collect"); 46 | 47 | String workDir = conf.get("thrax.work-dir"); 48 | 49 | job.setJarByClass(FeatureCollectionReducer.class); 50 | 51 | job.setMapperClass(Mapper.class); 52 | job.setReducerClass(FeatureCollectionReducer.class); 53 | 54 | job.setInputFormatClass(SequenceFileInputFormat.class); 55 | job.setMapOutputKeyClass(RuleWritable.class); 56 | job.setMapOutputValueClass(FeaturePair.class); 57 | job.setOutputKeyClass(RuleWritable.class); 58 | job.setOutputValueClass(FeatureMap.class); 59 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 60 | 61 | job.setPartitionerClass(RuleWritable.YieldPartitioner.class); 62 | 63 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS)); 64 | job.setNumReduceTasks(numReducers); 65 | 66 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0); 67 | if (maxSplitSize != 0) FileInputFormat.setMaxInputSplitSize(job, maxSplitSize * 20); 68 | 69 | for (String prereq_name : prereq_names) 70 | FileInputFormat.addInputPath(job, new Path(workDir + prereq_name)); 71 | 72 | // TODO: double-check this. 
73 | if (FileInputFormat.getInputPaths(job).length == 0) 74 | FileInputFormat.addInputPath(job, new Path(workDir + "rules")); 75 | 76 | String outputPath = workDir + "collected"; 77 | FileOutputFormat.setOutputPath(job, new Path(outputPath)); 78 | 79 | return job; 80 | } 81 | 82 | public String getName() { 83 | return "collect"; 84 | } 85 | 86 | public String getOutputSuffix() { 87 | return "collected"; 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/JobState.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | public enum JobState 4 | { 5 | PREREQ_FAILED, 6 | FAILED, 7 | READY, 8 | RUNNING, 9 | SUCCESS, 10 | WAITING, 11 | PLANNED 12 | } 13 | 14 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/ParaphraseAggregationJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.io.compress.GzipCodec; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 18 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 19 | import edu.jhu.thrax.hadoop.paraphrasing.AggregationCombiner; 20 | import edu.jhu.thrax.hadoop.paraphrasing.AggregationMapper; 21 | import edu.jhu.thrax.hadoop.paraphrasing.AggregationReducer; 22 | 23 | public class ParaphraseAggregationJob implements ThraxJob { 24 | 25 | private static HashSet> prereqs = 26 | new HashSet>(); 27 | 28 | public Job getJob(Configuration conf) throws IOException { 29 | Job job = new Job(conf, "aggregate"); 30 | 31 | job.setJarByClass(AggregationReducer.class); 32 | 33 | job.setMapperClass(AggregationMapper.class); 34 | job.setCombinerClass(AggregationCombiner.class); 35 | job.setReducerClass(AggregationReducer.class); 36 | 37 | job.setInputFormatClass(SequenceFileInputFormat.class); 38 | job.setMapOutputKeyClass(RuleWritable.class); 39 | job.setMapOutputValueClass(FeatureMap.class); 40 | job.setOutputKeyClass(Text.class); 41 | job.setOutputValueClass(NullWritable.class); 42 | 43 | job.setSortComparatorClass(RuleWritable.YieldComparator.class); 44 | job.setPartitionerClass(RuleWritable.FirstWordPartitioner.class); 45 | 46 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "pivoted")); 47 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0); 48 | if (maxSplitSize != 0) FileInputFormat.setMaxInputSplitSize(job, maxSplitSize * 20); 49 | 50 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS)); 51 | job.setNumReduceTasks(numReducers); 52 | 53 | String outputPath = conf.get("thrax.outputPath", ""); 54 | FileOutputFormat.setOutputPath(job, new Path(outputPath)); 55 | 56 | FileOutputFormat.setCompressOutput(job, true); 57 | FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); 58 | 59 | return job; 60 | } 61 | 
62 | public String getName() { 63 | return "aggregate"; 64 | } 65 | 66 | public static void addPrerequisite(Class c) { 67 | prereqs.add(c); 68 | } 69 | 70 | public Set> getPrerequisites() { 71 | prereqs.add(ParaphrasePivotingJob.class); 72 | return prereqs; 73 | } 74 | 75 | public String getOutputSuffix() { 76 | return null; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/ParaphrasePivotingJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 14 | 15 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 16 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 17 | import edu.jhu.thrax.hadoop.paraphrasing.PivotingMapper; 18 | import edu.jhu.thrax.hadoop.paraphrasing.PivotingReducer; 19 | 20 | public class ParaphrasePivotingJob implements ThraxJob { 21 | 22 | private static HashSet> prereqs = 23 | new HashSet>(); 24 | 25 | public static void addPrerequisite(Class c) { 26 | prereqs.add(c); 27 | } 28 | 29 | public Set> getPrerequisites() { 30 | prereqs.add(FeatureCollectionJob.class); 31 | return prereqs; 32 | } 33 | 34 | public Job getJob(Configuration conf) throws IOException { 35 | Job job = new Job(conf, "pivoting"); 36 | 37 | job.setJarByClass(PivotingReducer.class); 38 | 39 | job.setMapperClass(PivotingMapper.class); 40 | job.setReducerClass(PivotingReducer.class); 41 | 42 | job.setInputFormatClass(SequenceFileInputFormat.class); 43 | job.setMapOutputKeyClass(RuleWritable.class); 44 | job.setMapOutputValueClass(FeatureMap.class); 45 | job.setOutputKeyClass(RuleWritable.class); 46 | job.setOutputValueClass(FeatureMap.class); 47 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 48 | 49 | job.setPartitionerClass(RuleWritable.SourcePartitioner.class); 50 | 51 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "collected")); 52 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0); 53 | if (maxSplitSize != 0) FileInputFormat.setMaxInputSplitSize(job, maxSplitSize * 20); 54 | 55 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS)); 56 | job.setNumReduceTasks(numReducers); 57 | 58 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "pivoted")); 59 | 60 | return job; 61 | } 62 | 63 | public String getName() { 64 | return "pivoting"; 65 | } 66 | 67 | public String getOutputSuffix() { 68 | return "pivoted"; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/SchedulerException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | public class SchedulerException extends Exception 4 | { 5 | private static final long serialVersionUID = 9090L; 6 | 7 | public SchedulerException(String s) 8 | { 9 | super(s); 10 | } 11 | } 12 | 13 | 
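JobState and SchedulerException above are consumed by the Scheduler (src/edu/jhu/thrax/hadoop/jobs/Scheduler.java, not reproduced in this section). The following standalone sketch is only an illustration of how prerequisite states typically gate a WAITING job; it is not the repository's Scheduler logic:

// Hypothetical illustration; not the repository's Scheduler implementation.
import java.util.Arrays;
import java.util.List;

public class JobStateExample {

  enum JobState { PREREQ_FAILED, FAILED, READY, RUNNING, SUCCESS, WAITING, PLANNED }

  // A WAITING job becomes READY once every prerequisite reached SUCCESS, and is
  // marked PREREQ_FAILED if any prerequisite failed outright or transitively.
  static JobState next(JobState current, List<JobState> prereqStates) {
    if (current != JobState.WAITING) return current;
    if (prereqStates.contains(JobState.FAILED) || prereqStates.contains(JobState.PREREQ_FAILED))
      return JobState.PREREQ_FAILED;
    for (JobState s : prereqStates)
      if (s != JobState.SUCCESS) return JobState.WAITING;
    return JobState.READY;
  }

  public static void main(String[] args) {
    System.out.println(next(JobState.WAITING, Arrays.asList(JobState.SUCCESS, JobState.RUNNING)));  // WAITING
    System.out.println(next(JobState.WAITING, Arrays.asList(JobState.SUCCESS, JobState.SUCCESS)));  // READY
  }
}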
-------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/SourceWordGivenTargetWordProbabilityJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | public class SourceWordGivenTargetWordProbabilityJob extends WordLexprobJob { 11 | 12 | public SourceWordGivenTargetWordProbabilityJob() { 13 | super(true); 14 | } 15 | 16 | public Job getJob(Configuration conf) throws IOException { 17 | Job job = super.getJob(conf); 18 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "lexprobs_sgt")); 19 | return job; 20 | } 21 | 22 | public String getName() { 23 | return "source-word-lexprob"; 24 | } 25 | 26 | public String getOutputSuffix() { 27 | return "lexprobs_sgt"; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/TargetWordGivenSourceWordProbabilityJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | public class TargetWordGivenSourceWordProbabilityJob extends WordLexprobJob { 11 | 12 | public TargetWordGivenSourceWordProbabilityJob() { 13 | super(false); 14 | } 15 | 16 | public Job getJob(Configuration conf) throws IOException { 17 | Job job = super.getJob(conf); 18 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "lexprobs_tgs")); 19 | return job; 20 | } 21 | 22 | @Override 23 | public String getName() { 24 | return "target-word-lexprob"; 25 | } 26 | 27 | @Override 28 | public String getOutputSuffix() { 29 | return "lexprobs_tgs"; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/ThraxJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.mapreduce.Job; 8 | 9 | public interface ThraxJob { 10 | 11 | public Job getJob(Configuration conf) throws IOException; 12 | 13 | public Set> getPrerequisites(); 14 | 15 | public String getName(); 16 | 17 | public String getOutputSuffix(); 18 | } 19 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/WordLexprobJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.FloatWritable; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.LongWritable; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import 
org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 15 | import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer; 16 | 17 | import edu.jhu.thrax.hadoop.features.WordLexicalProbabilityCalculator; 18 | 19 | public abstract class WordLexprobJob implements ThraxJob { 20 | public static final String SOURCE_GIVEN_TARGET = "thrax.__wordlexprob_sgt"; 21 | private boolean isSourceGivenTarget; 22 | 23 | public WordLexprobJob(boolean isSrcGivenTgt) { 24 | isSourceGivenTarget = isSrcGivenTgt; 25 | } 26 | 27 | public Set> getPrerequisites() { 28 | Set> result = new HashSet>(); 29 | result.add(VocabularyJob.class); 30 | return result; 31 | } 32 | 33 | public Job getJob(Configuration conf) throws IOException { 34 | Configuration theConf = new Configuration(conf); 35 | theConf.setBoolean(SOURCE_GIVEN_TARGET, isSourceGivenTarget); 36 | Job job = new Job(theConf, getName()); 37 | job.setJarByClass(WordLexicalProbabilityCalculator.class); 38 | job.setMapperClass(WordLexicalProbabilityCalculator.Map.class); 39 | job.setCombinerClass(IntSumReducer.class); 40 | 41 | job.setPartitionerClass(WordLexicalProbabilityCalculator.Partition.class); 42 | job.setReducerClass(WordLexicalProbabilityCalculator.Reduce.class); 43 | 44 | job.setMapOutputKeyClass(LongWritable.class); 45 | job.setMapOutputValueClass(IntWritable.class); 46 | 47 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS)); 48 | job.setNumReduceTasks(numReducers); 49 | 50 | job.setOutputKeyClass(LongWritable.class); 51 | job.setOutputValueClass(FloatWritable.class); 52 | 53 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 54 | 55 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.input-file"))); 56 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0); 57 | if (maxSplitSize != 0) { 58 | FileInputFormat.setMaxInputSplitSize(job, maxSplitSize); 59 | } 60 | return job; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/output/OutputReducer.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.output; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | import java.util.Map; 6 | import java.util.TreeMap; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.io.Writable; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | 14 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair; 15 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 16 | import edu.jhu.thrax.hadoop.features.SimpleFeature; 17 | import edu.jhu.thrax.hadoop.features.SimpleFeatureFactory; 18 | import edu.jhu.thrax.util.BackwardsCompatibility; 19 | import edu.jhu.thrax.util.FormatUtils; 20 | import edu.jhu.thrax.util.Vocabulary; 21 | 22 | public class OutputReducer extends Reducer { 23 | 24 | private boolean label; 25 | private boolean sparse; 26 | 27 | private List simpleFeatures; 28 | 29 | protected void setup(Context context) throws IOException, InterruptedException { 30 | Configuration conf = context.getConfiguration(); 31 | String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*"; 32 | Vocabulary.initialize(conf, vocabulary_path); 33 | 34 | label = conf.getBoolean("thrax.label-feature-scores", true); 35 | sparse = conf.getBoolean("thrax.sparse-feature-vectors", false); 36 | 37 | String features = 
BackwardsCompatibility.equivalent(conf.get("thrax.features", "")); 38 | simpleFeatures = SimpleFeatureFactory.getAll(features); 39 | } 40 | 41 | protected void reduce(RuleWritable key, Iterable values, Context context) 42 | throws IOException, InterruptedException { 43 | Map features = new TreeMap(); 44 | for (FeaturePair fp : values) 45 | features.put(Vocabulary.word(fp.key), fp.val.get()); 46 | for (SimpleFeature feature : simpleFeatures) 47 | features.put(feature.getName(), feature.score(key)); 48 | context.write(FormatUtils.ruleToText(key, features, label, sparse), NullWritable.get()); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/paraphrasing/AggregationCombiner.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.paraphrasing; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.mapreduce.Reducer; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 12 | import edu.jhu.thrax.hadoop.features.annotation.AnnotationFeature; 13 | import edu.jhu.thrax.hadoop.features.annotation.AnnotationFeatureFactory; 14 | import edu.jhu.thrax.hadoop.features.pivot.PivotedAnnotationFeature; 15 | import edu.jhu.thrax.hadoop.features.pivot.PivotedFeature; 16 | import edu.jhu.thrax.hadoop.features.pivot.PivotedFeatureFactory; 17 | import edu.jhu.thrax.util.BackwardsCompatibility; 18 | import edu.jhu.thrax.util.FormatUtils; 19 | import edu.jhu.thrax.util.Vocabulary; 20 | 21 | public class AggregationCombiner 22 | extends Reducer { 23 | 24 | private List pivotedFeatures; 25 | 26 | protected void setup(Context context) throws IOException, InterruptedException { 27 | Configuration conf = context.getConfiguration(); 28 | String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*"; 29 | Vocabulary.initialize(conf, vocabulary_path); 30 | 31 | pivotedFeatures = new ArrayList(); 32 | List annotationFeatures = new ArrayList(); 33 | 34 | String features = BackwardsCompatibility.equivalent(conf.get("thrax.features", "")); 35 | for (String f_name : FormatUtils.P_COMMA_OR_SPACE.split(features)) { 36 | PivotedFeature pf = PivotedFeatureFactory.get(f_name); 37 | if (pf != null) { 38 | pivotedFeatures.add(pf); 39 | } else { 40 | AnnotationFeature af = AnnotationFeatureFactory.get(f_name); 41 | if (af != null) { 42 | annotationFeatures.add(af); 43 | } 44 | } 45 | } 46 | if (!annotationFeatures.isEmpty()) pivotedFeatures.add(new PivotedAnnotationFeature()); 47 | } 48 | 49 | protected void reduce(RuleWritable key, Iterable values, Context context) 50 | throws IOException, InterruptedException { 51 | FeatureMap merged = new FeatureMap(); 52 | 53 | for (PivotedFeature feature : pivotedFeatures) 54 | feature.initializeAggregation(); 55 | for (FeatureMap feature_map : values) { 56 | for (PivotedFeature feature : pivotedFeatures) { 57 | try { 58 | feature.aggregate(feature_map); 59 | } catch (Exception e) { 60 | throw new RuntimeException(key.toString() + " on " + feature.getName() + ": " 61 | + e.getMessage()); 62 | } 63 | } 64 | } 65 | for (PivotedFeature feature : pivotedFeatures) 66 | merged.put(feature.getName(), feature.finalizeAggregation()); 67 | context.write(key, merged); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- 
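AggregationCombiner above only orchestrates the initializeAggregation() / aggregate() / finalizeAggregation() calls; the arithmetic lives in the pivoted features themselves. A self-contained sketch of that arithmetic with made-up numbers, assuming NegLogMath.logAdd adds two probabilities stored as negative logs (this standalone class is illustrative only, not repository code):

// Illustrative arithmetic only; the values and this class are hypothetical.
public class PivotAggregationArithmetic {

  // Assumed semantics of NegLogMath.logAdd: add probabilities kept as -log p.
  static double logAdd(double a, double b) {
    return -Math.log(Math.exp(-a) + Math.exp(-b));
  }

  public static void main(String[] args) {
    // PivotedNegLogProbFeature: two pivot phrases produce -log p = 2.0 and 3.0
    // for the same paraphrase rule; aggregation sums the underlying probabilities.
    System.out.println(logAdd(2.0, 3.0));  // ~1.69, i.e. p ~ 0.185

    // PivotedRarityPenaltyFeature: rarity = exp(1 - count), so combining the
    // counts of two instances multiplies the rarities and renormalizes by exp(-1),
    // exactly as the comment in that class derives.
    double rarity1 = Math.exp(1 - 3);  // count 3
    double rarity2 = Math.exp(1 - 5);  // count 5
    double combined = rarity1 * rarity2 * Math.exp(-1);
    System.out.println(combined + " vs " + Math.exp(1 - 8));  // equal up to rounding: acts like count 8
  }
}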
/src/edu/jhu/thrax/hadoop/paraphrasing/AggregationMapper.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.paraphrasing; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 10 | import edu.jhu.thrax.util.Vocabulary; 11 | 12 | public class AggregationMapper extends Mapper { 13 | 14 | protected void setup(Context context) throws IOException, InterruptedException { 15 | Configuration conf = context.getConfiguration(); 16 | Vocabulary.initialize(conf); 17 | } 18 | 19 | protected void map(RuleWritable key, FeatureMap value, Context context) throws IOException, 20 | InterruptedException { 21 | context.write(key, value); 22 | context.progress(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/paraphrasing/FeatureCollectionReducer.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.paraphrasing; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair; 10 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 11 | import edu.jhu.thrax.util.Vocabulary; 12 | 13 | public class FeatureCollectionReducer 14 | extends Reducer { 15 | 16 | protected void setup(Context context) throws IOException, InterruptedException { 17 | Configuration conf = context.getConfiguration(); 18 | String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*"; 19 | Vocabulary.initialize(conf, vocabulary_path); 20 | } 21 | 22 | protected void reduce(RuleWritable key, Iterable values, Context context) 23 | throws IOException, InterruptedException { 24 | FeatureMap features = new FeatureMap(); 25 | for (FeaturePair fp : values) 26 | features.put(fp.key, fp.val.get()); 27 | context.write(key, features); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/paraphrasing/PivotingMapper.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.paraphrasing; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 10 | import edu.jhu.thrax.util.Vocabulary; 11 | 12 | public class PivotingMapper extends Mapper { 13 | 14 | protected void setup(Context context) throws IOException, InterruptedException { 15 | Configuration conf = context.getConfiguration(); 16 | Vocabulary.initialize(conf); 17 | } 18 | 19 | protected void map(RuleWritable key, FeatureMap value, Context context) throws IOException, 20 | InterruptedException { 21 | context.write(key, value); 22 | context.progress(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/tools/ExtractionTool.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.tools; 2 | 3 | import java.util.Map; 4 | 5 | import 
org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.conf.Configured; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer; 14 | import org.apache.hadoop.util.Tool; 15 | import org.apache.hadoop.util.ToolRunner; 16 | 17 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 18 | import edu.jhu.thrax.hadoop.extraction.ExtractionMapper; 19 | import edu.jhu.thrax.util.ConfFileParser; 20 | 21 | public class ExtractionTool extends Configured implements Tool 22 | { 23 | public int run(String [] argv) throws Exception 24 | { 25 | if (argv.length < 1) { 26 | System.err.println("USAGE: ExtractionTool "); 27 | return 1; 28 | } 29 | String thraxConf = argv[0]; 30 | Configuration conf = getConf(); 31 | 32 | Map options = ConfFileParser.parse(thraxConf); 33 | for (String opt : options.keySet()) { 34 | conf.set("thrax." + opt, options.get(opt)); 35 | } 36 | String inputPath = conf.get("thrax.input-file"); 37 | if (inputPath == null) { 38 | System.err.println("Set input-file key in conf file " + thraxConf + "!"); 39 | return 1; 40 | } 41 | String workDir = conf.get("thrax.work-dir"); 42 | if (workDir == null) { 43 | System.err.println("Set work-dir key in conf file " + thraxConf + "!"); 44 | return 1; 45 | } 46 | 47 | Job job = new Job(conf, "thrax"); 48 | job.setJarByClass(ExtractionMapper.class); 49 | job.setMapperClass(ExtractionMapper.class); 50 | job.setCombinerClass(IntSumReducer.class); 51 | job.setReducerClass(IntSumReducer.class); 52 | 53 | job.setMapOutputKeyClass(RuleWritable.class); 54 | job.setMapOutputValueClass(IntWritable.class); 55 | 56 | job.setOutputKeyClass(RuleWritable.class); 57 | job.setOutputValueClass(IntWritable.class); 58 | 59 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 60 | 61 | FileInputFormat.setInputPaths(job, new Path(inputPath)); 62 | if (!workDir.endsWith(Path.SEPARATOR)) 63 | workDir += Path.SEPARATOR; 64 | FileOutputFormat.setOutputPath(job, new Path(workDir + "rules")); 65 | 66 | job.submit(); 67 | return 0; 68 | } 69 | 70 | public static void main(String [] argv) throws Exception 71 | { 72 | int exit_code = ToolRunner.run(null, new ExtractionTool(), argv); 73 | System.exit(exit_code); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/tools/FeatureTool.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.tools; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.conf.Configured; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 14 | import org.apache.hadoop.util.Tool; 15 | import org.apache.hadoop.util.ToolRunner; 16 | 17 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 18 | import 
edu.jhu.thrax.hadoop.features.mapred.MapReduceFeature; 19 | import edu.jhu.thrax.hadoop.features.mapred.MapReduceFeatureFactory; 20 | import edu.jhu.thrax.util.ConfFileParser; 21 | 22 | public class FeatureTool extends Configured implements Tool 23 | { 24 | public int run(String [] argv) throws Exception 25 | { 26 | if (argv.length < 2) { 27 | System.err.println("usage: FeatureTool "); 28 | return 1; 29 | } 30 | String confFile = argv[0]; 31 | String featureName = argv[1]; 32 | MapReduceFeature f = MapReduceFeatureFactory.get(featureName); 33 | if (!(f instanceof MapReduceFeature)) { 34 | System.err.println("Not a MapReduceFeature: " + featureName); 35 | return 1; 36 | } 37 | Configuration conf = getConf(); 38 | Map options = ConfFileParser.parse(confFile); 39 | for (String opt : options.keySet()) { 40 | conf.set("thrax." + opt, options.get(opt)); 41 | } 42 | String workDir = conf.get("thrax.work-dir"); 43 | if (workDir == null) { 44 | System.err.println("set work-dir key in conf file " + confFile + "!"); 45 | return 1; 46 | } 47 | if (!workDir.endsWith(Path.SEPARATOR)) { 48 | workDir += Path.SEPARATOR; 49 | conf.set("thrax.work-dir", workDir); 50 | } 51 | Job job = new Job(conf, String.format("thrax-%s", featureName)); 52 | 53 | job.setJarByClass(f.getClass()); 54 | job.setMapperClass(f.mapperClass()); 55 | job.setCombinerClass(f.combinerClass()); 56 | job.setSortComparatorClass(f.sortComparatorClass()); 57 | job.setPartitionerClass(f.partitionerClass()); 58 | job.setReducerClass(f.reducerClass()); 59 | 60 | job.setInputFormatClass(SequenceFileInputFormat.class); 61 | 62 | job.setMapOutputKeyClass(RuleWritable.class); 63 | job.setMapOutputValueClass(IntWritable.class); 64 | 65 | job.setOutputKeyClass(RuleWritable.class); 66 | job.setOutputValueClass(IntWritable.class); 67 | 68 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 69 | 70 | FileInputFormat.setInputPaths(job, new Path(workDir + "rules")); 71 | FileOutputFormat.setOutputPath(job, new Path(workDir + featureName)); 72 | 73 | job.submit(); 74 | return 0; 75 | } 76 | 77 | public static void main(String [] argv) throws Exception 78 | { 79 | int exit_code = ToolRunner.run(null, new FeatureTool(), argv); 80 | System.exit(exit_code); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/tools/OutputTool.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.tools; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.conf.Configured; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.NullWritable; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.util.Tool; 15 | import org.apache.hadoop.util.ToolRunner; 16 | 17 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 18 | import edu.jhu.thrax.hadoop.features.mapred.MapReduceFeature; 19 | import edu.jhu.thrax.hadoop.features.mapred.MapReduceFeatureFactory; 20 | import edu.jhu.thrax.hadoop.output.OutputReducer; 21 | import edu.jhu.thrax.util.BackwardsCompatibility; 22 | import edu.jhu.thrax.util.ConfFileParser; 23 | import edu.jhu.thrax.util.FormatUtils; 24 | 25 | public class OutputTool 
extends Configured implements Tool 26 | { 27 | public int run(String [] argv) throws Exception 28 | { 29 | if (argv.length < 1) { 30 | System.err.println("usage: OutputTool "); 31 | return 1; 32 | } 33 | String confFile = argv[0]; 34 | Map options = ConfFileParser.parse(confFile); 35 | Configuration conf = getConf(); 36 | for (String opt : options.keySet()) { 37 | conf.set("thrax." + opt, options.get(opt)); 38 | } 39 | String workDir = conf.get("thrax.work-dir"); 40 | if (workDir == null) { 41 | System.err.println("Set work-dir key in conf file " + confFile + "!"); 42 | return 1; 43 | } 44 | if (!workDir.endsWith(Path.SEPARATOR)) { 45 | workDir += Path.SEPARATOR; 46 | conf.set("thrax.work-dir", workDir); 47 | } 48 | Job job = new Job(conf, "thrax-collect"); 49 | job.setJarByClass(OutputReducer.class); 50 | 51 | job.setMapperClass(Mapper.class); 52 | job.setReducerClass(OutputReducer.class); 53 | 54 | job.setInputFormatClass(SequenceFileInputFormat.class); 55 | 56 | job.setMapOutputKeyClass(RuleWritable.class); 57 | job.setMapOutputValueClass(NullWritable.class); 58 | 59 | job.setOutputKeyClass(RuleWritable.class); 60 | job.setOutputValueClass(NullWritable.class); 61 | 62 | String features = BackwardsCompatibility.equivalent(conf.get("thrax.features", "")); 63 | for (String feature : FormatUtils.P_SPACE.split(features)) { 64 | if (MapReduceFeatureFactory.get(feature) instanceof MapReduceFeature) { 65 | FileInputFormat.addInputPath(job, new Path(workDir + feature)); 66 | } 67 | } 68 | if (FileInputFormat.getInputPaths(job).length == 0) 69 | FileInputFormat.addInputPath(job, new Path(workDir + "rules")); 70 | FileOutputFormat.setOutputPath(job, new Path(workDir + "final")); 71 | 72 | job.submit(); 73 | return 0; 74 | } 75 | 76 | public static void main(String [] argv) throws Exception 77 | { 78 | int exit_code = ToolRunner.run(null, new OutputTool(), argv); 79 | System.exit(exit_code); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/tools/SourceWordGivenTargetWordProbabilityTool.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.tools; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.conf.Configured; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.FloatWritable; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer; 15 | import org.apache.hadoop.util.Tool; 16 | import org.apache.hadoop.util.ToolRunner; 17 | 18 | import edu.jhu.thrax.hadoop.datatypes.TextPair; 19 | import edu.jhu.thrax.hadoop.features.WordLexicalProbabilityCalculator; 20 | import edu.jhu.thrax.hadoop.jobs.WordLexprobJob; 21 | import edu.jhu.thrax.util.ConfFileParser; 22 | 23 | public class SourceWordGivenTargetWordProbabilityTool extends Configured implements Tool 24 | { 25 | public int run(String [] argv) throws Exception 26 | { 27 | if (argv.length < 1) { 28 | System.err.println("usage: SourceWordGivenTargetWordProbabilityTool "); 29 | return 1; 30 | } 31 | String confFile = argv[0]; 32 | Configuration conf = getConf(); 33 | Map options = ConfFileParser.parse(confFile); 34 | for (String 
opt : options.keySet()) { 35 | conf.set("thrax." + opt, options.get(opt)); 36 | } 37 | String input = conf.get("thrax.input-file"); 38 | if (input == null) { 39 | System.err.println("set input-file key in conf file " + confFile + "!"); 40 | return 1; 41 | } 42 | String workDir = conf.get("thrax.work-dir"); 43 | if (workDir == null) { 44 | System.err.println("set work-dir key in conf file " + confFile + "!"); 45 | return 1; 46 | } 47 | if (!workDir.endsWith(Path.SEPARATOR)) { 48 | workDir += Path.SEPARATOR; 49 | conf.set("thrax.work-dir", workDir); 50 | } 51 | conf.setBoolean(WordLexprobJob.SOURCE_GIVEN_TARGET, true); 52 | Job job = new Job(conf, "thrax-sgt-word-lexprob"); 53 | 54 | job.setJarByClass(WordLexicalProbabilityCalculator.class); 55 | job.setMapperClass(WordLexicalProbabilityCalculator.Map.class); 56 | job.setCombinerClass(IntSumReducer.class); 57 | job.setSortComparatorClass(TextPair.SndMarginalComparator.class); 58 | job.setPartitionerClass(WordLexicalProbabilityCalculator.Partition.class); 59 | job.setReducerClass(WordLexicalProbabilityCalculator.Reduce.class); 60 | 61 | job.setMapOutputKeyClass(TextPair.class); 62 | job.setMapOutputValueClass(IntWritable.class); 63 | 64 | job.setOutputKeyClass(TextPair.class); 65 | job.setOutputValueClass(FloatWritable.class); 66 | 67 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 68 | 69 | FileInputFormat.setInputPaths(job, new Path(input)); 70 | FileOutputFormat.setOutputPath(job, new Path(workDir + "lexprobs_sgt")); 71 | 72 | job.submit(); 73 | return 0; 74 | } 75 | 76 | public static void main(String [] argv) throws Exception 77 | { 78 | int exit_code = ToolRunner.run(null, new SourceWordGivenTargetWordProbabilityTool(), argv); 79 | System.exit(exit_code); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/tools/TargetWordGivenSourceWordProbabilityTool.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.tools; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.conf.Configured; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.FloatWritable; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer; 15 | import org.apache.hadoop.util.Tool; 16 | import org.apache.hadoop.util.ToolRunner; 17 | 18 | import edu.jhu.thrax.hadoop.datatypes.TextPair; 19 | import edu.jhu.thrax.hadoop.features.WordLexicalProbabilityCalculator; 20 | import edu.jhu.thrax.hadoop.jobs.WordLexprobJob; 21 | import edu.jhu.thrax.util.ConfFileParser; 22 | 23 | public class TargetWordGivenSourceWordProbabilityTool extends Configured implements Tool 24 | { 25 | public int run(String [] argv) throws Exception 26 | { 27 | if (argv.length < 1) { 28 | System.err.println("usage: TargetWordGivenSourceWordProbabilityTool "); 29 | return 1; 30 | } 31 | String confFile = argv[0]; 32 | Configuration conf = getConf(); 33 | Map options = ConfFileParser.parse(confFile); 34 | for (String opt : options.keySet()) { 35 | conf.set("thrax." 
+ opt, options.get(opt)); 36 | } 37 | String input = conf.get("thrax.input-file"); 38 | if (input == null) { 39 | System.err.println("set input-file key in conf file " + confFile + "!"); 40 | return 1; 41 | } 42 | String workDir = conf.get("thrax.work-dir"); 43 | if (workDir == null) { 44 | System.err.println("set work-dir key in conf file " + confFile + "!"); 45 | return 1; 46 | } 47 | if (!workDir.endsWith(Path.SEPARATOR)) { 48 | workDir += Path.SEPARATOR; 49 | conf.set("thrax.work-dir", workDir); 50 | } 51 | conf.setBoolean(WordLexprobJob.SOURCE_GIVEN_TARGET, false); 52 | Job job = new Job(conf, "thrax-tgs-word-lexprob"); 53 | 54 | job.setJarByClass(WordLexicalProbabilityCalculator.class); 55 | job.setMapperClass(WordLexicalProbabilityCalculator.Map.class); 56 | job.setCombinerClass(IntSumReducer.class); 57 | job.setSortComparatorClass(TextPair.SndMarginalComparator.class); 58 | job.setPartitionerClass(WordLexicalProbabilityCalculator.Partition.class); 59 | job.setReducerClass(WordLexicalProbabilityCalculator.Reduce.class); 60 | 61 | job.setMapOutputKeyClass(TextPair.class); 62 | job.setMapOutputValueClass(IntWritable.class); 63 | 64 | job.setOutputKeyClass(TextPair.class); 65 | job.setOutputValueClass(FloatWritable.class); 66 | 67 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 68 | 69 | FileInputFormat.setInputPaths(job, new Path(input)); 70 | FileOutputFormat.setOutputPath(job, new Path(workDir + "lexprobs_tgs")); 71 | 72 | job.submit(); 73 | return 0; 74 | } 75 | 76 | public static void main(String [] argv) throws Exception 77 | { 78 | int exit_code = ToolRunner.run(null, new TargetWordGivenSourceWordProbabilityTool(), argv); 79 | System.exit(exit_code); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/lexprob/HashMapLexprobTable.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.lexprob; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | 8 | public class HashMapLexprobTable extends SequenceFileLexprobTable { 9 | private HashMap table; 10 | 11 | public HashMapLexprobTable(Configuration conf, String fileGlob) throws IOException { 12 | super(conf, fileGlob); 13 | Iterable entries = getSequenceFileIterator(fs, conf, files); 14 | initialize(entries); 15 | } 16 | 17 | public void initialize(Iterable entries) { 18 | table = new HashMap(); 19 | for (TableEntry te : entries) { 20 | table.put((((long) te.car << 32) | te.cdr), te.probability); 21 | if (table.size() % 1000 == 0) System.err.printf("[%d]\n", table.size()); 22 | } 23 | } 24 | 25 | public float get(int car, int cdr) { 26 | long pair = (((long) car << 32) | cdr); 27 | if (table.containsKey(pair)) return table.get(pair); 28 | return -1.0f; 29 | } 30 | 31 | public boolean contains(int car, int cdr) { 32 | long pair = (((long) car << 32) | cdr); 33 | return table.containsKey(pair); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/lexprob/LexicalProbabilityTable.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.lexprob; 2 | 3 | 4 | /** 5 | * A data structure holding word-level lexical probabilities. The table only needs to support two 6 | * operations: determining whether a particular pair is present in the table, and returning the 7 | * probability associated with the pair. 
8 | */ 9 | public interface LexicalProbabilityTable { 10 | /** 11 | * Return the lexical probability of a source language word given a target language word. 12 | * 13 | * @param source the source language word 14 | * @param target the target language word 15 | * @return the probability p(source|target) if present, -1 otherwise 16 | */ 17 | public float logpSourceGivenTarget(int source, int target); 18 | 19 | // TODO: these don't actually return -logp, they return p. 20 | 21 | /** 22 | * Return the lexical probability of a target language word given a source language word. 23 | * 24 | * @param source the source language word 25 | * @param target the target language word 26 | * @return the probability p(target|source) is present, -1 otherwise 27 | */ 28 | public float logpTargetGivenSource(int source, int target); 29 | } 30 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/lexprob/LexprobTest.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.lexprob; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.conf.Configured; 5 | import org.apache.hadoop.util.Tool; 6 | import org.apache.hadoop.util.ToolRunner; 7 | 8 | public class LexprobTest extends Configured implements Tool { 9 | public int run(String[] argv) throws Exception { 10 | if (argv.length < 1) { 11 | System.err.println("usage: LexprobTest "); 12 | return 1; 13 | } 14 | 15 | Configuration conf = getConf(); 16 | HashMapLexprobTable t = new HashMapLexprobTable(conf, argv[0]); 17 | System.err.println("HashMap populated: " + t.toString()); 18 | TrieLexprobTable trie = new TrieLexprobTable(conf, argv[0]); 19 | System.err.println("Trie populated: " + trie.toString()); 20 | return 0; 21 | } 22 | 23 | public static void main(String[] argv) throws Exception { 24 | ToolRunner.run(null, new LexprobTest(), argv); 25 | return; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/lexprob/TableEntry.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.lexprob; 2 | 3 | import org.apache.hadoop.io.FloatWritable; 4 | import org.apache.hadoop.io.LongWritable; 5 | 6 | import edu.jhu.thrax.util.Vocabulary; 7 | 8 | public class TableEntry { 9 | 10 | public final int car; 11 | public final int cdr; 12 | public final float probability; 13 | 14 | public TableEntry(LongWritable pair, FloatWritable d) { 15 | int first = (int) (pair.get() >> 32); 16 | car = (first < 0 ? 
Vocabulary.getUnknownId() : first); 17 | cdr = (int) pair.get(); 18 | probability = d.get(); 19 | } 20 | 21 | public String toString() { 22 | return String.format("(%s,%s):%.4f", car, cdr, probability); 23 | } 24 | 25 | public boolean equals(Object o) { 26 | if (this == o) return true; 27 | if (!(o instanceof TableEntry)) return false; 28 | TableEntry te = (TableEntry) o; 29 | return car == te.car && cdr == te.cdr && probability == te.probability; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/syntax/ParseLattice.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.syntax; 2 | 3 | import java.util.Collection; 4 | 5 | public interface ParseLattice { 6 | 7 | public Collection getConstituentLabels(int from, int to); 8 | 9 | public Collection getConcatenatedLabels(int from, int to); 10 | 11 | public Collection getCcgLabels(int from, int to); 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/tools/JudgeParaphrases.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.tools; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.IOException; 5 | import java.util.Scanner; 6 | import java.util.logging.Logger; 7 | 8 | import edu.jhu.jerboa.util.FileManager; 9 | import edu.jhu.thrax.util.io.LineReader; 10 | 11 | public class JudgeParaphrases { 12 | 13 | private static final Logger logger = Logger.getLogger(JudgeParaphrases.class.getName()); 14 | 15 | public static void main(String[] args) { 16 | 17 | String input = null; 18 | String output = null; 19 | 20 | for (int i = 0; i < args.length; i++) { 21 | if ("-i".equals(args[i]) && (i < args.length - 1)) { 22 | input = args[++i]; 23 | } else if ("-o".equals(args[i]) && (i < args.length - 1)) { 24 | output = args[++i]; 25 | } 26 | } 27 | 28 | if (input == null) { 29 | logger.severe("No input file specified."); 30 | return; 31 | } 32 | if (output == null) { 33 | logger.severe("No output file specified."); 34 | return; 35 | } 36 | 37 | LineReader reader = null; 38 | BufferedWriter writer = null; 39 | Scanner user = null; 40 | try { 41 | reader = new LineReader(input); 42 | writer = FileManager.getWriter(output); 43 | user = new Scanner(System.in); 44 | while (reader.hasNext()) { 45 | String pp = reader.next().trim(); 46 | System.out.print(pp + "\t"); 47 | String score = user.next().trim(); 48 | if (score.toLowerCase().equals("quit") || score.toLowerCase().equals("exit")) 49 | break; 50 | writer.write(score + "\t" + pp + "\n"); 51 | } 52 | reader.close(); 53 | writer.close(); 54 | } catch (IOException e) { 55 | logger.severe(e.getMessage()); 56 | } 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/tools/SequenceToGrammar.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.tools; 2 | 3 | import java.io.BufferedWriter; 4 | import java.util.logging.Logger; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.SequenceFile; 10 | import org.apache.hadoop.io.Text; 11 | 12 | import edu.jhu.jerboa.util.FileManager; 13 | 14 | public class SequenceToGrammar { 15 | 16 | private static final Logger logger = Logger.getLogger(SequenceToGrammar.class.getName()); 17 | 18 | 
private static void usage() { 19 | System.err.println("Usage: java edu.jhu.thrax.tools.SequenceToGrammar"); 20 | System.err.println("\t -i sequence_file \t Sequence file from Thrax grammar extraction."); 21 | System.err.println("\t -o output_file \t Output grammar file name."); 22 | System.err.println(); 23 | } 24 | 25 | public static void main(String[] args) throws Exception { 26 | String input_file = null; 27 | String output_file = null; 28 | 29 | if (args.length < 4 || args[0].toLowerCase().equals("-h")) { 30 | usage(); 31 | System.exit(0); 32 | } 33 | for (int i = 0; i < args.length; i++) { 34 | if ("-i".equals(args[i]) && (i < args.length - 1)) { 35 | input_file = args[++i]; 36 | } else if ("-o".equals(args[i]) && (i < args.length - 1)) { 37 | output_file = args[++i]; 38 | } 39 | } 40 | if (input_file == null) { 41 | logger.severe("No input file specified."); 42 | usage(); 43 | System.exit(0); 44 | } 45 | if (output_file == null) { 46 | logger.severe("No output file specified."); 47 | usage(); 48 | System.exit(0); 49 | } 50 | 51 | Text rule_string = new Text(); 52 | Configuration config = new Configuration(); 53 | Path path = new Path(input_file); 54 | SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(config), path, config); 55 | 56 | BufferedWriter grammar_writer = FileManager.getWriter(output_file); 57 | long rule_count = 0; 58 | while (reader.next(rule_string)) { 59 | grammar_writer.write(rule_string.toString()); 60 | grammar_writer.newLine(); 61 | rule_count++; 62 | } 63 | reader.close(); 64 | grammar_writer.close(); 65 | System.err.println("Merged " + rule_count + " rules."); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/BackwardsCompatibility.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import edu.jhu.thrax.hadoop.features.annotation.SourceGivenTargetLexicalProbabilityFeature; 4 | import edu.jhu.thrax.hadoop.features.annotation.TargetGivenSourceLexicalProbabilityFeature; 5 | import edu.jhu.thrax.hadoop.features.annotation.UnalignedSourceCounterFeature; 6 | import edu.jhu.thrax.hadoop.features.annotation.UnalignedTargetCounterFeature; 7 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenTargetFeature; 8 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenSourceFeature; 9 | 10 | public class BackwardsCompatibility { 11 | 12 | public static String equivalent(String features) { 13 | features = features.replace("e2fphrase", SourcePhraseGivenTargetFeature.NAME); 14 | features = features.replace("f2ephrase", TargetPhraseGivenSourceFeature.NAME); 15 | 16 | features = features.replace("lexprob_tgs", TargetGivenSourceLexicalProbabilityFeature.NAME); 17 | features = features.replace("lexprob_sgt", SourceGivenTargetLexicalProbabilityFeature.NAME); 18 | 19 | features = 20 | features.replace("lexprob", TargetGivenSourceLexicalProbabilityFeature.NAME + " " 21 | + SourceGivenTargetLexicalProbabilityFeature.NAME); 22 | 23 | features = 24 | features.replace("unaligned-count", UnalignedSourceCounterFeature.NAME + " " 25 | + UnalignedTargetCounterFeature.NAME); 26 | 27 | return features; 28 | } 29 | 30 | public static String defaultLabelPolicy(boolean allow_nonlexical_x) { 31 | if (allow_nonlexical_x) { 32 | return "always"; 33 | } else { 34 | return "phrases"; 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- 
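OutputTool above runs the configured feature string through BackwardsCompatibility.equivalent before deciding which feature directories to collect, so conf files written for older Thrax versions keep working. The following sketch is not part of the repository; it only illustrates that rewriting on a sample legacy string (the replacement strings themselves come from the corresponding *.NAME constants and are not spelled out here).

import edu.jhu.thrax.util.BackwardsCompatibility;

public class BackwardsCompatibilityDemo {
  public static void main(String[] args) {
    // Legacy names from older conf files; "lexprob" and "unaligned-count"
    // each expand to a pair of direction-specific feature names.
    String legacy = "e2fphrase f2ephrase lexprob unaligned-count";
    // Prints the equivalent space-separated list of current feature names.
    System.out.println(BackwardsCompatibility.equivalent(legacy));
  }
}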
/src/edu/jhu/thrax/util/ConfFileParser.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import java.net.URI; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | import java.util.Scanner; 7 | 8 | import edu.jhu.thrax.util.amazon.AmazonConfigFileLoader; 9 | 10 | /** 11 | * This class parses conf files of a standard format. The '#' character is used 12 | * to indicate comments, and non-comment lines have a key and a value separated 13 | * by whitespace. 14 | */ 15 | public class ConfFileParser { 16 | 17 | public static Map parse(String confName) 18 | { 19 | Map opts = new HashMap(); 20 | Scanner scanner; 21 | 22 | try { 23 | URI configURI = new URI(confName); 24 | String scheme = configURI.getScheme(); 25 | if (scheme != null && (scheme.equalsIgnoreCase("s3n") || scheme.equalsIgnoreCase("s3"))) { 26 | scanner = new Scanner(AmazonConfigFileLoader.getConfigStream(configURI)); 27 | } 28 | else { 29 | scanner = new Scanner(DefaultConfigFileLoader.getConfigStream(configURI)); 30 | } 31 | } catch (Exception e) { 32 | throw new IllegalArgumentException(e.toString()); 33 | } 34 | 35 | while (scanner.hasNextLine()) { 36 | String line = scanner.nextLine(); 37 | // strip comments 38 | if (line.indexOf("#") != -1) { 39 | line = line.substring(0, line.indexOf("#")).trim(); 40 | } 41 | if ("".equals(line)) 42 | continue; 43 | 44 | String [] keyVal = line.split("\\s+", 2); 45 | if (keyVal.length > 1) 46 | opts.put(keyVal[0].trim(), keyVal[1].trim()); 47 | } 48 | scanner.close(); 49 | return opts; 50 | } 51 | } 52 | 53 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/CreateGlueGrammar.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.HashSet; 6 | 7 | import edu.jhu.thrax.util.io.LineReader; 8 | 9 | public class CreateGlueGrammar { 10 | private static HashSet nts; 11 | 12 | // [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 13 | // [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 14 | // [GOAL] ||| ||| ||| 0 15 | 16 | private static final String R_START = "[%1$s] ||| ||| ||| 0"; 17 | private static final String R_TWO = "[%1$s] ||| [%1$s,1] [%2$s,2] ||| [%1$s,1] [%2$s,2] ||| -1"; 18 | private static final String R_END = "[%1$s] ||| [%1$s,1] ||| [%1$s,1] ||| 0"; 19 | 20 | // [GOAL] ||| [X,1] ||| [X,1] ||| 0 21 | private static final String R_TOP = "[%1$s] ||| [%2$s,1] ||| [%2$s,1] ||| 0"; 22 | 23 | private static String GOAL = "GOAL"; 24 | 25 | public static void main(String[] argv) throws IOException { 26 | String grammar_file_name = null; 27 | if (argv.length > 0) grammar_file_name = argv[0]; 28 | if (argv.length > 1) GOAL = argv[1]; 29 | 30 | if (grammar_file_name == null) { 31 | System.err.println("No grammar specified."); 32 | System.exit(1); 33 | } 34 | File grammar_file = new File(grammar_file_name); 35 | if (!grammar_file.exists()) { 36 | System.err.println("Grammar file doesn't exist: " + grammar_file_name); 37 | System.exit(1); 38 | } 39 | 40 | nts = new HashSet(); 41 | if (grammar_file.isDirectory()) { 42 | Vocabulary.read(grammar_file_name + File.separator + "vocabulary"); 43 | for (int i = 0; i < Vocabulary.size(); ++i) { 44 | String token = Vocabulary.word(i); 45 | if (Vocabulary.nt(token)) nts.add(token.substring(1, token.length() - 1)); 46 | } 47 | } else { 48 | LineReader reader = new 
LineReader(grammar_file_name); 49 | while (reader.hasNext()) { 50 | String line = reader.next(); 51 | int lhsStart = line.indexOf("[") + 1; 52 | int lhsEnd = line.indexOf("]"); 53 | if (lhsStart < 1 || lhsEnd < 0) { 54 | System.err.printf("malformed rule: %s\n", line); 55 | continue; 56 | } 57 | String lhs = line.substring(lhsStart, lhsEnd); 58 | nts.add(lhs); 59 | } 60 | } 61 | 62 | System.out.println(String.format(R_START, GOAL)); 63 | for (String nt : nts) 64 | System.out.println(String.format(R_TWO, GOAL, nt)); 65 | System.out.println(String.format(R_END, GOAL)); 66 | for (String nt : nts) 67 | System.out.println(String.format(R_TOP, GOAL, nt)); 68 | 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/DefaultConfigFileLoader.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.net.URI; 8 | 9 | public class DefaultConfigFileLoader 10 | { 11 | public static InputStream getConfigStream(URI configURI) throws IOException 12 | { 13 | return new FileInputStream(new File(configURI.getPath())); 14 | } 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/ExternalizableToUtf8.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import java.io.IOException; 4 | 5 | public interface ExternalizableToUtf8 { 6 | 7 | public void readExternalUtf8(String fileName) throws IOException; 8 | 9 | public void writeExternalUtf8(String fileName) throws IOException; 10 | 11 | } 12 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/GrammarComparison.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileOutputStream; 7 | import java.io.IOException; 8 | import java.io.PrintStream; 9 | import java.util.HashSet; 10 | import java.util.Scanner; 11 | import java.util.Set; 12 | import java.util.zip.GZIPInputStream; 13 | 14 | 15 | public class GrammarComparison { 16 | 17 | private static final String SEPARATOR = "|||"; 18 | private static final String USAGE = "usage: GrammarComparison "; 19 | 20 | public static void main(String [] argv) 21 | { 22 | if (argv.length < 3) { 23 | System.err.println(USAGE); 24 | return; 25 | } 26 | 27 | String file1 = argv[0]; 28 | String file2 = argv[1]; 29 | String outputBase = argv[2]; 30 | 31 | try { 32 | HashSet grammar1 = getRulesFromFile(file1); 33 | HashSet alsoGrammar1 = getRulesFromFile(file1); 34 | HashSet grammar2 = getRulesFromFile(file2); 35 | 36 | Set smaller = grammar1.size() < grammar2.size() 37 | ? grammar1 38 | : grammar2; 39 | Set larger = smaller == grammar1 ? 
grammar2 : grammar1; 40 | 41 | Set intersection = new HashSet(); 42 | for (String s : smaller) { 43 | if (larger.contains(s)) 44 | intersection.add(s); 45 | } 46 | alsoGrammar1.removeAll(grammar2); 47 | grammar2.removeAll(grammar1); 48 | 49 | printRules(alsoGrammar1, outputBase + ".1"); 50 | printRules(grammar2, outputBase + ".2"); 51 | printRules(intersection, outputBase + ".both"); 52 | } 53 | catch (Exception e) { 54 | e.printStackTrace(); 55 | } 56 | return; 57 | } 58 | 59 | private static void printRules(Set rules, String filename) throws FileNotFoundException, SecurityException { 60 | PrintStream ps = new PrintStream(new FileOutputStream(filename)); 61 | for (String s : rules) 62 | ps.println(s); 63 | ps.close(); 64 | return; 65 | } 66 | 67 | private static HashSet getRulesFromFile(String filename) throws IOException 68 | { 69 | Scanner scanner; 70 | if (filename.endsWith(".gz")) { 71 | scanner = new Scanner(new GZIPInputStream(new FileInputStream(new File(filename))), "UTF-8"); 72 | } 73 | else { 74 | scanner = new Scanner(new File(filename), "UTF-8"); 75 | } 76 | 77 | HashSet ret = new HashSet(); 78 | while (scanner.hasNextLine()) { 79 | String line = scanner.nextLine(); 80 | String rule = line.substring(0, line.lastIndexOf(SEPARATOR)); 81 | ret.add(rule); 82 | } 83 | scanner.close(); 84 | return ret; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/HdfsUtils.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | package edu.jhu.thrax.util; 3 | 4 | import java.io.IOException; 5 | import java.io.ObjectInputStream; 6 | import java.io.ObjectOutputStream; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FSDataInputStream; 10 | import org.apache.hadoop.fs.FSDataOutputStream; 11 | import org.apache.hadoop.fs.FileSystem; 12 | import org.apache.hadoop.fs.Path; 13 | 14 | public class HdfsUtils { 15 | 16 | private HdfsUtils() {}; 17 | 18 | public static void writeObjectToFs(Configuration conf, E object, Path outPath) throws IOException { 19 | FileSystem hdfs = FileSystem.get(conf); 20 | 21 | ObjectOutputStream oos = null; 22 | try { 23 | FSDataOutputStream out = hdfs.create(outPath); 24 | oos = new ObjectOutputStream(out); 25 | oos.writeObject(object); 26 | } finally { 27 | if (oos != null) { 28 | oos.close(); 29 | } 30 | } 31 | } 32 | 33 | public static E readObjectFromFs(Configuration conf, Path inPath) throws IOException,ClassNotFoundException { 34 | FileSystem hdfs = FileSystem.get(conf); 35 | 36 | ObjectInputStream ois = null; 37 | try { 38 | FSDataInputStream in = hdfs.open(inPath); 39 | ois = new ObjectInputStream(in); 40 | @SuppressWarnings("unchecked") 41 | E object = (E) ois.readObject(); 42 | return object; 43 | } finally { 44 | if (ois != null) { 45 | ois.close(); 46 | } 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/Intersect.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileOutputStream; 6 | import java.io.IOException; 7 | import java.io.PrintStream; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.Scanner; 11 | import java.util.zip.GZIPInputStream; 12 | 13 | public class Intersect 14 
| { 15 | private static HashMap> rules; 16 | private static boolean ignoreNTs; 17 | public static void main(String [] argv) throws Exception 18 | { 19 | String file1; 20 | String file2; 21 | String outputPrefix; 22 | if (argv[0].equals("-X")) { 23 | file1 = argv[1]; 24 | file2 = argv[2]; 25 | outputPrefix = argv[3]; 26 | ignoreNTs = true; 27 | } 28 | else { 29 | file1 = argv[0]; 30 | file2 = argv[1]; 31 | outputPrefix = argv[2]; 32 | ignoreNTs = false; 33 | } 34 | getRulesFromFile(file1); 35 | 36 | Scanner scanner; 37 | if (file2.endsWith(".gz")) 38 | scanner = new Scanner(new GZIPInputStream(new FileInputStream(new File(file2))), "UTF-8"); 39 | else 40 | scanner = new Scanner(new File(file2), "UTF-8"); 41 | PrintStream firstGrammar = new PrintStream(new FileOutputStream(outputPrefix + ".1")); 42 | PrintStream secondGrammar = new PrintStream(new FileOutputStream(outputPrefix + ".2")); 43 | while (scanner.hasNextLine()) { 44 | String s = scanner.nextLine(); 45 | String r = repr(s); 46 | if (rules.containsKey(r)) { 47 | secondGrammar.println(s); 48 | for (String x : rules.get(r)) 49 | firstGrammar.println(x); 50 | rules.get(r).clear(); 51 | } 52 | } 53 | scanner.close(); 54 | firstGrammar.close(); 55 | secondGrammar.close(); 56 | return; 57 | } 58 | 59 | private static String repr(String s) 60 | { 61 | String r = s.substring(0, s.lastIndexOf("|||")); 62 | if (ignoreNTs) 63 | r = r.replaceAll("\\[[^]]+?\\]", "[X]"); 64 | return r; 65 | } 66 | 67 | private static void getRulesFromFile(String filename) throws IOException 68 | { 69 | rules = new HashMap>(); 70 | Scanner scanner; 71 | if (filename.endsWith(".gz")) { 72 | scanner = new Scanner(new GZIPInputStream(new FileInputStream(new File(filename))), "UTF-8"); 73 | } 74 | else { 75 | scanner = new Scanner(new File(filename), "UTF-8"); 76 | } 77 | while (scanner.hasNextLine()) { 78 | String s = scanner.nextLine(); 79 | String r = repr(s); 80 | if (rules.containsKey(r)) 81 | rules.get(r).add(s); 82 | else { 83 | ArrayList al = new ArrayList(); 84 | al.add(s); 85 | rules.put(r, al); 86 | } 87 | } 88 | scanner.close(); 89 | return; 90 | } 91 | } 92 | 93 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/MalformedInput.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | public enum MalformedInput 4 | { 5 | NOT_ENOUGH_FIELDS, 6 | EMPTY_SENTENCE, 7 | MALFORMED_PARSE, 8 | EMPTY_ALIGNMENT, 9 | INCONSISTENT_ALIGNMENT, 10 | UNKNOWN 11 | } 12 | 13 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/MalformedInput.properties: -------------------------------------------------------------------------------- 1 | CounterGroupName=Malformed Inputs 2 | NOT_ENOUGH_FIELDS.name=Not enough fields 3 | EMPTY_SENTENCE.name=Empty sentences 4 | MALFORMED_PARSE.name=Malformed parses 5 | EMPTY_ALIGNMENT.name=Empty alignments 6 | INCONSISTENT_ALIGNMENT.name=Inconsistent alignments 7 | UNKNOWN.name=Unknown errors 8 | 9 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/NegLogMath.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | public class NegLogMath { 4 | 5 | // Number of entries in the table. 6 | private static final int LOG_ADD_TABLE_SIZE = 640000; 7 | // Smallest value for nlog_a - nlog_b. 
8 | private static final float LOG_ADD_MIN = -64.0f; 9 | private static final float AS_GOOD_AS_ZERO = 1e-10f; 10 | private static final float logAddInc = -LOG_ADD_MIN / LOG_ADD_TABLE_SIZE; 11 | private static final float invLogAddInc = LOG_ADD_TABLE_SIZE / -LOG_ADD_MIN; 12 | private static final float[] logAddTable = new float[LOG_ADD_TABLE_SIZE + 1]; 13 | 14 | static { 15 | for (int i = 0; i <= LOG_ADD_TABLE_SIZE; i++) { 16 | logAddTable[i] = (float) -Math.log1p(Math.exp((i * logAddInc) + LOG_ADD_MIN)); 17 | } 18 | } 19 | 20 | public static float logAdd(float nlog_a, float nlog_b) { 21 | if (nlog_b < nlog_a) { 22 | float temp = nlog_a; 23 | nlog_a = nlog_b; 24 | nlog_b = temp; 25 | } 26 | float neg_diff = (nlog_a - nlog_b) - LOG_ADD_MIN; 27 | if (neg_diff < AS_GOOD_AS_ZERO) { 28 | return nlog_a; 29 | } 30 | return nlog_a + logAddTable[(int) (neg_diff * invLogAddInc)]; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/SequenceFileCreator.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import java.net.URI; 4 | import java.util.Scanner; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.SequenceFile; 11 | import org.apache.hadoop.io.Text; 12 | 13 | public class SequenceFileCreator 14 | { 15 | public static void main(String [] argv) throws Exception 16 | { 17 | LongWritable k = new LongWritable(); 18 | Text v = new Text(); 19 | 20 | URI uri = URI.create(argv[0]); 21 | Configuration conf = new Configuration(); 22 | FileSystem fs = FileSystem.get(uri, conf); 23 | Path path = new Path(argv[0]); 24 | SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path, LongWritable.class, Text.class); 25 | 26 | long current = 0; 27 | Scanner scanner = new Scanner(System.in, "UTF-8"); 28 | while (scanner.hasNextLine()) { 29 | String line = scanner.nextLine(); 30 | k.set(current); 31 | v.set(line); 32 | writer.append(k, v); 33 | current++; 34 | } 35 | scanner.close(); 36 | writer.close(); 37 | return; 38 | } 39 | } 40 | 41 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/amazon/AmazonConfigFileLoader.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.amazon; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.net.URI; 6 | 7 | import com.amazonaws.auth.PropertiesCredentials; 8 | import com.amazonaws.services.s3.AmazonS3; 9 | import com.amazonaws.services.s3.AmazonS3Client; 10 | import com.amazonaws.services.s3.model.GetObjectRequest; 11 | 12 | public class AmazonConfigFileLoader { 13 | protected static final String CRED_PROPS = "AwsCredentials.properties"; 14 | 15 | public static InputStream getConfigStream(URI configURI) throws IOException { 16 | InputStream resStream = AmazonConfigFileLoader.class.getResourceAsStream(CRED_PROPS); 17 | 18 | if (resStream == null) { 19 | resStream = AmazonConfigFileLoader.class.getResourceAsStream("/" + CRED_PROPS); 20 | } 21 | 22 | if (resStream == null) { 23 | throw new IllegalArgumentException("Could not locate " + CRED_PROPS); 24 | } 25 | 26 | AmazonS3 s3 = new AmazonS3Client(new PropertiesCredentials(resStream)); 27 | return s3.getObject( 28 | new GetObjectRequest(configURI.getHost(), 
configURI.getPath().replaceFirst("/+", ""))) 29 | .getObjectContent(); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/exceptions/ConfigurationException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.exceptions; 2 | 3 | public class ConfigurationException extends Exception 4 | { 5 | private static final long serialVersionUID = 3040L; 6 | 7 | public ConfigurationException(String message) 8 | { 9 | super(message); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/exceptions/EmptyAlignmentException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.exceptions; 2 | 3 | public class EmptyAlignmentException extends MalformedInputException 4 | { 5 | private static final long serialVersionUID = 4556L; 6 | } 7 | 8 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/exceptions/EmptySentenceException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.exceptions; 2 | 3 | public class EmptySentenceException extends MalformedInputException 4 | { 5 | private static final long serialVersionUID = 8132L; 6 | } 7 | 8 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/exceptions/InconsistentAlignmentException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.exceptions; 2 | 3 | public class InconsistentAlignmentException extends MalformedInputException 4 | { 5 | private static final long serialVersionUID = 33L; 6 | 7 | public InconsistentAlignmentException(String alignment) 8 | { 9 | super(alignment); 10 | } 11 | } 12 | 13 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/exceptions/MalformedInputException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.exceptions; 2 | 3 | public class MalformedInputException extends Exception 4 | { 5 | private static final long serialVersionUID = 5544L; 6 | 7 | public MalformedInputException() 8 | { 9 | super(); 10 | } 11 | 12 | public MalformedInputException(String input) 13 | { 14 | super(input); 15 | } 16 | } 17 | 18 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/exceptions/MalformedParseException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.exceptions; 2 | 3 | public class MalformedParseException extends MalformedInputException 4 | { 5 | private static final long serialVersionUID = 1095L; 6 | 7 | public MalformedParseException(String parse) 8 | { 9 | super(parse); 10 | } 11 | } 12 | 13 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/exceptions/NotEnoughFieldsException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.exceptions; 2 | 3 | public class NotEnoughFieldsException extends MalformedInputException 4 | { 5 | private static final long serialVersionUID = 9988L; 6 | } 7 | 8 | -------------------------------------------------------------------------------- 
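The tool classes above (ExtractionTool, FeatureTool, OutputTool, and the two word lexical probability tools) all follow the same pattern: parse a conf file with ConfFileParser, copy every option into the Hadoop Configuration under the "thrax." prefix, then build and submit a Job. The sketch below is not part of the repository; it writes a small conf file in the format ConfFileParser accepts ('#' starts a comment, key and value are separated by whitespace) and prints the resulting keys. The class name, paths, and values are purely illustrative.

import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;

import edu.jhu.thrax.util.ConfFileParser;

public class ConfFileParserDemo {
  public static void main(String[] args) throws Exception {
    // Write a throwaway conf file: '#' comments, whitespace-separated key/value.
    Path conf = Files.createTempFile("thrax-demo", ".conf");
    try (PrintWriter out = new PrintWriter(conf.toFile(), "UTF-8")) {
      out.println("# placeholder paths; inline comments are stripped by the parser");
      out.println("input-file  /path/to/corpus.unified");
      out.println("work-dir    /path/to/thrax-work");
    }
    // The tools copy each entry into the Hadoop Configuration as "thrax." + key.
    Map<String, String> options = ConfFileParser.parse(conf.toString());
    for (Map.Entry<String, String> e : options.entrySet())
      System.out.println("thrax." + e.getKey() + " = " + e.getValue());
  }
}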
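NegLogMath above adds two probabilities stored as negative log values, using a precomputed table of -log1p(exp(x)) over x in [-64, 0] instead of calling log and exp on every addition. The identity it tabulates is -log(a + b) = nlog_a - log1p(exp(nlog_a - nlog_b)) for nlog_a <= nlog_b. A small check, not part of the repository, comparing the table lookup against the exact value:

import edu.jhu.thrax.util.NegLogMath;

public class NegLogMathDemo {
  public static void main(String[] args) {
    double a = 0.3, b = 0.05;                       // probabilities to add
    float nlogA = (float) -Math.log(a);
    float nlogB = (float) -Math.log(b);
    float table = NegLogMath.logAdd(nlogA, nlogB);  // table-driven -log(a + b)
    float exact = (float) -Math.log(a + b);         // direct computation
    System.out.printf("table=%.6f exact=%.6f%n", table, exact);
  }
}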
/src/edu/jhu/thrax/util/io/Reader.java: -------------------------------------------------------------------------------- 1 | /* This file is part of the Joshua Machine Translation System. 2 | * 3 | * Joshua is free software; you can redistribute it and/or modify 4 | * it under the terms of the GNU Lesser General Public License as 5 | * published by the Free Software Foundation; either version 2.1 6 | * of the License, or (at your option) any later version. 7 | * 8 | * This library is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 | * Lesser General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU Lesser General Public 14 | * License along with this library; if not, write to the Free 15 | * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, 16 | * MA 02111-1307 USA 17 | */ 18 | package edu.jhu.thrax.util.io; 19 | 20 | import java.io.IOException; 21 | import java.util.Iterator; 22 | 23 | /** 24 | * Common interface for Reader type objects. 25 | * 26 | * @author wren ng thornton 27 | * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $ 28 | */ 29 | public interface Reader extends Iterable, Iterator { 30 | 31 | /** Close the reader, freeing all resources. */ 32 | void close() throws IOException; 33 | 34 | /** Determine if the reader is ready to read a line. */ 35 | boolean ready() throws IOException; 36 | 37 | /** Read a "line" and return an object representing it. */ 38 | E readLine() throws IOException; 39 | } 40 | -------------------------------------------------------------------------------- /test/edu/jhu/thrax/extraction/SAMTLabelerTest.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.extraction; 2 | 3 | import org.testng.Assert; 4 | import org.testng.annotations.Test; 5 | 6 | import edu.jhu.thrax.util.Vocabulary; 7 | 8 | public class SAMTLabelerTest { 9 | 10 | private final int defaultLabel = Vocabulary.id("X"); 11 | 12 | @Test 13 | public void getLabel_MalformedTree_isDefault() { 14 | SAMTLabeler labeler = 15 | new SAMTLabeler("(A b))", true, true, true, true, "top", defaultLabel); 16 | Assert.assertEquals(labeler.getLabel(0, 1), defaultLabel); 17 | } 18 | 19 | @Test 20 | public void getLabel_SpanOutOfBounds_isDefault() { 21 | SAMTLabeler labeler = new SAMTLabeler("(A b)", true, true, true, true, "top", defaultLabel); 22 | Assert.assertEquals(labeler.getLabel(0, 3), defaultLabel); 23 | Assert.assertEquals(labeler.getLabel(-2, 1), defaultLabel); 24 | } 25 | 26 | @Test 27 | public void getLabel_UnaryChain_Top() { 28 | SAMTLabeler labeler = new SAMTLabeler("(A (B c))", true, true, true, true, "top", defaultLabel); 29 | Assert.assertEquals(labeler.getLabel(0, 1), "A"); 30 | } 31 | 32 | @Test 33 | public void getLabel_UnaryChain_Bottom() { 34 | SAMTLabeler labeler = new SAMTLabeler("(A (B c))", true, true, true, true, "bottom", defaultLabel); 35 | Assert.assertEquals(labeler.getLabel(0, 1), "B"); 36 | } 37 | 38 | @Test 39 | public void getLabel_UnaryChain_All() { 40 | SAMTLabeler labeler = new SAMTLabeler("(A (B c))", true, true, true, true, "all", defaultLabel); 41 | Assert.assertEquals(labeler.getLabel(0, 1), "A:B"); 42 | } 43 | 44 | @Test 45 | public void getLabel_NoConst_returnCat() { 46 | SAMTLabeler labeler = new SAMTLabeler("(A (B c) (D e))", false, true, true, true, "all", defaultLabel); 47 | 
Assert.assertEquals(labeler.getLabel(0, 2), "B+D"); 48 | } 49 | 50 | @Test 51 | public void getLabel_NoConstCat_noCCG_returnDefault() { 52 | SAMTLabeler labeler = new SAMTLabeler("(A (B c) (D e))", false, true, false, true, "all", defaultLabel); 53 | Assert.assertEquals(labeler.getLabel(0, 2), defaultLabel); 54 | } 55 | 56 | @Test 57 | public void getLabel_NoConstCat_returnCCG() { 58 | SAMTLabeler labeler = new SAMTLabeler("(A (B c) (D e))", false, true, false, true, "all", defaultLabel); 59 | Assert.assertEquals(labeler.getLabel(0, 1), "A/D"); 60 | Assert.assertEquals(labeler.getLabel(1, 2), "A\\B"); 61 | } 62 | 63 | @Test 64 | public void getLabel_NoConstCatCCG_returnDoubleCat() { 65 | SAMTLabeler labeler = 66 | new SAMTLabeler("(A (B c) (D e) (F g))", false, false, false, true, "all", defaultLabel); 67 | Assert.assertEquals(labeler.getLabel(0, 3), "B+D+F"); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /test/edu/jhu/thrax/syntax/ParseTreeTest.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.syntax; 2 | 3 | import java.util.Iterator; 4 | import java.util.List; 5 | 6 | import org.testng.Assert; 7 | import org.testng.annotations.Test; 8 | 9 | public class ParseTreeTest 10 | { 11 | @Test 12 | public void numLeaves_Leaf_isOne() 13 | { 14 | ParseTree pt = ParseTree.fromPennFormat("a"); 15 | Assert.assertEquals(pt.numLeaves(), 1); 16 | } 17 | 18 | @Test 19 | public void numNodes_Leaf_isOne() 20 | { 21 | ParseTree pt = ParseTree.fromPennFormat("a"); 22 | Assert.assertEquals(pt.numNodes(), 1); 23 | } 24 | 25 | @Test 26 | public void numLeaves_Tree() 27 | { 28 | ParseTree pt = ParseTree.fromPennFormat("(A (B c d))"); 29 | Assert.assertEquals(pt.numLeaves(), 2); 30 | } 31 | 32 | @Test 33 | public void numNodes_Tree() 34 | { 35 | ParseTree pt = ParseTree.fromPennFormat("(A (B c d))"); 36 | Assert.assertEquals(pt.numNodes(), 4); 37 | } 38 | 39 | @Test 40 | void internalNodesWithSpan_Single() 41 | { 42 | ParseTree pt = ParseTree.fromPennFormat("(A (B c d) e)"); 43 | List list = pt.internalNodesWithSpan(0, 2); 44 | Assert.assertEquals(list.size(), 1); 45 | ParseTree.Node node = list.get(0); 46 | Assert.assertEquals(node.label(), "B"); 47 | Assert.assertEquals(node.spanStart(), 0); 48 | Assert.assertEquals(node.spanEnd(), 2); 49 | Assert.assertFalse(node.numChildren() == 0); 50 | } 51 | 52 | @Test 53 | public void internalNodesWithSpan_unaryChain() 54 | { 55 | ParseTree pt = ParseTree.fromPennFormat("(A (B c))"); 56 | List list = pt.internalNodesWithSpan(0, 1); 57 | Assert.assertEquals(list.size(), 2); 58 | ParseTree.Node node = list.get(0); 59 | Assert.assertEquals(node.label(), "A"); 60 | Assert.assertEquals(node.spanStart(), 0); 61 | Assert.assertEquals(node.spanEnd(), 1); 62 | Assert.assertFalse(node.numChildren() == 0); 63 | node = list.get(1); 64 | Assert.assertEquals(node.label(), "B"); 65 | Assert.assertEquals(node.spanStart(), 0); 66 | Assert.assertEquals(node.spanEnd(), 1); 67 | Assert.assertFalse(node.numChildren() == 0); 68 | } 69 | 70 | @Test 71 | public void leaf_ChildIterator_isEmpty() 72 | { 73 | ParseTree pt = ParseTree.fromPennFormat("a"); 74 | ParseTree.Node node = pt.root(); 75 | Assert.assertFalse(node.children().hasNext()); 76 | } 77 | 78 | @Test 79 | public void tree_ChildIterator() 80 | { 81 | ParseTree pt = ParseTree.fromPennFormat("(A b c)"); 82 | ParseTree.Node node = pt.root(); 83 | Iterator iter = node.children(); 84 | Assert.assertTrue(iter.hasNext()); 85 | 
node = iter.next(); 86 | Assert.assertEquals(node.label(), "b"); 87 | Assert.assertEquals(node.spanStart(), 0); 88 | Assert.assertEquals(node.spanEnd(), 1); 89 | Assert.assertFalse(node.children().hasNext()); 90 | Assert.assertTrue(iter.hasNext()); 91 | node = iter.next(); 92 | Assert.assertEquals(node.label(), "c"); 93 | Assert.assertEquals(node.spanStart(), 1); 94 | Assert.assertEquals(node.spanEnd(), 2); 95 | Assert.assertFalse(node.children().hasNext()); 96 | Assert.assertFalse(iter.hasNext()); 97 | } 98 | } 99 | 100 | --------------------------------------------------------------------------------
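The ParseTreeTest cases above exercise edu.jhu.thrax.syntax.ParseTree: a parse read from Penn-style bracketing can be queried for leaf and node counts and for the internal nodes covering a word span. The standalone sketch below is not part of the test suite; it mirrors those assertions, and the List element type is assumed to be ParseTree.Node, as in the tests.

import java.util.List;

import edu.jhu.thrax.syntax.ParseTree;

public class ParseTreeDemo {
  public static void main(String[] args) {
    // "(A (B c d))" has two leaves (c, d) and four nodes in total (A, B, c, d).
    ParseTree pt = ParseTree.fromPennFormat("(A (B c d))");
    System.out.println(pt.numLeaves() + " leaves, " + pt.numNodes() + " nodes");

    // Internal nodes spanning words [0, 2) of "(A (B c d) e)"; just B here.
    ParseTree tree = ParseTree.fromPennFormat("(A (B c d) e)");
    List<ParseTree.Node> nodes = tree.internalNodesWithSpan(0, 2);
    for (ParseTree.Node n : nodes)
      System.out.println(n.label() + " covers [" + n.spanStart() + ", " + n.spanEnd() + ")");
  }
}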