├── .classpath ├── .gitignore ├── .project ├── AwsCredentials.properties ├── LICENSE.txt ├── README ├── build.xml ├── example ├── counts │ ├── corpus.a │ ├── corpus.en │ ├── corpus.es │ └── thrax-phrase.conf ├── europarl.unified.1 ├── hiero.conf ├── nist09.unified.1 └── samt.conf ├── lib ├── aws-java-sdk-1.1.3.jar ├── commons-lang3-3.1.jar ├── hadoop-common-2.5.2.jar ├── hadoop-mapreduce-client-core-2.5.2.jar ├── jerboa.jar └── testng-5.8-jdk15.jar ├── scripts ├── berant_to_reference.py ├── create_glue_grammar.sh ├── filter_rules.sh └── run_on_amazon.sh ├── src └── edu │ └── jhu │ └── thrax │ ├── Thrax.java │ ├── datatypes │ ├── AlignedSentencePair.java │ ├── Alignment.java │ ├── ArrayAlignment.java │ ├── HierarchicalRule.java │ ├── IntPair.java │ └── PhrasePair.java │ ├── distributional │ ├── ContextPhrase.java │ ├── ContextPhraseExtractor.java │ ├── FeatureClass.java │ ├── FeatureEncoder.java │ ├── FeatureSet.java │ └── FeatureTypes.java │ ├── extraction │ ├── HierarchicalRuleExtractor.java │ ├── HieroLabeler.java │ ├── LabelCache.java │ ├── Labeling.java │ ├── ManualSpanLabeler.java │ ├── SAMTLabeler.java │ └── SpanLabeler.java │ ├── hadoop │ ├── comparators │ │ ├── FieldComparator.java │ │ ├── PrimitiveArrayMarginalComparator.java │ │ └── TextMarginalComparator.java │ ├── datatypes │ │ ├── AlignedRuleWritable.java │ │ ├── AlignmentWritable.java │ │ ├── Annotation.java │ │ ├── FeatureMap.java │ │ ├── FeaturePair.java │ │ ├── FeatureValue.java │ │ ├── IntPair.java │ │ ├── PrimitiveUtils.java │ │ ├── RuleWritable.java │ │ └── TextPair.java │ ├── distributional │ │ ├── CommonLSH.java │ │ ├── ContextWritable.java │ │ ├── DistributionalContextCombiner.java │ │ ├── DistributionalContextMapper.java │ │ ├── DistributionalContextReducer.java │ │ └── SignatureWritable.java │ ├── extraction │ │ ├── ExtractionCombiner.java │ │ ├── ExtractionMapper.java │ │ ├── ExtractionReducer.java │ │ ├── HierarchicalRuleWritableExtractor.java │ │ ├── RuleWritableExtractor.java │ │ └── RuleWritableExtractorFactory.java │ ├── features │ │ ├── AbstractnessFeature.java │ │ ├── AdjacentNonTerminalsFeature.java │ │ ├── CharacterCompressionRatioFeature.java │ │ ├── CharacterCountDifferenceFeature.java │ │ ├── ConsumeSourceTerminalsFeature.java │ │ ├── Feature.java │ │ ├── GlueRuleFeature.java │ │ ├── IdentityFeature.java │ │ ├── LexicalityFeature.java │ │ ├── MonotonicFeature.java │ │ ├── PhrasePenaltyFeature.java │ │ ├── ProduceTargetTerminalsFeature.java │ │ ├── SimpleFeature.java │ │ ├── SimpleFeatureFactory.java │ │ ├── SourceWordCounterFeature.java │ │ ├── TargetWordCounterFeature.java │ │ ├── WordCompressionRatioFeature.java │ │ ├── WordCountDifferenceFeature.java │ │ ├── WordLengthDifferenceFeature.java │ │ ├── WordLexicalProbabilityCalculator.java │ │ ├── XRuleFeature.java │ │ ├── annotation │ │ │ ├── AlignmentFeature.java │ │ │ ├── AnnotationFeature.java │ │ │ ├── AnnotationFeatureFactory.java │ │ │ ├── AnnotationFeatureJob.java │ │ │ ├── AnnotationPassthroughFeature.java │ │ │ ├── AnnotationReducer.java │ │ │ ├── CountFeature.java │ │ │ ├── LogCountFeature.java │ │ │ ├── RarityPenaltyFeature.java │ │ │ ├── SourceGivenTargetLexicalProbabilityFeature.java │ │ │ ├── TargetGivenSourceLexicalProbabilityFeature.java │ │ │ ├── UnalignedSourceCounterFeature.java │ │ │ └── UnalignedTargetCounterFeature.java │ │ ├── mapred │ │ │ ├── CountOfRuleCountsEstimationJob.java │ │ │ ├── GoodTuringSmoothedSourcePhraseGivenTargetFeature.java │ │ │ ├── GoodTuringSmoothedTargetPhraseGivenSourceFeature.java │ │ │ ├── 
LhsGivenSourcePhraseFeature.java │ │ │ ├── LhsGivenTargetPhraseFeature.java │ │ │ ├── MapReduceFeature.java │ │ │ ├── MapReduceFeatureFactory.java │ │ │ ├── SourceCountFeature.java │ │ │ ├── SourcePhraseGivenLHSFeature.java │ │ │ ├── SourcePhraseGivenTargetFeature.java │ │ │ ├── SourcePhraseGivenTargetandLHSFeature.java │ │ │ ├── TargetCountFeature.java │ │ │ ├── TargetPhraseGivenLHSFeature.java │ │ │ ├── TargetPhraseGivenSourceFeature.java │ │ │ ├── TargetPhraseGivenSourceandLHSFeature.java │ │ │ └── coc │ │ │ │ ├── CountOfCountsEstimator.java │ │ │ │ └── GoodTuringSmoother.java │ │ └── pivot │ │ │ ├── NonAggregatingPivotedFeature.java │ │ │ ├── PivotedAnnotationFeature.java │ │ │ ├── PivotedFeature.java │ │ │ ├── PivotedFeatureFactory.java │ │ │ ├── PivotedLexicalSourceGivenTargetFeature.java │ │ │ ├── PivotedLexicalTargetGivenSourceFeature.java │ │ │ ├── PivotedLhsGivenSourcePhraseFeature.java │ │ │ ├── PivotedLhsGivenTargetPhraseFeature.java │ │ │ ├── PivotedNegLogProbFeature.java │ │ │ ├── PivotedRarityPenaltyFeature.java │ │ │ ├── PivotedSourcePhraseGivenLHSFeature.java │ │ │ ├── PivotedSourcePhraseGivenTargetAndLHSFeature.java │ │ │ ├── PivotedSourcePhraseGivenTargetFeature.java │ │ │ ├── PivotedTargetPhraseGivenLHSFeature.java │ │ │ ├── PivotedTargetPhraseGivenSourceAndLHSFeature.java │ │ │ └── PivotedTargetPhraseGivenSourceFeature.java │ ├── jobs │ │ ├── DefaultValues.java │ │ ├── DistributionalContextExtractionJob.java │ │ ├── DistributionalContextSortingJob.java │ │ ├── ExtractionJob.java │ │ ├── FeatureCollectionJob.java │ │ ├── JobState.java │ │ ├── OutputJob.java │ │ ├── ParaphraseAggregationJob.java │ │ ├── ParaphrasePivotingJob.java │ │ ├── Scheduler.java │ │ ├── SchedulerException.java │ │ ├── SourceWordGivenTargetWordProbabilityJob.java │ │ ├── TargetWordGivenSourceWordProbabilityJob.java │ │ ├── ThraxJob.java │ │ ├── VocabularyJob.java │ │ └── WordLexprobJob.java │ ├── output │ │ └── OutputReducer.java │ ├── paraphrasing │ │ ├── AggregationCombiner.java │ │ ├── AggregationMapper.java │ │ ├── AggregationReducer.java │ │ ├── FeatureCollectionReducer.java │ │ ├── PivotingMapper.java │ │ └── PivotingReducer.java │ └── tools │ │ ├── ExtractionTool.java │ │ ├── FeatureTool.java │ │ ├── OutputTool.java │ │ ├── SourceWordGivenTargetWordProbabilityTool.java │ │ └── TargetWordGivenSourceWordProbabilityTool.java │ ├── lexprob │ ├── HashMapLexprobTable.java │ ├── LexicalProbabilityTable.java │ ├── LexprobTest.java │ ├── SequenceFileLexprobTable.java │ ├── TableEntry.java │ └── TrieLexprobTable.java │ ├── syntax │ ├── LatticeArray.java │ ├── ParseLattice.java │ └── ParseTree.java │ ├── tools │ ├── ExtractPropbankRules.java │ ├── JudgeParaphrases.java │ ├── ParaphraseCoverage.java │ ├── ParaphraseIntersect.java │ ├── ParaphraseOverlap.java │ ├── ParaphraseScore.java │ ├── ParaphraseWordNet.java │ ├── SequenceToGrammar.java │ ├── SequenceToSignatures.java │ └── SplitAndFilter.java │ └── util │ ├── BackwardsCompatibility.java │ ├── ConfFileParser.java │ ├── CreateGlueGrammar.java │ ├── DefaultConfigFileLoader.java │ ├── ExternalizableToUtf8.java │ ├── FormatUtils.java │ ├── GrammarComparison.java │ ├── HdfsUtils.java │ ├── Intersect.java │ ├── MalformedInput.java │ ├── MalformedInput.properties │ ├── MurmurHash.java │ ├── NegLogMath.java │ ├── SequenceFileCreator.java │ ├── TestSetFilter.java │ ├── Vocabulary.java │ ├── amazon │ └── AmazonConfigFileLoader.java │ ├── exceptions │ ├── ConfigurationException.java │ ├── EmptyAlignmentException.java │ ├── EmptySentenceException.java │ ├── 
InconsistentAlignmentException.java │ ├── MalformedInputException.java │ ├── MalformedParseException.java │ └── NotEnoughFieldsException.java │ └── io │ ├── InputUtilities.java │ ├── LineReader.java │ └── Reader.java ├── test └── edu │ └── jhu │ └── thrax │ ├── datatypes │ └── ArrayAlignmentTest.java │ ├── extraction │ └── SAMTLabelerTest.java │ ├── hadoop │ └── features │ │ └── mapred │ │ └── coc │ │ └── CountOfCountsEstimatorTest.java │ ├── syntax │ └── ParseTreeTest.java │ └── util │ └── io │ └── InputUtilitiesTest.java └── testng.xml /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | test-output/ 3 | doc/ 4 | AwsCredentials.properties 5 | 6 | .DS_Store -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | Thrax 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /AwsCredentials.properties: -------------------------------------------------------------------------------- 1 | accessKey= 2 | secretKey= 3 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-13 the Thrax team 2 | Jonny Weese 3 | Juri Ganitkevitch 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Thrax uses Apache hadoop (an open-source implementation of MapReduce) to 2 | efficiently extract a synchronous context-free grammar translation model 3 | for use in modern machine translation systems. 4 | 5 | Thrax currently has support for both Hiero-style grammars (with a single 6 | non-terminal symbol) and SAMT-style grammars (where non-terminal symbols are 7 | calculated by projecting onto the span from a target-side parse tree). 
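For orientation, a rule in a Hiero-style grammar uses the single nonterminal [X] and, in Joshua's grammar format, looks roughly like the following (an illustrative, made-up rule; the feature fields at the end depend on which features are selected in the configuration):

    [X] ||| [X,1] maison bleue ||| blue [X,1] house ||| <feature scores>

In a SAMT-style grammar the [X] labels are replaced by labels projected from the target-side parse, e.g. [NP], or composite labels such as [NP/NN] or [NP+VP] when no single constituent covers the span.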
8 | 9 | COMPILING: 10 | 11 | First, you need to set two environment variables: 12 | $HADOOP should point to the directory where Hadoop is installed. 13 | $AWS_SDK should point to the directory where the Amazon Web Services SDK 14 | is installed. 15 | 16 | To compile, type 17 | 18 | ant 19 | 20 | This will compile all classes and package them into a jar for use on a 21 | Hadoop cluster. 22 | 23 | At the end of the compilation, ant should report that the build was successful. 24 | 25 | RUNNING THRAX: 26 | Thrax can be invoked with 27 | 28 | hadoop jar $THRAX/bin/thrax.jar 29 | 30 | Some example configuration files have been included with this distribution: 31 | 32 | example/hiero.conf 33 | example/samt.conf 34 | 35 | COPYRIGHT AND LICENSE: 36 | Copyright (c) 2010-13 by the Thrax team: 37 | Jonny Weese 38 | Juri Ganitkevitch 39 | 40 | See LICENSE.txt (included with this distribution) for the complete terms. 41 | -------------------------------------------------------------------------------- /build.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /example/counts/thrax-phrase.conf: -------------------------------------------------------------------------------- 1 | # this is an example Thrax configuration file 2 | # <- this symbol indicates a comment 3 | # each line should be a key-value pair separated by whitespace 4 | 5 | ### 6 | ### GRAMMAR OPTIONS 7 | ### 8 | 9 | grammar hiero # or samt 10 | reverse false 11 | source-is-parsed false 12 | target-is-parsed false 13 | # default-nt X # X is the default anyway 14 | 15 | min-rule-count 1 16 | 17 | # the number of reducers 18 | reducers 16 19 | 20 | # Maximum length of initial phrase pairs. These are set to be shorter than 21 | # used by Hiero. 
22 | initial-phrase-length 5 23 | lex-source-words 5 24 | lex-target-words 5 25 | 26 | # maximum number of NTs in a rule 27 | arity 0 28 | 29 | # minimum number of aligned terminals in a rule 30 | lexicality 1 31 | 32 | # allow adjacent nonterminals on source side 33 | adjacent-nts false 34 | 35 | # allow unaligned words at boundaries of phrases 36 | loose true 37 | 38 | allow-abstract-rules false 39 | allow-nonlexical-x false 40 | allow-full-sentence-rules false 41 | 42 | nonlex-source-length 5 43 | nonlex-target-length 5 44 | nonlex-source-words 5 45 | nonlex-target-words 5 46 | 47 | allow-double-plus false 48 | 49 | rule-span-limit 12 50 | 51 | phrase-penalty 2.718 52 | 53 | # a whitespace seperated list of features 54 | # in this example, the features are phrase translation probability, 55 | # lexical probability, and phrase penalty 56 | # features phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count 57 | features e_given_f_phrase f_given_e_phrase e_given_f_lex f_given_e_lex rarity phrase-penalty alignment count 58 | 59 | # the only option and default later we will want to add formats for other decoders such as moses and 60 | # cdec, if they use other formats 61 | output-format joshua 62 | 63 | # label feature scores? each score will be output as name=score 64 | label-feature-scores false 65 | 66 | amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero 67 | amazon-jar s3://edu.jhu.cs.jonny/thrax.jar 68 | amazon-num-instances 15 69 | 70 | max-split-size 8388608 71 | 72 | # the format should be: 73 | # foreign sentence ||| english sentence ||| alignment 74 | # where the english is either parsed or not depending on whether you want 75 | # SAMT or you want Hiero. 76 | #input-file s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en 77 | input-file pipeline-es-en-phrase-_export_projects_mpost_language-packs_es-en_1.3/input-file 78 | -------------------------------------------------------------------------------- /example/europarl.unified.1: -------------------------------------------------------------------------------- 1 | declaro reanudado el período de sesiones del parlamento europeo , interrumpido el viernes 17 de diciembre pasado , y reitero a sus señorías mi deseo de que hayan tenido unas buenas vacaciones . ||| i declare resumed the session of the european parliament adjourned on friday 17 december 1999 , and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period . ||| 0-0 0-1 1-1 2-1 3-1 0-2 0-3 5-4 4-5 6-5 8-6 8-7 7-8 10-9 12-10 11-11 12-11 13-12 14-13 15-13 16-13 16-14 17-15 18-16 19-17 19-18 19-19 19-20 19-21 20-22 21-24 22-24 25-29 24-31 26-32 27-33 28-34 30-35 31-36 29-37 30-37 31-37 31-38 32-39 2 | -------------------------------------------------------------------------------- /example/nist09.unified.1: -------------------------------------------------------------------------------- 1 | اس ملک کا مغربی صحرائے راجھستان بھی مسلسل اپنے پانچ سال سے سخت خشک سالی کی لپیٹ میں ہے . ||| (TOP (S (NP (NP (NP (DT The) (NN country) (POS 's)) (JJ western) (NN desert) (NN state)) (PP (IN of) (NP (NNP Rajasthan)))) (VP (VBZ is) (ADVP (RB also)) (VP (VBG bracing) (PP (IN for) (NP (NP (PRP$ its) (JJ fifth) (JJ straight) (NN year)) (PP (IN of) (NP (NN drought))))))) (. 
.))) ||| 0-0 15-16 10-15 11-16 13-17 14-17 8-12 18-8 4-10 5-10 19-18 6-9 9-13 1-1 2-2 3-3 2 | -------------------------------------------------------------------------------- /lib/aws-java-sdk-1.1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/aws-java-sdk-1.1.3.jar -------------------------------------------------------------------------------- /lib/commons-lang3-3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/commons-lang3-3.1.jar -------------------------------------------------------------------------------- /lib/hadoop-common-2.5.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/hadoop-common-2.5.2.jar -------------------------------------------------------------------------------- /lib/hadoop-mapreduce-client-core-2.5.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/hadoop-mapreduce-client-core-2.5.2.jar -------------------------------------------------------------------------------- /lib/jerboa.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/jerboa.jar -------------------------------------------------------------------------------- /lib/testng-5.8-jdk15.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/testng-5.8-jdk15.jar -------------------------------------------------------------------------------- /scripts/berant_to_reference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, sys, codecs 4 | 5 | def main(): 6 | # 7 | # 8 | for line in sys.stdin: 9 | (source, target) = line.lstrip().rstrip().split("\t") 10 | (s_phr, s1, s2) = source[1:-1].split("::") 11 | (t_phr, t1, t2) = target[1:-1].split("::") 12 | if (s1[-2:] == t1[-2:]): 13 | t1 = "[1]" 14 | t2 = "[2]" 15 | else: 16 | t1 = "[2]" 17 | t2 = "[1]" 18 | s1 = "[1]" 19 | s2 = "[2]" 20 | print s1 + " " + s_phr + " " + s2 + " ||| " + t1 + " " + t_phr + " " + t2 21 | 22 | 23 | if __name__ == "__main__": 24 | main() 25 | 26 | 27 | -------------------------------------------------------------------------------- /scripts/create_glue_grammar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # this script just wraps a java call 3 | 4 | if [[ -z "$THRAX" ]] 5 | then 6 | THRAX="`basename $0`/.." 7 | fi 8 | 9 | java -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $1 10 | 11 | -------------------------------------------------------------------------------- /scripts/filter_rules.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if (($# < 1)) 4 | then 5 | cat << END_USAGE 6 | usage: filter_rules.sh [-v|-p|-f] [test set ...] 
7 | -v verbose mode 8 | -p parallel compatibility: print blank lines, don't buffer output 9 | -f fast mode: not as aggressive 10 | END_USAGE 11 | exit 1 12 | fi 13 | 14 | java -Dfile.encoding=utf8 -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.TestSetFilter $* 15 | 16 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/datatypes/AlignedSentencePair.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.datatypes; 2 | 3 | import java.util.Arrays; 4 | 5 | public class AlignedSentencePair { 6 | public final int[] source; 7 | public final int[] target; 8 | public final Alignment alignment; 9 | 10 | public AlignedSentencePair(int[] ss, int[] ts, Alignment a) { 11 | source = ss; 12 | target = ts; 13 | alignment = a; 14 | } 15 | 16 | public boolean equals(Object o) { 17 | if (o == this) return true; 18 | if (!(o instanceof AlignedSentencePair)) return false; 19 | AlignedSentencePair other = (AlignedSentencePair) o; 20 | return Arrays.equals(source, other.source) && Arrays.equals(target, other.target) 21 | && alignment.equals(other.alignment); 22 | } 23 | 24 | public int hashCode() { 25 | int result = 137; 26 | result = result * 67 + Arrays.hashCode(source); 27 | result = result * 67 + Arrays.hashCode(target); 28 | result = result * 67 + alignment.hashCode(); 29 | return result; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/datatypes/Alignment.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.datatypes; 2 | 3 | import java.util.Iterator; 4 | 5 | /** 6 | * This interface represents a word-level alignment of a sentence pair. 7 | */ 8 | public interface Alignment { 9 | 10 | public boolean sourceIndexIsAligned(int i); 11 | 12 | public boolean targetIndexIsAligned(int i); 13 | 14 | public int numTargetWordsAlignedTo(int i); 15 | 16 | public int numSourceWordsAlignedTo(int i); 17 | 18 | public Iterator targetIndicesAlignedTo(int i); 19 | 20 | public Iterator sourceIndicesAlignedTo(int i); 21 | 22 | public boolean consistentWith(int sourceLength, int targetLength); 23 | } 24 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/datatypes/IntPair.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.datatypes; 2 | 3 | import edu.jhu.thrax.util.FormatUtils; 4 | 5 | /** 6 | * A class that represents a pair of integers. 7 | */ 8 | public class IntPair implements Comparable { 9 | 10 | /** 11 | * The first integer of the pair ("car"). 12 | */ 13 | public final int fst; 14 | 15 | /** 16 | * The second integer of the pair ("cdr"). 17 | */ 18 | public final int snd; 19 | 20 | /** 21 | * Constructor that sets the two ints of the pair. 22 | * 23 | * @param a the first int of the pair 24 | * @param b the second int of the pair 25 | */ 26 | public IntPair(int a, int b) 27 | { 28 | fst = a; 29 | snd = b; 30 | } 31 | 32 | /** 33 | * Create a new IntPair that is the reverse of this pair; that is, puts 34 | * the second int first and the first int second. 35 | */ 36 | public IntPair reverse() 37 | { 38 | return new IntPair(snd, fst); 39 | } 40 | 41 | /** 42 | * Builds a pair from the type of String that you would see in Berkeley 43 | * aligner output. For example, the String "3-4" would yield the pair 44 | * (3,4). 
45 | * 46 | * @param s a string in Berkeley aligner format 47 | * @return a new IntPair representing that string 48 | */ 49 | public static IntPair fromHyphenatedString(String s) 50 | { 51 | String [] nums = FormatUtils.P_DASH.split(s); 52 | if (nums.length != 2) { 53 | return null; 54 | } 55 | return new IntPair(Integer.parseInt(nums[0]), Integer.parseInt(nums[1])); 56 | } 57 | 58 | public String toString() 59 | { 60 | return String.format("(%d,%d)", fst, snd); 61 | } 62 | 63 | public boolean equals(Object o) 64 | { 65 | if (o instanceof IntPair) { 66 | IntPair ip = (IntPair) o; 67 | return this.fst == ip.fst && this.snd == ip.snd; 68 | } 69 | return false; 70 | } 71 | 72 | public int compareTo(IntPair ip) 73 | { 74 | if (this.fst == ip.fst) { 75 | return this.snd - ip.snd; 76 | } 77 | return this.fst - ip.fst; 78 | } 79 | 80 | public int hashCode() 81 | { 82 | return fst * 37 + snd; 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/distributional/ContextPhrase.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.distributional; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.MapWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | public class ContextPhrase { 9 | 10 | private final Text phrase; 11 | 12 | private MapWritable features; 13 | 14 | public ContextPhrase(String phrase) { 15 | this.phrase = new Text(phrase); 16 | this.features = new MapWritable(); 17 | } 18 | 19 | public void addFeature(String feature_name) { 20 | addFeature(feature_name, 1); 21 | } 22 | 23 | public void addFeature(String feature_name, int feature_value) { 24 | Text feature_text = new Text(feature_name); 25 | Writable current_value = features.get(feature_text); 26 | if (current_value != null) 27 | features.put(feature_text, new IntWritable(((IntWritable) current_value).get() 28 | + feature_value)); 29 | else 30 | features.put(feature_text, new IntWritable(feature_value)); 31 | } 32 | 33 | public Text getPhrase() { 34 | return phrase; 35 | } 36 | 37 | public MapWritable getFeatures() { 38 | return features; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/distributional/FeatureClass.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.distributional; 2 | 3 | import edu.jhu.thrax.distributional.FeatureTypes.Label; 4 | import edu.jhu.thrax.distributional.FeatureTypes.Type; 5 | 6 | public class FeatureClass { 7 | public final Type type; 8 | public final Label label; 9 | public final int max_context; 10 | public final int max_gram; 11 | 12 | public FeatureClass(Type type, Label label) { 13 | this(type, label, -1, -1); 14 | } 15 | 16 | public FeatureClass(Type type, Label label, int max_context, int max_gram) { 17 | this.type = type; 18 | this.label = label; 19 | this.max_context = max_context; 20 | this.max_gram = max_gram; 21 | } 22 | } -------------------------------------------------------------------------------- /src/edu/jhu/thrax/distributional/FeatureEncoder.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.distributional; 2 | 3 | import edu.jhu.thrax.distributional.FeatureTypes.Directionality; 4 | import edu.jhu.thrax.distributional.FeatureTypes.Flavor; 5 | import edu.jhu.thrax.distributional.FeatureTypes.Label; 
6 | import edu.jhu.thrax.distributional.FeatureTypes.Type; 7 | 8 | public class FeatureEncoder { 9 | 10 | public static long encode(Type type, Label label, Flavor flavor, Directionality directionality) { 11 | return 0; 12 | } 13 | 14 | public static String type(long coded) { 15 | int feature_code = (int) (coded >> 32); 16 | 17 | return new Integer(feature_code).toString(); 18 | } 19 | 20 | public static int feature(long coded) { 21 | return (int) (coded & 0x00000000FFFFFFFF); 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/distributional/FeatureSet.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.distributional; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import edu.jhu.thrax.distributional.FeatureTypes.Label; 7 | import edu.jhu.thrax.distributional.FeatureTypes.Type; 8 | import edu.jhu.thrax.util.FormatUtils; 9 | 10 | 11 | public class FeatureSet { 12 | 13 | private Set features; 14 | 15 | private boolean active[][]; 16 | 17 | public FeatureSet() { 18 | features = new HashSet(); 19 | active = new boolean[Type.values().length][Label.values().length]; 20 | } 21 | 22 | public void addFeatureClass(String entry) { 23 | String[] fields = FormatUtils.P_DASH.split(entry); 24 | for (String f : fields) { 25 | System.err.println(f); 26 | } 27 | } 28 | 29 | public void addFeatureSet(FeatureSet set) { 30 | for (FeatureClass fc : set.features) 31 | this.features.add(fc); 32 | 33 | for (int i = 0; i < active.length; ++i) 34 | for (int j = 0; j < active[i].length; ++j) 35 | active[i][j] = active[i][j] || set.active[i][j]; 36 | } 37 | 38 | public boolean active(Type type, Label label) { 39 | return active[type.code][label.code]; 40 | } 41 | 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/distributional/FeatureTypes.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.distributional; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | public class FeatureTypes { 7 | 8 | public enum Type { 9 | NGRAM(0, "ngram"), SYN(1, "syn"), DEP(2, "dep"), CDEP(3, "cdep"), CPDEP(4, "cpdep"); 10 | 11 | private static Map map; 12 | 13 | static { 14 | map = new HashMap(); 15 | for (Type t : Type.values()) 16 | map.put(t.code, t); 17 | } 18 | 19 | public static Type get(int code) { 20 | return map.get(code); 21 | } 22 | 23 | public final int code; 24 | public final String name; 25 | 26 | Type(int code, String name) { 27 | this.code = code; 28 | this.name = name; 29 | } 30 | } 31 | 32 | public enum Label { 33 | NONE(0, "none"), LEX(1, "lex"), LEM(2, "lem"), POS(3, "pos"), NER(4, "ner"); 34 | 35 | public final int code; 36 | public final String name; 37 | 38 | Label(int code, String name) { 39 | this.code = code; 40 | this.name = name; 41 | } 42 | } 43 | 44 | public enum Directionality { 45 | NONE(0, "none"), LEFT(1, "left"), RIGHT(2, "right"), CENTER(3, "center"); 46 | 47 | public final int code; 48 | public final String name; 49 | 50 | Directionality(int code, String name) { 51 | this.code = code; 52 | this.name = name; 53 | } 54 | } 55 | 56 | public enum Flavor { 57 | NONE(0, "none"), GOV(1, "gov"), DEP(2, "dep"), HEAD(3, "head"); 58 | 59 | public final int code; 60 | public final String name; 61 | 62 | Flavor(int code, String name) { 63 | this.code = code; 64 | this.name = name; 65 | } 66 | } 67 | } 68 | 
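A note on FeatureEncoder above: encode() is currently a stub that always returns 0, while type() and feature() decode the high and low 32 bits of the packed long. The following is a minimal sketch of a packing that would be consistent with those decoders; the bit layout and field widths are assumptions for illustration only, not the project's actual scheme.

    // Hypothetical sketch only: pack the FeatureTypes enum codes into the high 32 bits,
    // leaving the low 32 bits free for a per-type feature id, so that type() (coded >> 32)
    // and feature() (coded & 0xFFFFFFFF) can recover them.
    public static long encode(Type type, Label label, Flavor flavor, Directionality dir) {
      int code = (type.code << 12) | (label.code << 8) | (flavor.code << 4) | dir.code;
      return ((long) code) << 32;
    }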
-------------------------------------------------------------------------------- /src/edu/jhu/thrax/extraction/HieroLabeler.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.extraction; 2 | 3 | public class HieroLabeler implements SpanLabeler 4 | { 5 | private final int label; 6 | 7 | public HieroLabeler(int s) 8 | { 9 | label = s; 10 | } 11 | 12 | public int getLabel(int start, int end) 13 | { 14 | return label; 15 | } 16 | } 17 | 18 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/extraction/LabelCache.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.extraction; 2 | 3 | import java.util.HashMap; 4 | 5 | import edu.jhu.thrax.util.Vocabulary; 6 | 7 | public enum LabelCache { 8 | SLASH("/"), BACKSLASH("\\"), PLUS("+"); 9 | 10 | private HashMap cache = new HashMap(); 11 | private String glue; 12 | 13 | private LabelCache(String g) { 14 | glue = g; 15 | } 16 | 17 | public final int get(int left, int right) { 18 | long key = ((long) left << 32) | ((long) right & 0x00000000FFFFFFFFL); 19 | Integer val = cache.get(key); 20 | if (val == null) { 21 | val = join(left, right, glue); 22 | cache.put(key, val); 23 | } 24 | return val; 25 | } 26 | 27 | private static final int join(int a, int b, String glue) { 28 | String word_a = Vocabulary.word(a); 29 | String word_b = Vocabulary.word(b); 30 | return Vocabulary.id(word_a.substring(0, word_a.length() - 1) + glue 31 | + word_b.substring(1)); 32 | } 33 | } -------------------------------------------------------------------------------- /src/edu/jhu/thrax/extraction/Labeling.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.extraction; 2 | 3 | public enum Labeling { 4 | HIERO, SYNTAX, MANUAL; 5 | } 6 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/extraction/ManualSpanLabeler.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.extraction; 2 | 3 | public class ManualSpanLabeler implements SpanLabeler 4 | { 5 | private final int [] labels; 6 | private final int defaultLabel; 7 | private final int sentenceLength; 8 | 9 | public ManualSpanLabeler(int[] ls, int def) 10 | { 11 | labels = ls; 12 | defaultLabel = def; 13 | sentenceLength = getSentenceLength(labels.length); 14 | } 15 | 16 | public int getLabel(int from, int to) 17 | { 18 | int idx = getLabelIndex(from, to, sentenceLength); 19 | if (idx >= labels.length || idx < 0) { 20 | return defaultLabel; 21 | } 22 | else { 23 | return labels[idx]; 24 | } 25 | } 26 | 27 | private static int getSentenceLength(int numLabels) 28 | { 29 | if (numLabels < 0) 30 | return 0; 31 | // 0 labels => sentence length 0 32 | // 1 label => 1 33 | // 3 labels => 2 34 | // T_n labels => n, where T_n is the nth triangle number 35 | int result = 0; 36 | int triangle = 0; 37 | while (triangle != numLabels) { 38 | result++; 39 | triangle += result; 40 | } 41 | return result; 42 | } 43 | 44 | private static int getLabelIndex(int from, int to, int length) 45 | { 46 | // let the length of the target sentence be L 47 | // the first L labels are for spans (0,1) ... (0,L) 48 | // the next L - 1 are for (1,2) ... 
(1,L) 49 | // and so on 50 | int result = 0; 51 | int offset = length; 52 | for (int i = 0; i < from; i++) { 53 | result += offset; 54 | offset--; 55 | } 56 | int difference = to - from - 1; 57 | result += difference; 58 | return result; 59 | } 60 | } 61 | 62 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/extraction/SpanLabeler.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.extraction; 2 | 3 | public interface SpanLabeler 4 | { 5 | public int getLabel(int start, int end); 6 | } 7 | 8 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/comparators/FieldComparator.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.comparators; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.WritableComparator; 6 | import org.apache.hadoop.io.WritableUtils; 7 | 8 | public class FieldComparator { 9 | private final int fieldNumber; 10 | private final WritableComparator comparator; 11 | 12 | public int offset; 13 | 14 | public FieldComparator(int field, WritableComparator comparator) { 15 | if (field < 0) 16 | throw new IllegalArgumentException("TextFieldComparator: cannot compare field of index " 17 | + field); 18 | fieldNumber = field; 19 | this.comparator = comparator; 20 | } 21 | 22 | public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) throws IOException { 23 | int start1 = getFieldStart(fieldNumber, b1, s1); 24 | int start2 = getFieldStart(fieldNumber, b2, s2); 25 | 26 | int length1 = getFieldLength(b1, start1); 27 | int length2 = getFieldLength(b2, start2); 28 | 29 | // TODO: l1 and l2 may need to be adjusted to reflect offset. 30 | return comparator.compare(b1, start1, length1, b2, start2, length2); 31 | } 32 | 33 | private final int getFieldStart(int field, byte[] bytes, int start) throws IOException { 34 | // if we want the first field, just return current start 35 | if (field == 0) return start; 36 | // otherwise, find out how long this field is ... 37 | int fieldLength = getFieldLength(bytes, start); 38 | // then decrement the field number and find the next start 39 | return getFieldStart(field - 1, bytes, start + fieldLength); 40 | } 41 | 42 | private static final int getFieldLength(byte[] bytes, int start) throws IOException { 43 | // Text is serialized as vInt (the length) plus that many bytes 44 | int vint_size = WritableUtils.decodeVIntSize(bytes[start]); 45 | int field_length = WritableComparator.readVInt(bytes, start); 46 | return vint_size + field_length; 47 | } 48 | 49 | public int fieldEndIndex(byte[] bytes, int start) throws IOException { 50 | int fieldStart = getFieldStart(fieldNumber, bytes, start); 51 | int fieldLength = getFieldLength(bytes, fieldStart); 52 | return fieldStart + fieldLength; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/comparators/PrimitiveArrayMarginalComparator.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.comparators; 2 | 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.io.WritableComparator; 5 | import org.apache.hadoop.io.WritableUtils; 6 | 7 | /** 8 | * Compares two primitive array objects lexicographically, except the zero-length array should be 9 | * sorted before any other. 
10 | */ 11 | public class PrimitiveArrayMarginalComparator extends WritableComparator { 12 | 13 | public static final int[] MARGINAL = new int[0]; 14 | 15 | public PrimitiveArrayMarginalComparator() { 16 | super(Text.class); 17 | } 18 | 19 | public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { 20 | int h1 = WritableUtils.decodeVIntSize(b1[s1]); 21 | int length1 = (h1 == 1 ? b1[s1] : -1); 22 | 23 | int h2 = WritableUtils.decodeVIntSize(b2[s2]); 24 | int length2 = (h2 == 1 ? b2[s2] : -1); 25 | 26 | if (length1 == 0 && length2 == 0) return 0; 27 | if (length1 == 0) return -1; 28 | if (length2 == 0) return 1; 29 | return WritableComparator.compareBytes(b1, s1 + h1, l1 - h1, b2, s2 + h2, l2 - h2); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/comparators/TextMarginalComparator.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.comparators; 2 | 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.io.WritableComparator; 5 | import org.apache.hadoop.io.WritableUtils; 6 | 7 | /** 8 | * Compares two Text objects lexicographically, except the Text "/MARGINAL/" 9 | * should be sorted before any other string. 10 | */ 11 | public class TextMarginalComparator extends WritableComparator 12 | { 13 | private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator(); 14 | 15 | public static final Text MARGINAL = new Text("/MARGINAL/"); 16 | private static final byte [] MARGINAL_BYTES = MARGINAL.getBytes(); 17 | private static final int MARGINAL_LENGTH = MARGINAL.getLength(); 18 | 19 | public TextMarginalComparator() 20 | { 21 | super(Text.class); 22 | } 23 | 24 | public int compare(byte [] b1, int s1, int l1, 25 | byte [] b2, int s2, int l2) 26 | { 27 | // if they're equal, return zero 28 | int cmp = TEXT_COMPARATOR.compare(b1, s1, l1, b2, s2, l2); 29 | if (cmp == 0) { 30 | return 0; 31 | } 32 | // else if the first string is "/MARGINAL/", return -1 33 | int vIntSize = WritableUtils.decodeVIntSize(b1[s1]); 34 | int cmpMarginal = compareBytes(b1, s1 + vIntSize, l1 - vIntSize, 35 | MARGINAL_BYTES, 0, MARGINAL_LENGTH); 36 | if (cmpMarginal == 0) 37 | return -1; 38 | // else if the second is "/MARGINAL/", return 1 39 | vIntSize = WritableUtils.decodeVIntSize(b2[s2]); 40 | cmpMarginal = compareBytes(b2, s2 + vIntSize, l2 - vIntSize, 41 | MARGINAL_BYTES, 0, MARGINAL_LENGTH); 42 | if (cmpMarginal == 0) 43 | return 1; 44 | // else, just return the result of the comparison 45 | return cmp; 46 | } 47 | } 48 | 49 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/datatypes/Annotation.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.datatypes; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | import org.apache.hadoop.io.WritableUtils; 9 | 10 | public class Annotation implements Writable { 11 | 12 | // Source-to-target alignment. 13 | private AlignmentWritable f2e = null; 14 | 15 | // Rule occurrence count. 
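// Serialization note: write() below stores the count negated when an alignment is attached,
// and readFields() uses that sign as the has-alignment flag before restoring the absolute value.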
16 | private int count; 17 | 18 | public Annotation() { 19 | count = 0; 20 | } 21 | 22 | public Annotation(int c) { 23 | count = c; 24 | } 25 | 26 | public Annotation(Annotation a) { 27 | count = a.count; 28 | this.f2e = new AlignmentWritable(a.f2e); 29 | } 30 | 31 | public Annotation(AlignmentWritable f2e) { 32 | count = 1; 33 | this.f2e = f2e; 34 | } 35 | 36 | public void merge(Annotation that) { 37 | this.count += that.count; 38 | } 39 | 40 | @Override 41 | public void readFields(DataInput in) throws IOException { 42 | boolean has_alignments = false; 43 | count = WritableUtils.readVInt(in); 44 | if (count < 0) { 45 | count = -count; 46 | has_alignments = true; 47 | } 48 | if (has_alignments) { 49 | f2e = new AlignmentWritable(); 50 | f2e.readFields(in); 51 | } 52 | } 53 | 54 | @Override 55 | public void write(DataOutput out) throws IOException { 56 | WritableUtils.writeVInt(out, (f2e != null ? -count : count)); 57 | if (f2e != null) f2e.write(out); 58 | } 59 | 60 | public AlignmentWritable e2f() { 61 | return f2e.flip(); 62 | } 63 | 64 | public AlignmentWritable f2e() { 65 | return f2e; 66 | } 67 | 68 | public void setAlignment(AlignmentWritable a) { 69 | f2e = a; 70 | } 71 | 72 | public int count() { 73 | return count; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/datatypes/FeatureMap.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.datatypes; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | import java.util.Set; 9 | 10 | import org.apache.hadoop.io.FloatWritable; 11 | import org.apache.hadoop.io.Writable; 12 | import org.apache.hadoop.io.WritableUtils; 13 | 14 | import edu.jhu.thrax.hadoop.features.annotation.AnnotationPassthroughFeature; 15 | import edu.jhu.thrax.util.Vocabulary; 16 | 17 | public class FeatureMap implements Writable { 18 | 19 | private Map map; 20 | 21 | public FeatureMap() { 22 | map = new HashMap(); 23 | } 24 | 25 | public FeatureMap(FeatureMap fm) { 26 | this(); 27 | for (int key : fm.map.keySet()) 28 | this.map.put(key, fm.map.get(key)); 29 | } 30 | 31 | public Writable get(int key) { 32 | return map.get(key); 33 | } 34 | 35 | public Writable get(String key) { 36 | return map.get(Vocabulary.id(key)); 37 | } 38 | 39 | public void put(int key, Writable val) { 40 | map.put(key, val); 41 | } 42 | 43 | public void put(String key, Writable val) { 44 | map.put(Vocabulary.id(key), val); 45 | } 46 | 47 | public boolean containsKey(int key) { 48 | return map.containsKey(key); 49 | } 50 | 51 | public Set keySet() { 52 | return map.keySet(); 53 | } 54 | 55 | @Override 56 | public void readFields(DataInput in) throws IOException { 57 | map.clear(); 58 | int size = WritableUtils.readVInt(in); 59 | for (int i = 0; i < size; ++i) { 60 | int key = 0; 61 | Writable val = null; 62 | key = WritableUtils.readVInt(in); 63 | if (key == Vocabulary.id(AnnotationPassthroughFeature.NAME)) { 64 | val = new Annotation(); 65 | val.readFields(in); 66 | } else { 67 | val = new FloatWritable(); 68 | val.readFields(in); 69 | } 70 | map.put(key, val); 71 | } 72 | } 73 | 74 | @Override 75 | public void write(DataOutput out) throws IOException { 76 | WritableUtils.writeVInt(out, map.size()); 77 | for (int key : map.keySet()) { 78 | WritableUtils.writeVInt(out, key); 79 | if (key == Vocabulary.id(AnnotationPassthroughFeature.NAME)) { 80 | 
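// The annotation pseudo-feature knows how to serialize itself; every other feature value
// is written as a plain FloatWritable (see the else branch below).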
((Annotation) this.get(key)).write(out); 81 | } else { 82 | ((FloatWritable) this.get(key)).write(out); 83 | } 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/datatypes/FeaturePair.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.datatypes; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | import org.apache.hadoop.io.WritableUtils; 9 | 10 | import edu.jhu.thrax.util.Vocabulary; 11 | 12 | public class FeaturePair implements Writable { 13 | public int key; 14 | public FeatureValue val; 15 | 16 | public FeaturePair() { 17 | key = 0; 18 | val = new FeatureValue(); 19 | } 20 | 21 | public FeaturePair(int k, Writable v) { 22 | key = k; 23 | val = new FeatureValue(v); 24 | } 25 | 26 | public void write(DataOutput out) throws IOException { 27 | WritableUtils.writeVInt(out, key); 28 | val.write(out); 29 | } 30 | 31 | public void readFields(DataInput in) throws IOException { 32 | key = WritableUtils.readVInt(in); 33 | val.readFields(in); 34 | } 35 | 36 | public int hashCode() { 37 | return key * 163 + val.hashCode(); 38 | } 39 | 40 | public boolean equals(Object o) { 41 | if (o instanceof FeaturePair) { 42 | FeaturePair that = (FeaturePair) o; 43 | return key == that.key && val.equals(that.val); 44 | } 45 | return false; 46 | } 47 | 48 | public String toString() { 49 | return Vocabulary.word(key) + "=" + val.toString(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/datatypes/FeatureValue.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.datatypes; 2 | 3 | import org.apache.hadoop.io.FloatWritable; 4 | import org.apache.hadoop.io.GenericWritable; 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.io.Writable; 8 | 9 | public class FeatureValue extends GenericWritable { 10 | 11 | @SuppressWarnings("rawtypes") 12 | private static Class[] TYPES = {FloatWritable.class, IntWritable.class, Text.class, 13 | Annotation.class, AlignmentWritable.class}; 14 | 15 | FeatureValue() {} 16 | 17 | FeatureValue(Writable val) { 18 | this.set(val); 19 | } 20 | 21 | @SuppressWarnings("unchecked") 22 | @Override 23 | protected Class[] getTypes() { 24 | return TYPES; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/datatypes/IntPair.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.datatypes; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.WritableComparable; 8 | import org.apache.hadoop.io.WritableComparator; 9 | 10 | public class IntPair implements WritableComparable { 11 | public int fst; 12 | public int snd; 13 | 14 | public IntPair() { 15 | // do nothing 16 | } 17 | 18 | public IntPair(int car, int cdr) { 19 | fst = car; 20 | snd = cdr; 21 | } 22 | 23 | public void reverse() { 24 | int tmp = fst; 25 | fst = snd; 26 | snd = tmp; 27 | } 28 | 29 | public void write(DataOutput out) throws IOException { 30 | out.writeInt(fst); 31 | out.writeInt(snd); 32 | } 33 | 34 | public void readFields(DataInput in) throws IOException { 35 | fst = 
in.readInt(); 36 | snd = in.readInt(); 37 | } 38 | 39 | public int hashCode() { 40 | return fst * 163 + snd; 41 | } 42 | 43 | public boolean equals(Object o) { 44 | if (o instanceof IntPair) { 45 | IntPair ip = (IntPair) o; 46 | return fst == ip.fst && snd == ip.snd; 47 | } 48 | return false; 49 | } 50 | 51 | public String toString() { 52 | return fst + "\t" + snd; 53 | } 54 | 55 | public int compareTo(IntPair ip) { 56 | int cmp = ip.fst - fst; 57 | if (cmp != 0) { 58 | return cmp; 59 | } 60 | return ip.snd - snd; 61 | } 62 | 63 | public static class Comparator extends WritableComparator { 64 | public Comparator() { 65 | super(IntPair.class); 66 | } 67 | 68 | public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { 69 | int fst1 = readInt(b1, s1); 70 | int fst2 = readInt(b2, s2); 71 | if (fst1 != fst2) { 72 | return fst2 - fst1; 73 | } 74 | int snd1 = readInt(b1, s1 + 4); 75 | int snd2 = readInt(b2, s2 + 4); 76 | return snd2 - snd1; 77 | } 78 | } 79 | 80 | static { 81 | WritableComparator.define(IntPair.class, new Comparator()); 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/distributional/CommonLSH.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.distributional; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | 5 | import edu.jhu.jerboa.sim.SLSH; 6 | 7 | public class CommonLSH { 8 | 9 | public static SLSH getSLSH(Configuration conf) { 10 | SLSH slsh = null; 11 | try { 12 | slsh = new SLSH(); 13 | slsh.initialize(conf.getInt("thrax.lsh-num-bits", 256), 14 | conf.getInt("thrax.lsh-pool-size", 100000), conf.getInt("thrax.lsh-random-seed", 42)); 15 | } catch (Exception e) { 16 | e.printStackTrace(); 17 | System.exit(1); 18 | } 19 | return slsh; 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/distributional/DistributionalContextCombiner.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.distributional; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | import edu.jhu.jerboa.sim.SLSH; 10 | 11 | public class DistributionalContextCombiner 12 | extends Reducer { 13 | 14 | private SLSH slsh; 15 | 16 | public void setup(Context context) throws IOException, InterruptedException { 17 | Configuration conf = context.getConfiguration(); 18 | slsh = CommonLSH.getSLSH(conf); 19 | } 20 | 21 | protected void reduce(Text key, Iterable values, Context context) 22 | throws IOException, InterruptedException { 23 | ContextWritable combined = new ContextWritable(); 24 | for (ContextWritable input : values) { 25 | combined.merge(input, slsh); 26 | } 27 | if (!combined.compacted.get()) combined.compact(slsh); 28 | context.write(key, combined); 29 | return; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/distributional/DistributionalContextMapper.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.distributional; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | 
import org.apache.hadoop.mapreduce.Mapper; 10 | 11 | import edu.jhu.thrax.distributional.ContextPhrase; 12 | import edu.jhu.thrax.distributional.ContextPhraseExtractor; 13 | import edu.jhu.thrax.util.MalformedInput; 14 | import edu.jhu.thrax.util.exceptions.EmptySentenceException; 15 | import edu.jhu.thrax.util.exceptions.MalformedInputException; 16 | import edu.jhu.thrax.util.exceptions.MalformedParseException; 17 | import edu.jhu.thrax.util.exceptions.NotEnoughFieldsException; 18 | 19 | public class DistributionalContextMapper extends Mapper { 20 | 21 | private ContextPhraseExtractor extractor; 22 | 23 | protected void setup(Context context) throws IOException, InterruptedException { 24 | Configuration conf = context.getConfiguration(); 25 | extractor = new ContextPhraseExtractor(conf); 26 | } 27 | 28 | protected void map(LongWritable key, Text value, Context context) throws IOException, 29 | InterruptedException { 30 | if (extractor == null) return; 31 | String line = value.toString(); 32 | try { 33 | List phrases = extractor.extract(line); 34 | for (ContextPhrase cp : phrases) { 35 | context.write(cp.getPhrase(), new ContextWritable(1, cp.getFeatures())); 36 | } 37 | } catch (NotEnoughFieldsException e) { 38 | context.getCounter(MalformedInput.NOT_ENOUGH_FIELDS).increment(1); 39 | } catch (EmptySentenceException e) { 40 | context.getCounter(MalformedInput.EMPTY_SENTENCE).increment(1); 41 | } catch (MalformedParseException e) { 42 | context.getCounter(MalformedInput.MALFORMED_PARSE).increment(1); 43 | } catch (MalformedInputException e) { 44 | context.getCounter(MalformedInput.UNKNOWN).increment(1); 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/distributional/DistributionalContextReducer.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.distributional; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Reducer; 9 | 10 | import edu.jhu.jerboa.sim.SLSH; 11 | import edu.jhu.jerboa.sim.Signature; 12 | 13 | public class DistributionalContextReducer 14 | extends Reducer { 15 | 16 | private int minCount; 17 | private SLSH slsh; 18 | 19 | public void setup(Context context) throws IOException, InterruptedException { 20 | Configuration conf = context.getConfiguration(); 21 | minCount = conf.getInt("thrax.min-phrase-count", 3); 22 | slsh = CommonLSH.getSLSH(conf); 23 | } 24 | 25 | protected void reduce(Text key, Iterable values, Context context) 26 | throws IOException, InterruptedException { 27 | ContextWritable reduced = new ContextWritable(); 28 | for (ContextWritable input : values) { 29 | reduced.merge(input, slsh); 30 | } 31 | if (!reduced.compacted.get()) reduced.compact(slsh); 32 | if (reduced.strength.get() >= minCount) { 33 | Signature reduced_signature = new Signature(); 34 | // TODO: double-check need for deep copy? 
35 | reduced_signature.sums = reduced.sums; 36 | slsh.buildSignature(reduced_signature, false); 37 | context.write(new SignatureWritable(key, reduced_signature, reduced.strength.get()), 38 | NullWritable.get()); 39 | } 40 | return; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/distributional/SignatureWritable.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.distributional; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.io.Writable; 10 | import org.apache.hadoop.io.WritableComparable; 11 | import org.apache.hadoop.mapreduce.Partitioner; 12 | 13 | import edu.jhu.jerboa.sim.Signature; 14 | import edu.jhu.thrax.hadoop.datatypes.PrimitiveUtils; 15 | 16 | public class SignatureWritable implements WritableComparable { 17 | public Text key; 18 | public byte[] bytes; 19 | public IntWritable strength; 20 | 21 | public SignatureWritable() { 22 | this.key = new Text(); 23 | this.bytes = null; 24 | this.strength = new IntWritable(); 25 | } 26 | 27 | public SignatureWritable(Text key, Signature signature, int strength) { 28 | this.key = new Text(key); 29 | // TODO: deep copy? 30 | this.bytes = signature.bytes; 31 | this.strength = new IntWritable(strength); 32 | } 33 | 34 | @Override 35 | public void readFields(DataInput in) throws IOException { 36 | key.readFields(in); 37 | bytes = PrimitiveUtils.readByteArray(in); 38 | strength.readFields(in); 39 | } 40 | 41 | @Override 42 | public void write(DataOutput out) throws IOException { 43 | key.write(out); 44 | PrimitiveUtils.writeByteArray(out, bytes); 45 | strength.write(out); 46 | } 47 | 48 | @Override 49 | public int compareTo(SignatureWritable that) { 50 | int cmp = strength.compareTo(that.strength); 51 | // Flip sign for descending sort order. 
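// Higher-strength (more frequent) signatures sort first; ties fall back to ascending key order.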
52 | if (cmp != 0) return -cmp; 53 | return key.compareTo(that.key); 54 | } 55 | 56 | public static class SignaturePartitioner extends Partitioner { 57 | public int getPartition(SignatureWritable signature, Writable value, int num_partitions) { 58 | int hash = 163; 59 | hash = 37 * hash + signature.key.hashCode(); 60 | hash = 37 * hash + signature.bytes.hashCode(); 61 | hash = 37 * hash + signature.strength.hashCode(); 62 | return (hash & Integer.MAX_VALUE) % num_partitions; 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/extraction/ExtractionCombiner.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.extraction; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable; 8 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 9 | 10 | public class ExtractionCombiner extends Reducer { 11 | 12 | protected void reduce(AlignedRuleWritable key, Iterable values, Context context) 13 | throws IOException, InterruptedException { 14 | context.progress(); 15 | Annotation merged = new Annotation(); 16 | for (Annotation a : values) merged.merge(a); 17 | context.write(key, merged); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/extraction/ExtractionMapper.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.extraction; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable; 11 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 12 | import edu.jhu.thrax.util.Vocabulary; 13 | 14 | public class ExtractionMapper extends Mapper { 15 | private RuleWritableExtractor extractor; 16 | 17 | protected void setup(Context context) throws IOException, InterruptedException { 18 | Configuration conf = context.getConfiguration(); 19 | String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*"; 20 | Vocabulary.initialize(conf, vocabulary_path); 21 | 22 | // TODO: static initializer call for what Annotation actually carries would go here. 
23 |     extractor = RuleWritableExtractorFactory.create(context);
24 |     if (extractor == null) {
25 |       System.err.println("WARNING: could not create rule extractor as configured!");
26 |     }
27 |   }
28 | 
29 |   protected void map(LongWritable key, Text value, Context context) throws IOException,
30 |       InterruptedException {
31 |     if (extractor == null) return;
32 |     for (AnnotatedRule ar : extractor.extract(value))
33 |       context.write(new AlignedRuleWritable(ar.rule, ar.f2e), ar.annotation);
34 |     context.progress();
35 |   }
36 | }
37 | 
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/extraction/ExtractionReducer.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.extraction;
2 | 
3 | import java.io.IOException;
4 | 
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.mapreduce.Reducer;
7 | 
8 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable;
9 | import edu.jhu.thrax.hadoop.datatypes.AlignmentWritable;
10 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
12 | import edu.jhu.thrax.util.Vocabulary;
13 | 
14 | public class ExtractionReducer
15 |     extends Reducer<AlignedRuleWritable, Annotation, RuleWritable, Annotation> {
16 | 
17 |   private RuleWritable currentRule = null;
18 |   private Annotation currentAnnotation = null;
19 |   private AlignmentWritable maxAlignment = null;
20 |   private int alignmentCount;
21 | 
22 |   private int minCount;
23 | 
24 |   protected void setup(Context context) throws IOException, InterruptedException {
25 |     Configuration conf = context.getConfiguration();
26 |     String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*";
27 |     Vocabulary.initialize(conf, vocabulary_path);
28 |     minCount = conf.getInt("thrax.min-rule-count", 1);
29 |   }
30 | 
31 |   protected void reduce(AlignedRuleWritable key, Iterable<Annotation> values, Context context)
32 |       throws IOException, InterruptedException {
33 |     RuleWritable rule = key.getRule();
34 |     AlignmentWritable alignment = key.getAlignment();
35 | 
36 |     Annotation merged = new Annotation();
37 |     for (Annotation a : values)
38 |       merged.merge(a);
39 | 
40 |     if (!rule.equals(currentRule)) {
41 |       if (currentRule != null
42 |           && (currentAnnotation.count() >= minCount || isUnigramRule(currentRule))) {
43 |         currentAnnotation.setAlignment(maxAlignment);
44 |         context.write(currentRule, currentAnnotation);
45 |         context.progress();
46 |       }
47 |       currentRule = new RuleWritable(rule);
48 |       currentAnnotation = new Annotation();
49 |       alignmentCount = 0;
50 |       maxAlignment = null;
51 |     }
52 |     currentAnnotation.merge(merged);
53 |     if (alignmentCount < merged.count()) { // remember the most frequently observed alignment for this rule
54 |       maxAlignment = new AlignmentWritable(alignment);
55 |       alignmentCount = merged.count();
56 |     }
57 |   }
58 | 
59 |   protected void cleanup(Context context) throws IOException, InterruptedException {
60 |     if (currentRule != null) {
61 |       if (currentAnnotation.count() >= minCount || isUnigramRule(currentRule)) {
62 |         currentAnnotation.setAlignment(maxAlignment);
63 |         context.write(currentRule, currentAnnotation);
64 |         context.progress();
65 |       }
66 |     }
67 |   }
68 | 
69 |   private static boolean isUnigramRule(RuleWritable rule) {
70 |     if (rule.source.length == 1) return !Vocabulary.nt(rule.source[0]);
71 |     return rule.target.length == 1 && !Vocabulary.nt(rule.target[0]);
72 |   }
73 | }
74 | 
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/extraction/RuleWritableExtractor.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.extraction;
2 | 
3 | import org.apache.hadoop.io.Text;
4 | 
5 | import edu.jhu.thrax.hadoop.datatypes.AlignmentWritable;
6 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
7 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
8 | 
9 | public interface RuleWritableExtractor {
10 |   public Iterable<AnnotatedRule> extract(Text line);
11 | }
12 | 
13 | 
14 | class AnnotatedRule {
15 |   public RuleWritable rule = null;
16 |   public AlignmentWritable f2e = null;
17 |   public Annotation annotation = null;
18 | 
19 |   public AnnotatedRule(RuleWritable r) {
20 |     rule = r;
21 |   }
22 | 
23 |   public AnnotatedRule(RuleWritable r, AlignmentWritable f2e, Annotation a) {
24 |     this.rule = r;
25 |     this.f2e = f2e;
26 |     this.annotation = a;
27 |   }
28 | }
29 | 
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/extraction/RuleWritableExtractorFactory.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.extraction;
2 | 
3 | import org.apache.hadoop.io.LongWritable;
4 | import org.apache.hadoop.io.Text;
5 | import org.apache.hadoop.mapreduce.Mapper;
6 | 
7 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable;
8 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
9 | 
10 | public class RuleWritableExtractorFactory {
11 |   public static RuleWritableExtractor create(
12 |       Mapper<LongWritable, Text, AlignedRuleWritable, Annotation>.Context context) {
13 |     return new HierarchicalRuleWritableExtractor(context);
14 |   }
15 | }
16 | 
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/AbstractnessFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 | 
3 | import java.util.Map;
4 | 
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 | 
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 | 
11 | public class AbstractnessFeature implements SimpleFeature {
12 | 
13 |   public static final String NAME = "abstract";
14 | 
15 |   private static final IntWritable ZERO = new IntWritable(0);
16 |   private static final IntWritable ONE = new IntWritable(1);
17 | 
18 |   public Writable score(RuleWritable r) {
19 |     for (int word : r.source) {
20 |       if (!Vocabulary.nt(word)) {
21 |         return ZERO;
22 |       }
23 |     }
24 |     for (int word : r.target) {
25 |       if (!Vocabulary.nt(word)) {
26 |         return ZERO;
27 |       }
28 |     }
29 |     return ONE;
30 |   }
31 | 
32 |   public String getName() {
33 |     return NAME;
34 |   }
35 | 
36 |   public void unaryGlueRuleScore(int nt, Map map) {
37 |     map.put(Vocabulary.id(NAME), ONE);
38 |   }
39 | 
40 |   public void binaryGlueRuleScore(int nt, Map map) {
41 |     map.put(Vocabulary.id(NAME), ONE);
42 |   }
43 | }
44 | 
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/AdjacentNonTerminalsFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 | 
3 | import java.util.Map;
4 | 
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 | 
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 | 
11 | public class AdjacentNonTerminalsFeature implements SimpleFeature {
12 | 
13 |   public static final String NAME = "adjacent";
14 | 
15 |   private static
final IntWritable ZERO = new IntWritable(0); 16 | private static final IntWritable ONE = new IntWritable(1); 17 | 18 | public Writable score(RuleWritable r) { 19 | for (int i = 0; i < r.source.length - 1; ++i) 20 | if (Vocabulary.nt(r.source[i])) { 21 | if (Vocabulary.nt(r.source[i + 1])) { 22 | return ONE; 23 | } else { 24 | i += 2; 25 | continue; 26 | } 27 | } 28 | return ZERO; 29 | } 30 | 31 | public String getName() { 32 | return NAME; 33 | } 34 | 35 | public void unaryGlueRuleScore(int nt, Map map) { 36 | map.put(Vocabulary.id(NAME), ZERO); 37 | } 38 | 39 | public void binaryGlueRuleScore(int nt, Map map) { 40 | map.put(Vocabulary.id(NAME), ONE); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/CharacterCompressionRatioFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.FloatWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class CharacterCompressionRatioFeature implements SimpleFeature { 12 | 13 | private static final FloatWritable ZERO = new FloatWritable(0f); 14 | 15 | public static final String NAME = "char_cr"; 16 | 17 | public Writable score(RuleWritable r) { 18 | int src_length = 0; 19 | for (int tok : r.source) { 20 | if (!Vocabulary.nt(tok)) { 21 | src_length += Vocabulary.word(tok).length(); 22 | } 23 | } 24 | src_length += r.source.length - 1; 25 | 26 | int tgt_length = 0; 27 | for (int tok : r.target) { 28 | if (!Vocabulary.nt(tok)) { 29 | tgt_length += Vocabulary.word(tok).length(); 30 | } 31 | } 32 | tgt_length += r.target.length - 1; 33 | 34 | if (src_length == 0 || tgt_length == 0) 35 | return ZERO; 36 | else 37 | return new FloatWritable((float) Math.log((float) tgt_length / src_length)); 38 | } 39 | 40 | public String getName() { 41 | return NAME; 42 | } 43 | 44 | public void unaryGlueRuleScore(int nt, Map map) { 45 | map.put(Vocabulary.id(NAME), ZERO); 46 | } 47 | 48 | public void binaryGlueRuleScore(int nt, Map map) { 49 | map.put(Vocabulary.id(NAME), ZERO); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/CharacterCountDifferenceFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class CharacterCountDifferenceFeature implements SimpleFeature { 12 | 13 | private static final IntWritable ZERO = new IntWritable(0); 14 | 15 | public static final String NAME = "char_count_difference"; 16 | 17 | public Writable score(RuleWritable r) { 18 | int char_difference = 0; 19 | for (int tok : r.source) { 20 | if (!Vocabulary.nt(tok)) { 21 | char_difference -= Vocabulary.word(tok).length(); 22 | } 23 | } 24 | char_difference -= r.source.length - 1; 25 | 26 | for (int tok : r.target) { 27 | if (!Vocabulary.nt(tok)) { 28 | char_difference += Vocabulary.word(tok).length(); 29 | } 30 | } 31 | char_difference += r.target.length - 1; 32 | return new IntWritable(char_difference); 33 | } 34 | 35 | public String getName() { 36 | 
return NAME; 37 | } 38 | 39 | public void unaryGlueRuleScore(int nt, Map map) { 40 | map.put(Vocabulary.id(NAME), ZERO); 41 | } 42 | 43 | public void binaryGlueRuleScore(int nt, Map map) { 44 | map.put(Vocabulary.id(NAME), ZERO); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/ConsumeSourceTerminalsFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class ConsumeSourceTerminalsFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "source_terminals_without_target"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | private static final IntWritable ONE = new IntWritable(1); 17 | 18 | public Writable score(RuleWritable r) { 19 | for (int tok : r.target) { 20 | if (!Vocabulary.nt(tok)) { 21 | return ZERO; 22 | } 23 | } 24 | for (int tok : r.source) { 25 | if (!Vocabulary.nt(tok)) { 26 | return ONE; 27 | } 28 | } 29 | return ZERO; 30 | } 31 | 32 | public String getName() { 33 | return NAME; 34 | } 35 | 36 | public void unaryGlueRuleScore(int nt, Map map) { 37 | map.put(Vocabulary.id(NAME), ZERO); 38 | } 39 | 40 | public void binaryGlueRuleScore(int nt, Map map) { 41 | map.put(Vocabulary.id(NAME), ZERO); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/Feature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.Writable; 6 | 7 | public interface Feature { 8 | 9 | public String getName(); 10 | 11 | public void unaryGlueRuleScore(int nt, Map map); 12 | 13 | public void binaryGlueRuleScore(int nt, Map map); 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/GlueRuleFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class GlueRuleFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "glue_rule"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | private static final IntWritable ONE = new IntWritable(1); 17 | 18 | public Writable score(RuleWritable r) { 19 | return ZERO; 20 | } 21 | 22 | public String getName() { 23 | return NAME; 24 | } 25 | 26 | public void unaryGlueRuleScore(int nt, Map map) { 27 | map.put(Vocabulary.id(NAME), ONE); 28 | } 29 | 30 | public void binaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ONE); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/IdentityFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Arrays; 4 | import 
java.util.Map; 5 | 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.Writable; 8 | 9 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 10 | import edu.jhu.thrax.util.Vocabulary; 11 | 12 | public class IdentityFeature implements SimpleFeature { 13 | 14 | public static final String NAME = "identity"; 15 | 16 | private static final IntWritable ZERO = new IntWritable(0); 17 | private static final IntWritable ONE = new IntWritable(1); 18 | 19 | public Writable score(RuleWritable r) { 20 | if (r.monotone && Arrays.equals(r.target, r.source)) 21 | return ONE; 22 | else 23 | return ZERO; 24 | } 25 | 26 | public String getName() { 27 | return NAME; 28 | } 29 | 30 | public void unaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ZERO); 32 | } 33 | 34 | public void binaryGlueRuleScore(int nt, Map map) { 35 | map.put(Vocabulary.id(NAME), ZERO); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/LexicalityFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class LexicalityFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "lexical"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | private static final IntWritable ONE = new IntWritable(1); 17 | 18 | public Writable score(RuleWritable r) { 19 | for (int tok : r.source) 20 | if (Vocabulary.nt(tok)) return ZERO; 21 | for (int tok : r.target) 22 | if (Vocabulary.nt(tok)) return ZERO; 23 | return ONE; 24 | } 25 | 26 | public String getName() { 27 | return NAME; 28 | } 29 | 30 | public void unaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ZERO); 32 | } 33 | 34 | public void binaryGlueRuleScore(int nt, Map map) { 35 | map.put(Vocabulary.id(NAME), ZERO); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/MonotonicFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class MonotonicFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "monotonic"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | private static final IntWritable ONE = new IntWritable(1); 17 | 18 | public Writable score(RuleWritable r) { 19 | return (r.monotone ? 
ONE : ZERO); 20 | } 21 | 22 | public String getName() { 23 | return NAME; 24 | } 25 | 26 | public void unaryGlueRuleScore(int nt, Map map) { 27 | map.put(Vocabulary.id(NAME), ONE); 28 | } 29 | 30 | public void binaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ONE); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/PhrasePenaltyFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class PhrasePenaltyFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "phrase_penalty"; 14 | 15 | private static final IntWritable ONE = new IntWritable(1); 16 | 17 | public Writable score(RuleWritable r) { 18 | return ONE; 19 | } 20 | 21 | public String getName() { 22 | return NAME; 23 | } 24 | 25 | public void unaryGlueRuleScore(int nt, Map map) { 26 | map.put(Vocabulary.id(NAME), ONE); 27 | } 28 | 29 | public void binaryGlueRuleScore(int nt, Map map) { 30 | map.put(Vocabulary.id(NAME), ONE); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/ProduceTargetTerminalsFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class ProduceTargetTerminalsFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "target_terminals_without_source"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | private static final IntWritable ONE = new IntWritable(1); 17 | 18 | public Writable score(RuleWritable r) { 19 | for (int tok : r.source) 20 | if (!Vocabulary.nt(tok)) return ZERO; 21 | for (int tok : r.target) 22 | if (!Vocabulary.nt(tok)) return ONE; 23 | return ZERO; 24 | } 25 | 26 | public String getName() { 27 | return NAME; 28 | } 29 | 30 | public void unaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ZERO); 32 | } 33 | 34 | public void binaryGlueRuleScore(int nt, Map map) { 35 | map.put(Vocabulary.id(NAME), ZERO); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/SimpleFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import org.apache.hadoop.io.Writable; 4 | 5 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 6 | 7 | public interface SimpleFeature extends Feature { 8 | 9 | public Writable score(RuleWritable r); 10 | 11 | } 12 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/SimpleFeatureFactory.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import edu.jhu.thrax.util.FormatUtils; 7 | 8 | public class 
SimpleFeatureFactory { 9 | 10 | public static SimpleFeature get(String name) { 11 | if (name.equals(AbstractnessFeature.NAME)) 12 | return new AbstractnessFeature(); 13 | else if (name.equals(AdjacentNonTerminalsFeature.NAME)) 14 | return new AdjacentNonTerminalsFeature(); 15 | else if (name.equals(LexicalityFeature.NAME)) 16 | return new LexicalityFeature(); 17 | else if (name.equals(XRuleFeature.NAME)) 18 | return new XRuleFeature(); 19 | else if (name.equals(MonotonicFeature.NAME)) 20 | return new MonotonicFeature(); 21 | else if (name.equals(PhrasePenaltyFeature.NAME)) 22 | return new PhrasePenaltyFeature(); 23 | else if (name.equals(SourceWordCounterFeature.NAME)) 24 | return new SourceWordCounterFeature(); 25 | else if (name.equals(TargetWordCounterFeature.NAME)) 26 | return new TargetWordCounterFeature(); 27 | else if (name.equals(ConsumeSourceTerminalsFeature.NAME)) 28 | return new ConsumeSourceTerminalsFeature(); 29 | else if (name.equals(ProduceTargetTerminalsFeature.NAME)) 30 | return new ProduceTargetTerminalsFeature(); 31 | else if (name.equals(IdentityFeature.NAME)) 32 | return new IdentityFeature(); 33 | else if (name.equals(WordCountDifferenceFeature.NAME)) 34 | return new WordCountDifferenceFeature(); 35 | else if (name.equals(WordLengthDifferenceFeature.NAME)) 36 | return new WordLengthDifferenceFeature(); 37 | else if (name.equals(WordCompressionRatioFeature.NAME)) 38 | return new WordCompressionRatioFeature(); 39 | else if (name.equals(CharacterCountDifferenceFeature.NAME)) 40 | return new CharacterCountDifferenceFeature(); 41 | else if (name.equals(CharacterCompressionRatioFeature.NAME)) 42 | return new CharacterCompressionRatioFeature(); 43 | else if (name.equals(GlueRuleFeature.NAME)) return new GlueRuleFeature(); 44 | 45 | return null; 46 | } 47 | 48 | public static List getAll(String names) { 49 | String[] feature_names = FormatUtils.P_COMMA_OR_SPACE.split(names); 50 | List features = new ArrayList(); 51 | 52 | for (String feature_name : feature_names) { 53 | SimpleFeature feature = get(feature_name); 54 | if (feature != null) features.add(feature); 55 | } 56 | return features; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/SourceWordCounterFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class SourceWordCounterFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "source_word_count"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | 17 | public Writable score(RuleWritable r) { 18 | int words = 0; 19 | for (int word : r.source) 20 | if (!Vocabulary.nt(word)) words++; 21 | return new IntWritable(words); 22 | } 23 | 24 | public String getName() { 25 | return NAME; 26 | } 27 | 28 | public void unaryGlueRuleScore(int nt, Map map) { 29 | map.put(Vocabulary.id(NAME), ZERO); 30 | } 31 | 32 | public void binaryGlueRuleScore(int nt, Map map) { 33 | map.put(Vocabulary.id(NAME), ZERO); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/TargetWordCounterFeature.java: 
-------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class TargetWordCounterFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "target_word_count"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | 17 | public Writable score(RuleWritable r) { 18 | int words = 0; 19 | for (int tok : r.target) 20 | if (!Vocabulary.nt(tok)) words++; 21 | return new IntWritable(words); 22 | } 23 | 24 | public String getName() { 25 | return NAME; 26 | } 27 | 28 | public void unaryGlueRuleScore(int nt, Map map) { 29 | map.put(Vocabulary.id(NAME), ZERO); 30 | } 31 | 32 | public void binaryGlueRuleScore(int nt, Map map) { 33 | map.put(Vocabulary.id(NAME), ZERO); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/WordCompressionRatioFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.FloatWritable; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.Writable; 8 | 9 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 10 | import edu.jhu.thrax.util.Vocabulary; 11 | 12 | public class WordCompressionRatioFeature implements SimpleFeature { 13 | 14 | public static final String NAME = "word_cr"; 15 | 16 | private static final IntWritable ZERO = new IntWritable(0); 17 | 18 | public Writable score(RuleWritable r) { 19 | int src_count = 0; 20 | for (int tok : r.source) 21 | if (!Vocabulary.nt(tok)) src_count++; 22 | int tgt_count = 0; 23 | for (int tok : r.target) 24 | if (!Vocabulary.nt(tok)) tgt_count++; 25 | if (src_count == 0 || tgt_count == 0) { 26 | return ZERO; 27 | } else { 28 | return new FloatWritable((float) Math.log((float) tgt_count / src_count)); 29 | } 30 | } 31 | 32 | public String getName() { 33 | return NAME; 34 | } 35 | 36 | public void unaryGlueRuleScore(int nt, Map map) { 37 | map.put(Vocabulary.id(NAME), ZERO); 38 | } 39 | 40 | public void binaryGlueRuleScore(int nt, Map map) { 41 | map.put(Vocabulary.id(NAME), ZERO); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/WordCountDifferenceFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class WordCountDifferenceFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "word_count_difference"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | 17 | public Writable score(RuleWritable r) { 18 | int word_difference = 0; 19 | for (int tok : r.source) 20 | if (!Vocabulary.nt(tok)) word_difference--; 21 | for (int tok : r.target) 22 | if (!Vocabulary.nt(tok)) word_difference++; 23 | return new IntWritable(word_difference); 24 | } 25 | 26 | public String getName() { 27 | return NAME; 28 | } 29 | 30 
| public void unaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ZERO); 32 | } 33 | 34 | public void binaryGlueRuleScore(int nt, Map map) { 35 | map.put(Vocabulary.id(NAME), ZERO); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/WordLengthDifferenceFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.FloatWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class WordLengthDifferenceFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "word_length_difference"; 14 | 15 | private static final FloatWritable ZERO = new FloatWritable(0); 16 | 17 | public Writable score(RuleWritable r) { 18 | int src_length = 0; 19 | int src_count = 0; 20 | for (int tok : r.source) { 21 | if (!Vocabulary.nt(tok)) { 22 | src_length += Vocabulary.word(tok).length(); 23 | src_count++; 24 | } 25 | } 26 | int tgt_length = 0; 27 | int tgt_count = 0; 28 | for (int tok : r.target) { 29 | if (!Vocabulary.nt(tok)) { 30 | tgt_length += Vocabulary.word(tok).length(); 31 | tgt_count++; 32 | } 33 | } 34 | if (src_count == 0 || tgt_count == 0) { 35 | return ZERO; 36 | } else { 37 | float avg_src_length = (float) src_length / src_count; 38 | float avg_tgt_length = (float) tgt_length / tgt_count; 39 | return new FloatWritable(avg_tgt_length - avg_src_length); 40 | } 41 | } 42 | 43 | public String getName() { 44 | return NAME; 45 | } 46 | 47 | public void unaryGlueRuleScore(int nt, Map map) { 48 | map.put(Vocabulary.id(NAME), ZERO); 49 | } 50 | 51 | public void binaryGlueRuleScore(int nt, Map map) { 52 | map.put(Vocabulary.id(NAME), ZERO); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/XRuleFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public class XRuleFeature implements SimpleFeature { 12 | 13 | public static final String NAME = "x_rule"; 14 | 15 | private static final IntWritable ZERO = new IntWritable(0); 16 | private static final IntWritable ONE = new IntWritable(1); 17 | 18 | // TODO: should be default nonterminal and not explicitly X. 19 | private final int PATTERN = Vocabulary.id("[X]"); 20 | 21 | public Writable score(RuleWritable r) { 22 | return (r.lhs == PATTERN ? 
ONE : ZERO); 23 | } 24 | 25 | public String getName() { 26 | return NAME; 27 | } 28 | 29 | public void unaryGlueRuleScore(int nt, Map map) { 30 | map.put(Vocabulary.id(NAME), ZERO); 31 | } 32 | 33 | public void binaryGlueRuleScore(int nt, Map map) { 34 | map.put(Vocabulary.id(NAME), ZERO); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/AlignmentFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Writable; 9 | import org.apache.hadoop.mapreduce.Reducer.Context; 10 | 11 | import edu.jhu.thrax.hadoop.datatypes.AlignmentWritable; 12 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 13 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 14 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 15 | import edu.jhu.thrax.util.Vocabulary; 16 | 17 | @SuppressWarnings("rawtypes") 18 | public class AlignmentFeature implements AnnotationFeature { 19 | 20 | public static final String NAME = "alignment"; 21 | 22 | private static final IntWritable ZERO = new IntWritable(0); 23 | 24 | public String getName() { 25 | return NAME; 26 | } 27 | 28 | public AlignmentWritable score(RuleWritable r, Annotation annotation) { 29 | return annotation.f2e(); 30 | } 31 | 32 | public void unaryGlueRuleScore(int nt, Map map) { 33 | map.put(Vocabulary.id(NAME), ZERO); 34 | } 35 | 36 | public void binaryGlueRuleScore(int nt, Map map) { 37 | map.put(Vocabulary.id(NAME), ZERO); 38 | } 39 | 40 | @Override 41 | public void init(Context context) throws IOException, InterruptedException {} 42 | 43 | @Override 44 | public Set> getPrerequisites() { 45 | return null; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/AnnotationFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.io.IOException; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.Writable; 7 | import org.apache.hadoop.mapreduce.Reducer.Context; 8 | 9 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 10 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 11 | import edu.jhu.thrax.hadoop.features.Feature; 12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 13 | 14 | public interface AnnotationFeature extends Feature { 15 | 16 | @SuppressWarnings("rawtypes") 17 | public void init(Context context) throws IOException, InterruptedException; 18 | 19 | public Writable score(RuleWritable r, Annotation annotation); 20 | 21 | // TODO: move this into its own interface, have AF extend it. 
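  // Prerequisites are the ThraxJob classes that must have completed before this feature can be
  // scored in AnnotationReducer; features that need no upstream job may return null (see
  // CountFeature and LogCountFeature).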
22 | public Set> getPrerequisites(); 23 | } 24 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/AnnotationFeatureFactory.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import edu.jhu.thrax.util.FormatUtils; 7 | 8 | public class AnnotationFeatureFactory { 9 | 10 | public static AnnotationFeature get(String name) { 11 | if (name.equals(UnalignedSourceCounterFeature.NAME)) 12 | return new UnalignedSourceCounterFeature(); 13 | else if (name.equals(UnalignedTargetCounterFeature.NAME)) 14 | return new UnalignedTargetCounterFeature(); 15 | else if (name.equals(RarityPenaltyFeature.NAME)) 16 | return new RarityPenaltyFeature(); 17 | else if (name.equals(CountFeature.NAME)) 18 | return new CountFeature(); 19 | else if (name.equals(LogCountFeature.NAME)) 20 | return new LogCountFeature(); 21 | else if (name.equals(SourceGivenTargetLexicalProbabilityFeature.NAME)) 22 | return new SourceGivenTargetLexicalProbabilityFeature(); 23 | else if (name.equals(TargetGivenSourceLexicalProbabilityFeature.NAME)) 24 | return new TargetGivenSourceLexicalProbabilityFeature(); 25 | else if (name.equals(AlignmentFeature.NAME)) 26 | return new AlignmentFeature(); 27 | 28 | return null; 29 | } 30 | 31 | public static List getAll(String names) { 32 | String[] feature_names = FormatUtils.P_COMMA_OR_SPACE.split(names); 33 | List features = new ArrayList(); 34 | 35 | for (String feature_name : feature_names) { 36 | AnnotationFeature feature = get(feature_name); 37 | if (feature != null) features.add(feature); 38 | } 39 | return features; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/AnnotationFeatureJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 15 | 16 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 17 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair; 18 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 19 | import edu.jhu.thrax.hadoop.jobs.DefaultValues; 20 | import edu.jhu.thrax.hadoop.jobs.ExtractionJob; 21 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 22 | 23 | public class AnnotationFeatureJob implements ThraxJob { 24 | 25 | public AnnotationFeatureJob() {} 26 | 27 | protected static HashSet> prereqs = 28 | new HashSet>(); 29 | 30 | public Set> getPrerequisites() { 31 | prereqs.add(ExtractionJob.class); 32 | return prereqs; 33 | } 34 | 35 | public static void addPrerequisites(Iterable> cs) { 36 | if (cs != null) 37 | for (Class c : cs) 38 | prereqs.add(c); 39 | } 40 | 41 | public static void addPrerequisite(Class c) { 42 | prereqs.add(c); 43 | } 44 | 45 | public String getOutputSuffix() { 46 | return getName(); 47 | } 48 
| 49 | public Job getJob(Configuration conf) throws IOException { 50 | String name = getName(); 51 | Job job = new Job(conf, name); 52 | job.setJarByClass(this.getClass()); 53 | 54 | job.setMapperClass(Mapper.class); 55 | job.setPartitionerClass(RuleWritable.YieldPartitioner.class); 56 | job.setReducerClass(AnnotationReducer.class); 57 | 58 | job.setInputFormatClass(SequenceFileInputFormat.class); 59 | job.setMapOutputKeyClass(RuleWritable.class); 60 | job.setMapOutputValueClass(Annotation.class); 61 | job.setOutputKeyClass(RuleWritable.class); 62 | job.setOutputValueClass(FeaturePair.class); 63 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 64 | 65 | int num_reducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS)); 66 | job.setNumReduceTasks(num_reducers); 67 | 68 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "rules")); 69 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "annotation")); 70 | return job; 71 | } 72 | 73 | @Override 74 | public String getName() { 75 | return "annotation"; 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/AnnotationPassthroughFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | import org.apache.hadoop.mapreduce.Reducer.Context; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 13 | 14 | @SuppressWarnings("rawtypes") 15 | public class AnnotationPassthroughFeature implements AnnotationFeature { 16 | 17 | public static final String NAME = "annotation"; 18 | 19 | public String getName() { 20 | return NAME; 21 | } 22 | 23 | public Annotation score(RuleWritable r, Annotation annotation) { 24 | return annotation; 25 | } 26 | 27 | public void unaryGlueRuleScore(int nt, Map map) { 28 | } 29 | 30 | public void binaryGlueRuleScore(int nt, Map map) { 31 | } 32 | 33 | @Override 34 | public void init(Context context) throws IOException, InterruptedException {} 35 | 36 | @Override 37 | public Set> getPrerequisites() { 38 | return null; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/AnnotationReducer.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.mapreduce.Reducer; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 11 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair; 12 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 13 | import edu.jhu.thrax.util.BackwardsCompatibility; 14 | import edu.jhu.thrax.util.Vocabulary; 15 | 16 | public class AnnotationReducer extends Reducer { 17 | 18 | private List annotationFeatures; 19 | 20 | public AnnotationReducer() {} 21 | 22 | protected void setup(Context context) throws IOException, InterruptedException { 23 | Configuration conf = context.getConfiguration(); 24 | String vocabulary_path = 
conf.getRaw("thrax.work-dir") + "vocabulary/part-*"; 25 | Vocabulary.initialize(conf, vocabulary_path); 26 | 27 | String features = BackwardsCompatibility.equivalent(conf.get("thrax.features", "")); 28 | 29 | // Paraphrasing only needs the annotation to be passed through. 30 | String type = conf.get("thrax.type", "translation"); 31 | if ("paraphrasing".equals(type)) { 32 | annotationFeatures = new ArrayList(); 33 | annotationFeatures.add(new AnnotationPassthroughFeature()); 34 | } else { 35 | annotationFeatures = AnnotationFeatureFactory.getAll(features); 36 | } 37 | 38 | for (AnnotationFeature af : annotationFeatures) 39 | af.init(context); 40 | } 41 | 42 | protected void reduce(RuleWritable key, Iterable values, Context context) 43 | throws IOException, InterruptedException { 44 | for (Annotation annotation : values) { 45 | for (AnnotationFeature f : annotationFeatures) { 46 | context.write(key, new FeaturePair(Vocabulary.id(f.getName()), f.score(key, annotation))); 47 | } 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/CountFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.util.Map; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.Writable; 8 | import org.apache.hadoop.mapreduce.Reducer.Context; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 13 | import edu.jhu.thrax.util.Vocabulary; 14 | 15 | @SuppressWarnings("rawtypes") 16 | public class CountFeature implements AnnotationFeature { 17 | 18 | public static final String NAME = "count"; 19 | 20 | private static final IntWritable ZERO = new IntWritable(0); 21 | 22 | public String getName() { 23 | return NAME; 24 | } 25 | 26 | public void unaryGlueRuleScore(int nt, Map map) { 27 | map.put(Vocabulary.id(NAME), ZERO); 28 | } 29 | 30 | public void binaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ZERO); 32 | } 33 | 34 | @Override 35 | public Writable score(RuleWritable r, Annotation annotation) { 36 | return new IntWritable(annotation.count()); 37 | } 38 | 39 | @Override 40 | public void init(Context context) {} 41 | 42 | @Override 43 | public Set> getPrerequisites() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/LogCountFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.util.Map; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | import org.apache.hadoop.io.Writable; 8 | import org.apache.hadoop.mapreduce.Reducer.Context; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 13 | import edu.jhu.thrax.util.Vocabulary; 14 | 15 | @SuppressWarnings("rawtypes") 16 | public class LogCountFeature implements AnnotationFeature { 17 | 18 | public static final String NAME = "logcount"; 19 | 20 | private static final FloatWritable ZERO = new FloatWritable(0); 21 | 22 | public String getName() { 23 | return NAME; 24 | } 25 | 26 | public void unaryGlueRuleScore(int nt, 
Map map) { 27 | map.put(Vocabulary.id(NAME), ZERO); 28 | } 29 | 30 | public void binaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ZERO); 32 | } 33 | 34 | @Override 35 | public Writable score(RuleWritable r, Annotation annotation) { 36 | return new FloatWritable((float) Math.log(annotation.count())); 37 | } 38 | 39 | @Override 40 | public void init(Context context) {} 41 | 42 | @Override 43 | public Set> getPrerequisites() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/RarityPenaltyFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.util.Map; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | import org.apache.hadoop.io.Writable; 8 | import org.apache.hadoop.mapreduce.Reducer.Context; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 13 | import edu.jhu.thrax.util.Vocabulary; 14 | 15 | @SuppressWarnings("rawtypes") 16 | public class RarityPenaltyFeature implements AnnotationFeature { 17 | 18 | public static final String NAME = "rarity"; 19 | 20 | private static final FloatWritable ZERO = new FloatWritable(0.0f); 21 | 22 | public String getName() { 23 | return NAME; 24 | } 25 | 26 | public void unaryGlueRuleScore(int nt, Map map) { 27 | map.put(Vocabulary.id(NAME), ZERO); 28 | } 29 | 30 | public void binaryGlueRuleScore(int nt, Map map) { 31 | map.put(Vocabulary.id(NAME), ZERO); 32 | } 33 | 34 | @Override 35 | public Writable score(RuleWritable r, Annotation annotation) { 36 | return new FloatWritable((float) Math.exp(1 - annotation.count())); 37 | } 38 | 39 | @Override 40 | public void init(Context context) {} 41 | 42 | @Override 43 | public Set> getPrerequisites() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/UnalignedSourceCounterFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Writable; 9 | import org.apache.hadoop.mapreduce.Reducer.Context; 10 | 11 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 12 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 13 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 14 | import edu.jhu.thrax.util.Vocabulary; 15 | 16 | @SuppressWarnings("rawtypes") 17 | public class UnalignedSourceCounterFeature implements AnnotationFeature { 18 | 19 | public static final String NAME = "unaligned_source"; 20 | 21 | private static final IntWritable ZERO = new IntWritable(0); 22 | 23 | public String getName() { 24 | return NAME; 25 | } 26 | 27 | public IntWritable score(RuleWritable r, Annotation annotation) { 28 | byte[] f2e = annotation.f2e().points; 29 | int[] src = r.source; 30 | 31 | int count = 0; 32 | int i = 0, j = 0; 33 | for (i = 0; i < src.length; ++i) { 34 | if (Vocabulary.nt(src[i])) continue; 35 | if (j >= f2e.length || i != f2e[j]) count++; 36 | while (j < f2e.length && f2e[j] <= i) 37 | j += 2; 38 | } 39 | return new IntWritable(count); 40 | } 41 | 42 | public void 
unaryGlueRuleScore(int nt, Map map) { 43 | map.put(Vocabulary.id(NAME), ZERO); 44 | } 45 | 46 | public void binaryGlueRuleScore(int nt, Map map) { 47 | map.put(Vocabulary.id(NAME), ZERO); 48 | } 49 | 50 | @Override 51 | public void init(Context context) throws IOException, InterruptedException {} 52 | 53 | @Override 54 | public Set> getPrerequisites() { 55 | return null; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/annotation/UnalignedTargetCounterFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.annotation; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Writable; 9 | import org.apache.hadoop.mapreduce.Reducer.Context; 10 | 11 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 12 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 13 | import edu.jhu.thrax.hadoop.jobs.ThraxJob; 14 | import edu.jhu.thrax.util.Vocabulary; 15 | 16 | @SuppressWarnings("rawtypes") 17 | public class UnalignedTargetCounterFeature implements AnnotationFeature { 18 | 19 | public static final String NAME = "unaligned_target"; 20 | 21 | private static final IntWritable ZERO = new IntWritable(0); 22 | 23 | public String getName() { 24 | return NAME; 25 | } 26 | 27 | public IntWritable score(RuleWritable r, Annotation annotation) { 28 | byte[] e2f = annotation.e2f().points; 29 | int[] tgt = r.target; 30 | 31 | int count = 0; 32 | int i = 0, j = 0; 33 | for (i = 0; i < tgt.length; ++i) { 34 | if (Vocabulary.nt(tgt[i])) continue; 35 | if (j >= e2f.length || i != e2f[j]) count++; 36 | while (j < e2f.length && e2f[j] <= i) 37 | j += 2; 38 | } 39 | return new IntWritable(count); 40 | } 41 | 42 | public void unaryGlueRuleScore(int nt, Map map) { 43 | map.put(Vocabulary.id(NAME), ZERO); 44 | } 45 | 46 | public void binaryGlueRuleScore(int nt, Map map) { 47 | map.put(Vocabulary.id(NAME), ZERO); 48 | } 49 | 50 | @Override 51 | public void init(Context context) throws IOException, InterruptedException {} 52 | 53 | @Override 54 | public Set> getPrerequisites() { 55 | return null; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/mapred/MapReduceFeatureFactory.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.mapred; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import edu.jhu.thrax.hadoop.features.annotation.CountFeature; 7 | import edu.jhu.thrax.util.FormatUtils; 8 | 9 | public class MapReduceFeatureFactory { 10 | 11 | public static MapReduceFeature get(String name) { 12 | if (name.equals(SourcePhraseGivenTargetFeature.NAME)) 13 | return new SourcePhraseGivenTargetFeature(); 14 | else if (name.equals(TargetPhraseGivenSourceFeature.NAME)) 15 | return new TargetPhraseGivenSourceFeature(); 16 | else if (name.equals(GoodTuringSmoothedTargetPhraseGivenSourceFeature.NAME)) 17 | return new GoodTuringSmoothedTargetPhraseGivenSourceFeature(); 18 | else if (name.equals(GoodTuringSmoothedSourcePhraseGivenTargetFeature.NAME)) 19 | return new GoodTuringSmoothedSourcePhraseGivenTargetFeature(); 20 | else if (name.equals(SourcePhraseGivenLHSFeature.NAME)) 21 | return new SourcePhraseGivenLHSFeature(); 22 | else if (name.equals(LhsGivenSourcePhraseFeature.NAME)) 
23 | return new LhsGivenSourcePhraseFeature(); 24 | else if (name.equals(SourcePhraseGivenTargetandLHSFeature.NAME)) 25 | return new SourcePhraseGivenTargetandLHSFeature(); 26 | else if (name.equals(TargetPhraseGivenSourceandLHSFeature.NAME)) 27 | return new TargetPhraseGivenSourceandLHSFeature(); 28 | else if (name.equals(TargetPhraseGivenLHSFeature.NAME)) 29 | return new TargetPhraseGivenLHSFeature(); 30 | else if (name.equals(LhsGivenTargetPhraseFeature.NAME)) 31 | return new LhsGivenTargetPhraseFeature(); 32 | else if (name.equals(SourceCountFeature.NAME)) 33 | return new SourceCountFeature(); 34 | else if (name.equals(TargetCountFeature.NAME)) 35 | return new TargetCountFeature(); 36 | 37 | return null; 38 | } 39 | 40 | public static List getAll(String names) { 41 | String[] feature_names = FormatUtils.P_COMMA_OR_SPACE.split(names); 42 | List features = new ArrayList(); 43 | 44 | for (String feature_name : feature_names) { 45 | MapReduceFeature feature = get(feature_name); 46 | if (feature != null) features.add(feature); 47 | } 48 | return features; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/mapred/coc/GoodTuringSmoother.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | package edu.jhu.thrax.hadoop.features.mapred.coc; 3 | 4 | public class GoodTuringSmoother { 5 | private CountOfCountsEstimator estimator; 6 | 7 | public GoodTuringSmoother(CountOfCountsEstimator estimator) { 8 | this.estimator = estimator; 9 | } 10 | 11 | public double smoothedCount(int count) { 12 | double turingFraction = estimator.getEstimatedCountOfCount(count + 1) / estimator.getEstimatedCountOfCount(count); 13 | return (count + 1) * turingFraction; 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/NonAggregatingPivotedFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.FloatWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.util.Vocabulary; 10 | 11 | public abstract class NonAggregatingPivotedFeature implements PivotedFeature { 12 | 13 | private static final FloatWritable ZERO = new FloatWritable(0.0f); 14 | 15 | private float value; 16 | 17 | public void initializeAggregation() { 18 | value = Float.MAX_VALUE; 19 | } 20 | 21 | public void aggregate(FeatureMap features) { 22 | FloatWritable val = (FloatWritable) features.get(getName()); 23 | if (value == Float.MAX_VALUE) { 24 | value = val.get(); 25 | } else { 26 | if (value != val.get()) { 27 | throw new RuntimeException("Diverging values in pseudo-aggregation: " + value + " versus " 28 | + val.get()); 29 | } 30 | } 31 | } 32 | 33 | public FloatWritable finalizeAggregation() { 34 | return new FloatWritable(value); 35 | } 36 | 37 | public void unaryGlueRuleScore(int nt, Map map) { 38 | map.put(Vocabulary.id(getName()), ZERO); 39 | } 40 | 41 | public void binaryGlueRuleScore(int nt, Map map) { 42 | map.put(Vocabulary.id(getName()), ZERO); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedAnnotationFeature.java: 
-------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | 9 | import edu.jhu.thrax.hadoop.datatypes.AlignmentWritable; 10 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 11 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 12 | import edu.jhu.thrax.hadoop.features.annotation.AlignmentFeature; 13 | import edu.jhu.thrax.hadoop.features.annotation.AnnotationPassthroughFeature; 14 | 15 | public class PivotedAnnotationFeature implements PivotedFeature { 16 | 17 | public static final String NAME = "annotation"; 18 | 19 | private Annotation aggregated = null; 20 | 21 | public String getName() { 22 | return NAME; 23 | } 24 | 25 | public Set getPrerequisites() { 26 | Set prereqs = new HashSet(); 27 | prereqs.add(AlignmentFeature.NAME); 28 | return prereqs; 29 | } 30 | 31 | public Annotation pivot(FeatureMap src, FeatureMap tgt) { 32 | AlignmentWritable src_f2e = ((AlignmentWritable) src.get(AlignmentFeature.NAME)); 33 | AlignmentWritable tgt_f2e = ((AlignmentWritable) tgt.get(AlignmentFeature.NAME)); 34 | 35 | return new Annotation(src_f2e.join(tgt_f2e)); 36 | } 37 | 38 | public void unaryGlueRuleScore(int nt, Map map) {} 39 | 40 | public void binaryGlueRuleScore(int nt, Map map) {} 41 | 42 | public void initializeAggregation() { 43 | aggregated = null; 44 | } 45 | 46 | public void aggregate(FeatureMap a) { 47 | Annotation annotation = (Annotation) a.get(AnnotationPassthroughFeature.NAME); 48 | if (aggregated == null) { 49 | aggregated = new Annotation(annotation); 50 | } else { 51 | aggregated.setAlignment(aggregated.f2e().intersect(annotation.f2e())); 52 | aggregated.merge(annotation); 53 | } 54 | } 55 | 56 | public Annotation finalizeAggregation() { 57 | return aggregated; 58 | } 59 | 60 | @Override 61 | public Set getLowerBoundLabels() { 62 | return null; 63 | } 64 | 65 | @Override 66 | public Set getUpperBoundLabels() { 67 | return null; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.Set; 4 | 5 | import org.apache.hadoop.io.Writable; 6 | 7 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 8 | import edu.jhu.thrax.hadoop.features.Feature; 9 | 10 | public interface PivotedFeature extends Feature { 11 | 12 | public Set getPrerequisites(); 13 | 14 | public Writable pivot(FeatureMap src, FeatureMap tgt); 15 | 16 | public void initializeAggregation(); 17 | 18 | public void aggregate(FeatureMap a); 19 | 20 | public Writable finalizeAggregation(); 21 | 22 | public Set getLowerBoundLabels(); 23 | 24 | public Set getUpperBoundLabels(); 25 | } 26 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedFeatureFactory.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import edu.jhu.thrax.util.FormatUtils; 7 | 8 | public class PivotedFeatureFactory { 9 | 10 | public static PivotedFeature get(String name) { 11 | if (name.equals(PivotedTargetPhraseGivenSourceFeature.NAME)) 12 | return new 
PivotedTargetPhraseGivenSourceFeature(); 13 | else if (name.equals(PivotedSourcePhraseGivenTargetFeature.NAME)) 14 | return new PivotedSourcePhraseGivenTargetFeature(); 15 | else if (name.equals(PivotedRarityPenaltyFeature.NAME)) 16 | return new PivotedRarityPenaltyFeature(); 17 | else if (name.equals(PivotedLexicalSourceGivenTargetFeature.NAME)) 18 | return new PivotedLexicalSourceGivenTargetFeature(); 19 | else if (name.equals(PivotedLexicalTargetGivenSourceFeature.NAME)) 20 | return new PivotedLexicalTargetGivenSourceFeature(); 21 | else if (name.equals(PivotedSourcePhraseGivenLHSFeature.NAME)) 22 | return new PivotedSourcePhraseGivenLHSFeature(); 23 | else if (name.equals(PivotedLhsGivenSourcePhraseFeature.NAME)) 24 | return new PivotedLhsGivenSourcePhraseFeature(); 25 | else if (name.equals(PivotedSourcePhraseGivenTargetAndLHSFeature.NAME)) 26 | return new PivotedSourcePhraseGivenTargetAndLHSFeature(); 27 | else if (name.equals(PivotedTargetPhraseGivenLHSFeature.NAME)) 28 | return new PivotedTargetPhraseGivenLHSFeature(); 29 | else if (name.equals(PivotedLhsGivenTargetPhraseFeature.NAME)) 30 | return new PivotedLhsGivenTargetPhraseFeature(); 31 | else if (name.equals(PivotedTargetPhraseGivenSourceAndLHSFeature.NAME)) 32 | return new PivotedTargetPhraseGivenSourceAndLHSFeature(); 33 | 34 | return null; 35 | } 36 | 37 | public static List getAll(String names) { 38 | String[] feature_names = FormatUtils.P_COMMA_OR_SPACE.split(names); 39 | List features = new ArrayList(); 40 | 41 | for (String feature_name : feature_names) { 42 | PivotedFeature feature = get(feature_name); 43 | if (feature != null) features.add(feature); 44 | } 45 | return features; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedLexicalSourceGivenTargetFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.annotation.SourceGivenTargetLexicalProbabilityFeature; 10 | import edu.jhu.thrax.hadoop.features.annotation.TargetGivenSourceLexicalProbabilityFeature; 11 | 12 | public class PivotedLexicalSourceGivenTargetFeature extends PivotedNegLogProbFeature { 13 | 14 | public static final String NAME = SourceGivenTargetLexicalProbabilityFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(SourceGivenTargetLexicalProbabilityFeature.NAME); 23 | prereqs.add(TargetGivenSourceLexicalProbabilityFeature.NAME); 24 | return prereqs; 25 | } 26 | 27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 28 | float egf = ((FloatWritable) tgt.get(TargetGivenSourceLexicalProbabilityFeature.NAME)).get(); 29 | float fge = ((FloatWritable) src.get(SourceGivenTargetLexicalProbabilityFeature.NAME)).get(); 30 | 31 | return new FloatWritable(egf + fge); 32 | } 33 | 34 | @Override 35 | public Set getLowerBoundLabels() { 36 | Set lower_bound_labels = new HashSet(); 37 | lower_bound_labels.add(TargetGivenSourceLexicalProbabilityFeature.NAME); 38 | lower_bound_labels.add(SourceGivenTargetLexicalProbabilityFeature.NAME); 39 | return lower_bound_labels; 40 | } 41 | 42 | @Override 43 | public Set getUpperBoundLabels() { 44 | return null; 45 | } 
46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedLexicalTargetGivenSourceFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.annotation.SourceGivenTargetLexicalProbabilityFeature; 10 | import edu.jhu.thrax.hadoop.features.annotation.TargetGivenSourceLexicalProbabilityFeature; 11 | 12 | public class PivotedLexicalTargetGivenSourceFeature extends PivotedNegLogProbFeature { 13 | 14 | public static final String NAME = TargetGivenSourceLexicalProbabilityFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(TargetGivenSourceLexicalProbabilityFeature.NAME); 23 | prereqs.add(SourceGivenTargetLexicalProbabilityFeature.NAME); 24 | return prereqs; 25 | } 26 | 27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 28 | float egf = ((FloatWritable) src.get(TargetGivenSourceLexicalProbabilityFeature.NAME)).get(); 29 | float fge = ((FloatWritable) tgt.get(SourceGivenTargetLexicalProbabilityFeature.NAME)).get(); 30 | 31 | return new FloatWritable(egf + fge); 32 | } 33 | 34 | @Override 35 | public Set getLowerBoundLabels() { 36 | Set lower_bound_labels = new HashSet(); 37 | lower_bound_labels.add(TargetGivenSourceLexicalProbabilityFeature.NAME); 38 | lower_bound_labels.add(SourceGivenTargetLexicalProbabilityFeature.NAME); 39 | return lower_bound_labels; 40 | } 41 | 42 | @Override 43 | public Set getUpperBoundLabels() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedLhsGivenSourcePhraseFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.LhsGivenSourcePhraseFeature; 10 | import edu.jhu.thrax.hadoop.features.mapred.LhsGivenTargetPhraseFeature; 11 | 12 | public class PivotedLhsGivenSourcePhraseFeature extends NonAggregatingPivotedFeature { 13 | 14 | public static final String NAME = LhsGivenSourcePhraseFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(LhsGivenTargetPhraseFeature.NAME); 23 | return prereqs; 24 | } 25 | 26 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 27 | return new FloatWritable(((FloatWritable) src.get(LhsGivenTargetPhraseFeature.NAME)).get()); 28 | } 29 | 30 | @Override 31 | public Set getLowerBoundLabels() { 32 | Set lower_bound_labels = new HashSet(); 33 | lower_bound_labels.add(LhsGivenTargetPhraseFeature.NAME); 34 | return lower_bound_labels; 35 | } 36 | 37 | @Override 38 | public Set getUpperBoundLabels() { 39 | return null; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedLhsGivenTargetPhraseFeature.java: 
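All of the pivot() implementations in this package follow the same identity: a paraphrase rule e1 -> e2 is built from two translation rules f -> e1 (the src feature map) and f -> e2 (the tgt feature map) that share a source phrase f, and because the scores are negative log probabilities, pivoting through a single f reduces to adding the two relevant scores; summing over all shared f happens later, during aggregation. A small sketch with invented numbers, not repository code:

// Hypothetical numbers illustrating why pivot() adds two -log probabilities.
public class PivotIdentityExample {
  public static void main(String[] args) {
    double negLogE2GivenF = 0.7;  // tgt rule f -> e2: -log p(e2 | f)
    double negLogFGivenE1 = 1.1;  // src rule f -> e1: -log p(f | e1)
    // p(e2 | e1) ~= p(e2 | f) * p(f | e1) for one shared f, a sum in -log space.
    double pivoted = negLogE2GivenF + negLogFGivenE1;
    System.out.println(pivoted + "  (p ~ " + Math.exp(-pivoted) + ")");
  }
}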
-------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.LhsGivenTargetPhraseFeature; 10 | 11 | public class PivotedLhsGivenTargetPhraseFeature extends NonAggregatingPivotedFeature { 12 | 13 | public static final String NAME = LhsGivenTargetPhraseFeature.NAME; 14 | 15 | public String getName() { 16 | return NAME; 17 | } 18 | 19 | public Set getPrerequisites() { 20 | Set prereqs = new HashSet(); 21 | prereqs.add(LhsGivenTargetPhraseFeature.NAME); 22 | return prereqs; 23 | } 24 | 25 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 26 | return new FloatWritable(((FloatWritable) tgt.get(LhsGivenTargetPhraseFeature.NAME)).get()); 27 | } 28 | 29 | @Override 30 | public Set getLowerBoundLabels() { 31 | Set lower_bound_labels = new HashSet(); 32 | lower_bound_labels.add(LhsGivenTargetPhraseFeature.NAME); 33 | return lower_bound_labels; 34 | } 35 | 36 | @Override 37 | public Set getUpperBoundLabels() { 38 | return null; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedNegLogProbFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.io.FloatWritable; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.util.NegLogMath; 10 | import edu.jhu.thrax.util.Vocabulary; 11 | 12 | public abstract class PivotedNegLogProbFeature implements PivotedFeature { 13 | 14 | private static final FloatWritable ONE_PROB = new FloatWritable(0.0f); 15 | 16 | private float aggregated; 17 | 18 | public void initializeAggregation() { 19 | aggregated = 64.0f; 20 | } 21 | 22 | public void aggregate(FeatureMap features) { 23 | FloatWritable val = (FloatWritable) features.get(getName()); 24 | aggregated = NegLogMath.logAdd(aggregated, val.get()); 25 | } 26 | 27 | public FloatWritable finalizeAggregation() { 28 | return new FloatWritable(aggregated); 29 | } 30 | 31 | public void unaryGlueRuleScore(int nt, Map map) { 32 | map.put(Vocabulary.id(getName()), ONE_PROB); 33 | } 34 | 35 | public void binaryGlueRuleScore(int nt, Map map) { 36 | map.put(Vocabulary.id(getName()), ONE_PROB); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedRarityPenaltyFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.io.FloatWritable; 8 | import org.apache.hadoop.io.Writable; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 11 | import edu.jhu.thrax.hadoop.features.annotation.RarityPenaltyFeature; 12 | import edu.jhu.thrax.util.Vocabulary; 13 | 14 | public class PivotedRarityPenaltyFeature implements PivotedFeature { 15 | 16 | public static final String NAME = RarityPenaltyFeature.NAME; 17 | 18 | private static final FloatWritable ZERO = new FloatWritable(0.0f); 19 | 20 | private static final float RENORMALIZE = (float) 
Math.exp(-1); 21 | 22 | private float aggregated_rp; 23 | 24 | public String getName() { 25 | return NAME; 26 | } 27 | 28 | public Set getPrerequisites() { 29 | Set prereqs = new HashSet(); 30 | prereqs.add(RarityPenaltyFeature.NAME); 31 | return prereqs; 32 | } 33 | 34 | public FloatWritable pivot(FeatureMap a, FeatureMap b) { 35 | float a_rp = ((FloatWritable) a.get(RarityPenaltyFeature.NAME)).get(); 36 | float b_rp = ((FloatWritable) b.get(RarityPenaltyFeature.NAME)).get(); 37 | return new FloatWritable(Math.max(a_rp, b_rp)); 38 | } 39 | 40 | public void unaryGlueRuleScore(int nt, Map map) { 41 | map.put(Vocabulary.id(NAME), ZERO); 42 | } 43 | 44 | public void binaryGlueRuleScore(int nt, Map map) { 45 | map.put(Vocabulary.id(NAME), ZERO); 46 | } 47 | 48 | public void initializeAggregation() { 49 | aggregated_rp = -1; 50 | } 51 | 52 | public void aggregate(FeatureMap a) { 53 | float rp = ((FloatWritable) a.get(NAME)).get(); 54 | if (aggregated_rp == -1) { 55 | aggregated_rp = rp; 56 | } else { 57 | // Rarity is exp(1 - count). To compute rarity over a sum of counts: 58 | // rarity_{1+2} = exp(1 - (count_1 + count_2)) = exp(1 - count_1) * exp(-count_2) = 59 | // = exp(1 - count_1) * exp(1 - count_2) * exp(-1) = rarity_1 * rarity_2 * exp(-1) 60 | aggregated_rp *= rp * RENORMALIZE; 61 | } 62 | } 63 | 64 | public FloatWritable finalizeAggregation() { 65 | return new FloatWritable(aggregated_rp); 66 | } 67 | 68 | @Override 69 | public Set getLowerBoundLabels() { 70 | Set lower_bound_labels = new HashSet(); 71 | lower_bound_labels.add(RarityPenaltyFeature.NAME); 72 | return lower_bound_labels; 73 | } 74 | 75 | @Override 76 | public Set getUpperBoundLabels() { 77 | return null; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedSourcePhraseGivenLHSFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenLHSFeature; 10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenLHSFeature; 11 | 12 | public class PivotedSourcePhraseGivenLHSFeature extends NonAggregatingPivotedFeature { 13 | 14 | public static final String NAME = SourcePhraseGivenLHSFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(TargetPhraseGivenLHSFeature.NAME); 23 | return prereqs; 24 | } 25 | 26 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 27 | return new FloatWritable(((FloatWritable) src.get(TargetPhraseGivenLHSFeature.NAME)).get()); 28 | } 29 | 30 | @Override 31 | public Set getLowerBoundLabels() { 32 | Set lower_bound_labels = new HashSet(); 33 | lower_bound_labels.add(TargetPhraseGivenLHSFeature.NAME); 34 | return lower_bound_labels; 35 | } 36 | 37 | @Override 38 | public Set getUpperBoundLabels() { 39 | return null; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedSourcePhraseGivenTargetAndLHSFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | 
import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenTargetandLHSFeature; 10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenSourceandLHSFeature; 11 | 12 | public class PivotedSourcePhraseGivenTargetAndLHSFeature extends PivotedNegLogProbFeature { 13 | 14 | public static final String NAME = SourcePhraseGivenTargetandLHSFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(SourcePhraseGivenTargetandLHSFeature.NAME); 23 | prereqs.add(TargetPhraseGivenSourceandLHSFeature.NAME); 24 | return prereqs; 25 | } 26 | 27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 28 | float fge = ((FloatWritable) src.get(TargetPhraseGivenSourceandLHSFeature.NAME)).get(); 29 | float egf = ((FloatWritable) tgt.get(SourcePhraseGivenTargetandLHSFeature.NAME)).get(); 30 | 31 | return new FloatWritable(egf + fge); 32 | } 33 | 34 | @Override 35 | public Set getLowerBoundLabels() { 36 | Set lower_bound_labels = new HashSet(); 37 | lower_bound_labels.add(TargetPhraseGivenSourceandLHSFeature.NAME); 38 | lower_bound_labels.add(SourcePhraseGivenTargetandLHSFeature.NAME); 39 | return lower_bound_labels; 40 | } 41 | 42 | @Override 43 | public Set getUpperBoundLabels() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedSourcePhraseGivenTargetFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenTargetFeature; 10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenSourceFeature; 11 | 12 | public class PivotedSourcePhraseGivenTargetFeature extends PivotedNegLogProbFeature { 13 | 14 | public static final String NAME = SourcePhraseGivenTargetFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(TargetPhraseGivenSourceFeature.NAME); 23 | prereqs.add(SourcePhraseGivenTargetFeature.NAME); 24 | return prereqs; 25 | } 26 | 27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 28 | float src_f = ((FloatWritable) src.get(TargetPhraseGivenSourceFeature.NAME)).get(); 29 | float f_tgt = ((FloatWritable) tgt.get(SourcePhraseGivenTargetFeature.NAME)).get(); 30 | 31 | return new FloatWritable(src_f + f_tgt); 32 | } 33 | 34 | @Override 35 | public Set getLowerBoundLabels() { 36 | Set lower_bound_labels = new HashSet(); 37 | lower_bound_labels.add(TargetPhraseGivenSourceFeature.NAME); 38 | lower_bound_labels.add(SourcePhraseGivenTargetFeature.NAME); 39 | return lower_bound_labels; 40 | } 41 | 42 | @Override 43 | public Set getUpperBoundLabels() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedTargetPhraseGivenLHSFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import 
java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenLHSFeature; 10 | 11 | public class PivotedTargetPhraseGivenLHSFeature extends NonAggregatingPivotedFeature { 12 | 13 | public static final String NAME = TargetPhraseGivenLHSFeature.NAME; 14 | 15 | public String getName() { 16 | return NAME; 17 | } 18 | 19 | public Set getPrerequisites() { 20 | Set prereqs = new HashSet(); 21 | prereqs.add(TargetPhraseGivenLHSFeature.NAME); 22 | return prereqs; 23 | } 24 | 25 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 26 | return new FloatWritable(((FloatWritable) tgt.get(TargetPhraseGivenLHSFeature.NAME)).get()); 27 | } 28 | 29 | @Override 30 | public Set getLowerBoundLabels() { 31 | Set lower_bound_labels = new HashSet(); 32 | lower_bound_labels.add(TargetPhraseGivenLHSFeature.NAME); 33 | return lower_bound_labels; 34 | } 35 | 36 | @Override 37 | public Set getUpperBoundLabels() { 38 | return null; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedTargetPhraseGivenSourceAndLHSFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenTargetandLHSFeature; 10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenSourceandLHSFeature; 11 | 12 | public class PivotedTargetPhraseGivenSourceAndLHSFeature extends PivotedNegLogProbFeature { 13 | 14 | public static final String NAME = TargetPhraseGivenSourceandLHSFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(TargetPhraseGivenSourceandLHSFeature.NAME); 23 | prereqs.add(SourcePhraseGivenTargetandLHSFeature.NAME); 24 | return prereqs; 25 | } 26 | 27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 28 | float fge = ((FloatWritable) tgt.get(TargetPhraseGivenSourceandLHSFeature.NAME)).get(); 29 | float egf = ((FloatWritable) src.get(SourcePhraseGivenTargetandLHSFeature.NAME)).get(); 30 | 31 | return new FloatWritable(egf + fge); 32 | } 33 | 34 | @Override 35 | public Set getLowerBoundLabels() { 36 | Set lower_bound_labels = new HashSet(); 37 | lower_bound_labels.add(TargetPhraseGivenSourceandLHSFeature.NAME); 38 | lower_bound_labels.add(SourcePhraseGivenTargetandLHSFeature.NAME); 39 | return lower_bound_labels; 40 | } 41 | 42 | @Override 43 | public Set getUpperBoundLabels() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/features/pivot/PivotedTargetPhraseGivenSourceFeature.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.features.pivot; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenTargetFeature; 10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenSourceFeature; 11 | 12 | 
public class PivotedTargetPhraseGivenSourceFeature extends PivotedNegLogProbFeature { 13 | 14 | public static final String NAME = TargetPhraseGivenSourceFeature.NAME; 15 | 16 | public String getName() { 17 | return NAME; 18 | } 19 | 20 | public Set getPrerequisites() { 21 | Set prereqs = new HashSet(); 22 | prereqs.add(TargetPhraseGivenSourceFeature.NAME); 23 | prereqs.add(SourcePhraseGivenTargetFeature.NAME); 24 | return prereqs; 25 | } 26 | 27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) { 28 | float tgt_f = ((FloatWritable) tgt.get(TargetPhraseGivenSourceFeature.NAME)).get(); 29 | float f_src = ((FloatWritable) src.get(SourcePhraseGivenTargetFeature.NAME)).get(); 30 | 31 | return new FloatWritable(tgt_f + f_src); 32 | } 33 | 34 | @Override 35 | public Set getLowerBoundLabels() { 36 | Set lower_bound_labels = new HashSet(); 37 | lower_bound_labels.add(TargetPhraseGivenSourceFeature.NAME); 38 | lower_bound_labels.add(SourcePhraseGivenTargetFeature.NAME); 39 | return lower_bound_labels; 40 | } 41 | 42 | @Override 43 | public Set getUpperBoundLabels() { 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/DefaultValues.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | public class DefaultValues { 4 | public static int DEFAULT_NUM_REDUCERS = 4; 5 | 6 | private DefaultValues() {}; 7 | } 8 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/DistributionalContextExtractionJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 15 | 16 | import edu.jhu.thrax.hadoop.distributional.ContextWritable; 17 | import edu.jhu.thrax.hadoop.distributional.DistributionalContextCombiner; 18 | import edu.jhu.thrax.hadoop.distributional.DistributionalContextMapper; 19 | import edu.jhu.thrax.hadoop.distributional.DistributionalContextReducer; 20 | import edu.jhu.thrax.hadoop.distributional.SignatureWritable; 21 | 22 | public class DistributionalContextExtractionJob implements ThraxJob { 23 | 24 | public Job getJob(Configuration conf) throws IOException { 25 | Job job = new Job(conf, "distributional"); 26 | 27 | job.setJarByClass(DistributionalContextMapper.class); 28 | 29 | job.setMapperClass(DistributionalContextMapper.class); 30 | job.setCombinerClass(DistributionalContextCombiner.class); 31 | job.setReducerClass(DistributionalContextReducer.class); 32 | 33 | job.setMapOutputKeyClass(Text.class); 34 | job.setMapOutputValueClass(ContextWritable.class); 35 | 36 | job.setOutputKeyClass(SignatureWritable.class); 37 | job.setOutputValueClass(NullWritable.class); 38 | 39 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 40 | 41 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS)); 42 | 
job.setNumReduceTasks(numReducers); 43 | 44 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.input-file"))); 45 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "signatures")); 46 | 47 | int max_split_size = conf.getInt("thrax.max-split-size", 0); 48 | if (max_split_size != 0) FileInputFormat.setMaxInputSplitSize(job, max_split_size); 49 | 50 | return job; 51 | } 52 | 53 | public String getName() { 54 | return "distributional"; 55 | } 56 | 57 | public String getOutputSuffix() { 58 | return null; 59 | } 60 | 61 | @Override 62 | public Set> getPrerequisites() { 63 | return new HashSet>(); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/DistributionalContextSortingJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 17 | 18 | import edu.jhu.thrax.hadoop.distributional.DistributionalContextMapper; 19 | import edu.jhu.thrax.hadoop.distributional.SignatureWritable; 20 | 21 | public class DistributionalContextSortingJob implements ThraxJob { 22 | 23 | private static HashSet> prereqs = 24 | new HashSet>(); 25 | 26 | public Job getJob(Configuration conf) throws IOException { 27 | Job job = new Job(conf, "sorting"); 28 | 29 | job.setJarByClass(DistributionalContextMapper.class); 30 | 31 | job.setMapperClass(Mapper.class); 32 | job.setReducerClass(Reducer.class); 33 | 34 | job.setInputFormatClass(SequenceFileInputFormat.class); 35 | 36 | job.setOutputKeyClass(SignatureWritable.class); 37 | job.setOutputValueClass(NullWritable.class); 38 | 39 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 40 | 41 | // TODO: Figure out how to make this workable with multiple reducers. Currently -getmerge-ing 42 | // multiple sequence file outputs from several reducers yields a broken file. 
43 | job.setNumReduceTasks(1); 44 | 45 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "signatures")); 46 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.outputPath", ""))); 47 | 48 | int max_split_size = conf.getInt("thrax.max-split-size", 0); 49 | if (max_split_size != 0) FileInputFormat.setMaxInputSplitSize(job, max_split_size); 50 | 51 | return job; 52 | } 53 | 54 | public String getName() { 55 | return "sorting"; 56 | } 57 | 58 | public Set> getPrerequisites() { 59 | prereqs.add(DistributionalContextExtractionJob.class); 60 | return prereqs; 61 | } 62 | 63 | public String getOutputSuffix() { 64 | return null; 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/ExtractionJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 13 | 14 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable; 15 | import edu.jhu.thrax.hadoop.datatypes.Annotation; 16 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 17 | import edu.jhu.thrax.hadoop.extraction.ExtractionCombiner; 18 | import edu.jhu.thrax.hadoop.extraction.ExtractionMapper; 19 | import edu.jhu.thrax.hadoop.extraction.ExtractionReducer; 20 | 21 | public class ExtractionJob implements ThraxJob { 22 | 23 | public Set> getPrerequisites() { 24 | Set> result = new HashSet>(); 25 | result.add(VocabularyJob.class); 26 | return result; 27 | } 28 | 29 | public Job getJob(Configuration conf) throws IOException { 30 | Job job = new Job(conf, "extraction"); 31 | job.setJarByClass(ExtractionMapper.class); 32 | 33 | job.setMapperClass(ExtractionMapper.class); 34 | job.setCombinerClass(ExtractionCombiner.class); 35 | job.setReducerClass(ExtractionReducer.class); 36 | 37 | job.setSortComparatorClass(AlignedRuleWritable.RuleYieldComparator.class); 38 | job.setPartitionerClass(AlignedRuleWritable.RuleYieldPartitioner.class); 39 | 40 | job.setMapOutputKeyClass(AlignedRuleWritable.class); 41 | job.setMapOutputValueClass(Annotation.class); 42 | job.setOutputKeyClass(RuleWritable.class); 43 | job.setOutputValueClass(Annotation.class); 44 | 45 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 46 | 47 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS)); 48 | job.setNumReduceTasks(numReducers); 49 | 50 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.input-file"))); 51 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0); 52 | if (maxSplitSize != 0) FileInputFormat.setMaxInputSplitSize(job, maxSplitSize); 53 | 54 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "rules")); 55 | 56 | return job; 57 | } 58 | 59 | // TODO: unify names of jobs and their output directories 60 | 61 | public String getName() { 62 | return "extraction"; 63 | } 64 | 65 | public String getOutputSuffix() { 66 | return "rules"; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- 
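A minimal hypothetical skeleton of the ThraxJob pattern that the job classes in this package follow: declare prerequisites, build a Hadoop Job rooted at thrax.work-dir, and honor the thrax.reducers / mapreduce.job.reduces settings. The class name, its output directory, and the identity map/reduce stages are invented for illustration; this is not part of the repository:

// Hypothetical example of the recurring ThraxJob pattern; not repository code.
package edu.jhu.thrax.hadoop.jobs;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import edu.jhu.thrax.hadoop.datatypes.Annotation;
import edu.jhu.thrax.hadoop.datatypes.RuleWritable;

public class ExampleJob implements ThraxJob {

  // Run only after rule extraction has produced the "rules" directory.
  public Set<Class<? extends ThraxJob>> getPrerequisites() {
    Set<Class<? extends ThraxJob>> result = new HashSet<Class<? extends ThraxJob>>();
    result.add(ExtractionJob.class);
    return result;
  }

  public Job getJob(Configuration conf) throws IOException {
    Job job = new Job(conf, getName());
    job.setJarByClass(ExampleJob.class);

    // Identity pass over the extracted rules, just to show the wiring.
    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(RuleWritable.class);
    job.setOutputValueClass(Annotation.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // Same reducer-count resolution used by the other jobs in this package.
    int numReducers = conf.getInt("thrax.reducers",
        conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS));
    job.setNumReduceTasks(numReducers);

    FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "rules"));
    FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + getOutputSuffix()));
    return job;
  }

  public String getName() {
    return "example";
  }

  public String getOutputSuffix() {
    return "example";
  }
}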
/src/edu/jhu/thrax/hadoop/jobs/FeatureCollectionJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 15 | 16 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 17 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair; 18 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 19 | import edu.jhu.thrax.hadoop.paraphrasing.FeatureCollectionReducer; 20 | 21 | public class FeatureCollectionJob implements ThraxJob { 22 | 23 | private static HashSet> prereqs = 24 | new HashSet>(); 25 | 26 | private static HashSet prereq_names = new HashSet(); 27 | 28 | public static void addPrerequisite(Class c) { 29 | prereqs.add(c); 30 | try { 31 | ThraxJob prereq; 32 | prereq = c.newInstance(); 33 | prereq_names.add(prereq.getOutputSuffix()); 34 | } catch (Exception e) { 35 | e.printStackTrace(); 36 | } 37 | } 38 | 39 | public Set> getPrerequisites() { 40 | prereqs.add(ExtractionJob.class); 41 | return prereqs; 42 | } 43 | 44 | public Job getJob(Configuration conf) throws IOException { 45 | Job job = new Job(conf, "collect"); 46 | 47 | String workDir = conf.get("thrax.work-dir"); 48 | 49 | job.setJarByClass(FeatureCollectionReducer.class); 50 | 51 | job.setMapperClass(Mapper.class); 52 | job.setReducerClass(FeatureCollectionReducer.class); 53 | 54 | job.setInputFormatClass(SequenceFileInputFormat.class); 55 | job.setMapOutputKeyClass(RuleWritable.class); 56 | job.setMapOutputValueClass(FeaturePair.class); 57 | job.setOutputKeyClass(RuleWritable.class); 58 | job.setOutputValueClass(FeatureMap.class); 59 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 60 | 61 | job.setPartitionerClass(RuleWritable.YieldPartitioner.class); 62 | 63 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS)); 64 | job.setNumReduceTasks(numReducers); 65 | 66 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0); 67 | if (maxSplitSize != 0) FileInputFormat.setMaxInputSplitSize(job, maxSplitSize * 20); 68 | 69 | for (String prereq_name : prereq_names) 70 | FileInputFormat.addInputPath(job, new Path(workDir + prereq_name)); 71 | 72 | // TODO: double-check this. 
73 | if (FileInputFormat.getInputPaths(job).length == 0) 74 | FileInputFormat.addInputPath(job, new Path(workDir + "rules")); 75 | 76 | String outputPath = workDir + "collected"; 77 | FileOutputFormat.setOutputPath(job, new Path(outputPath)); 78 | 79 | return job; 80 | } 81 | 82 | public String getName() { 83 | return "collect"; 84 | } 85 | 86 | public String getOutputSuffix() { 87 | return "collected"; 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/JobState.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | public enum JobState 4 | { 5 | PREREQ_FAILED, 6 | FAILED, 7 | READY, 8 | RUNNING, 9 | SUCCESS, 10 | WAITING, 11 | PLANNED 12 | } 13 | 14 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/ParaphraseAggregationJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.io.compress.GzipCodec; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 18 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 19 | import edu.jhu.thrax.hadoop.paraphrasing.AggregationCombiner; 20 | import edu.jhu.thrax.hadoop.paraphrasing.AggregationMapper; 21 | import edu.jhu.thrax.hadoop.paraphrasing.AggregationReducer; 22 | 23 | public class ParaphraseAggregationJob implements ThraxJob { 24 | 25 | private static HashSet> prereqs = 26 | new HashSet>(); 27 | 28 | public Job getJob(Configuration conf) throws IOException { 29 | Job job = new Job(conf, "aggregate"); 30 | 31 | job.setJarByClass(AggregationReducer.class); 32 | 33 | job.setMapperClass(AggregationMapper.class); 34 | job.setCombinerClass(AggregationCombiner.class); 35 | job.setReducerClass(AggregationReducer.class); 36 | 37 | job.setInputFormatClass(SequenceFileInputFormat.class); 38 | job.setMapOutputKeyClass(RuleWritable.class); 39 | job.setMapOutputValueClass(FeatureMap.class); 40 | job.setOutputKeyClass(Text.class); 41 | job.setOutputValueClass(NullWritable.class); 42 | 43 | job.setSortComparatorClass(RuleWritable.YieldComparator.class); 44 | job.setPartitionerClass(RuleWritable.FirstWordPartitioner.class); 45 | 46 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "pivoted")); 47 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0); 48 | if (maxSplitSize != 0) FileInputFormat.setMaxInputSplitSize(job, maxSplitSize * 20); 49 | 50 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS)); 51 | job.setNumReduceTasks(numReducers); 52 | 53 | String outputPath = conf.get("thrax.outputPath", ""); 54 | FileOutputFormat.setOutputPath(job, new Path(outputPath)); 55 | 56 | FileOutputFormat.setCompressOutput(job, true); 57 | FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); 58 | 59 | return job; 60 | } 61 | 
62 | public String getName() { 63 | return "aggregate"; 64 | } 65 | 66 | public static void addPrerequisite(Class c) { 67 | prereqs.add(c); 68 | } 69 | 70 | public Set> getPrerequisites() { 71 | prereqs.add(ParaphrasePivotingJob.class); 72 | return prereqs; 73 | } 74 | 75 | public String getOutputSuffix() { 76 | return null; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/ParaphrasePivotingJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 14 | 15 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 16 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 17 | import edu.jhu.thrax.hadoop.paraphrasing.PivotingMapper; 18 | import edu.jhu.thrax.hadoop.paraphrasing.PivotingReducer; 19 | 20 | public class ParaphrasePivotingJob implements ThraxJob { 21 | 22 | private static HashSet> prereqs = 23 | new HashSet>(); 24 | 25 | public static void addPrerequisite(Class c) { 26 | prereqs.add(c); 27 | } 28 | 29 | public Set> getPrerequisites() { 30 | prereqs.add(FeatureCollectionJob.class); 31 | return prereqs; 32 | } 33 | 34 | public Job getJob(Configuration conf) throws IOException { 35 | Job job = new Job(conf, "pivoting"); 36 | 37 | job.setJarByClass(PivotingReducer.class); 38 | 39 | job.setMapperClass(PivotingMapper.class); 40 | job.setReducerClass(PivotingReducer.class); 41 | 42 | job.setInputFormatClass(SequenceFileInputFormat.class); 43 | job.setMapOutputKeyClass(RuleWritable.class); 44 | job.setMapOutputValueClass(FeatureMap.class); 45 | job.setOutputKeyClass(RuleWritable.class); 46 | job.setOutputValueClass(FeatureMap.class); 47 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 48 | 49 | job.setPartitionerClass(RuleWritable.SourcePartitioner.class); 50 | 51 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "collected")); 52 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0); 53 | if (maxSplitSize != 0) FileInputFormat.setMaxInputSplitSize(job, maxSplitSize * 20); 54 | 55 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS)); 56 | job.setNumReduceTasks(numReducers); 57 | 58 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "pivoted")); 59 | 60 | return job; 61 | } 62 | 63 | public String getName() { 64 | return "pivoting"; 65 | } 66 | 67 | public String getOutputSuffix() { 68 | return "pivoted"; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/SchedulerException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | public class SchedulerException extends Exception 4 | { 5 | private static final long serialVersionUID = 9090L; 6 | 7 | public SchedulerException(String s) 8 | { 9 | super(s); 10 | } 11 | } 12 | 13 | 
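JobState and SchedulerException above are consumed by the Scheduler (src/edu/jhu/thrax/hadoop/jobs/Scheduler.java, not reproduced in this section). The following standalone sketch is only an illustration of how prerequisite states typically gate a WAITING job; it is not the repository's Scheduler logic:

// Hypothetical illustration; not the repository's Scheduler implementation.
import java.util.Arrays;
import java.util.List;

public class JobStateExample {

  enum JobState { PREREQ_FAILED, FAILED, READY, RUNNING, SUCCESS, WAITING, PLANNED }

  // A WAITING job becomes READY once every prerequisite reached SUCCESS, and is
  // marked PREREQ_FAILED if any prerequisite failed outright or transitively.
  static JobState next(JobState current, List<JobState> prereqStates) {
    if (current != JobState.WAITING) return current;
    if (prereqStates.contains(JobState.FAILED) || prereqStates.contains(JobState.PREREQ_FAILED))
      return JobState.PREREQ_FAILED;
    for (JobState s : prereqStates)
      if (s != JobState.SUCCESS) return JobState.WAITING;
    return JobState.READY;
  }

  public static void main(String[] args) {
    System.out.println(next(JobState.WAITING, Arrays.asList(JobState.SUCCESS, JobState.RUNNING)));  // WAITING
    System.out.println(next(JobState.WAITING, Arrays.asList(JobState.SUCCESS, JobState.SUCCESS)));  // READY
  }
}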
-------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/SourceWordGivenTargetWordProbabilityJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | public class SourceWordGivenTargetWordProbabilityJob extends WordLexprobJob { 11 | 12 | public SourceWordGivenTargetWordProbabilityJob() { 13 | super(true); 14 | } 15 | 16 | public Job getJob(Configuration conf) throws IOException { 17 | Job job = super.getJob(conf); 18 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "lexprobs_sgt")); 19 | return job; 20 | } 21 | 22 | public String getName() { 23 | return "source-word-lexprob"; 24 | } 25 | 26 | public String getOutputSuffix() { 27 | return "lexprobs_sgt"; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/TargetWordGivenSourceWordProbabilityJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | public class TargetWordGivenSourceWordProbabilityJob extends WordLexprobJob { 11 | 12 | public TargetWordGivenSourceWordProbabilityJob() { 13 | super(false); 14 | } 15 | 16 | public Job getJob(Configuration conf) throws IOException { 17 | Job job = super.getJob(conf); 18 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "lexprobs_tgs")); 19 | return job; 20 | } 21 | 22 | @Override 23 | public String getName() { 24 | return "target-word-lexprob"; 25 | } 26 | 27 | @Override 28 | public String getOutputSuffix() { 29 | return "lexprobs_tgs"; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/ThraxJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.Set; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.mapreduce.Job; 8 | 9 | public interface ThraxJob { 10 | 11 | public Job getJob(Configuration conf) throws IOException; 12 | 13 | public Set> getPrerequisites(); 14 | 15 | public String getName(); 16 | 17 | public String getOutputSuffix(); 18 | } 19 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/jobs/WordLexprobJob.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.jobs; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.FloatWritable; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.LongWritable; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import 
org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 15 | import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer; 16 | 17 | import edu.jhu.thrax.hadoop.features.WordLexicalProbabilityCalculator; 18 | 19 | public abstract class WordLexprobJob implements ThraxJob { 20 | public static final String SOURCE_GIVEN_TARGET = "thrax.__wordlexprob_sgt"; 21 | private boolean isSourceGivenTarget; 22 | 23 | public WordLexprobJob(boolean isSrcGivenTgt) { 24 | isSourceGivenTarget = isSrcGivenTgt; 25 | } 26 | 27 | public Set> getPrerequisites() { 28 | Set> result = new HashSet>(); 29 | result.add(VocabularyJob.class); 30 | return result; 31 | } 32 | 33 | public Job getJob(Configuration conf) throws IOException { 34 | Configuration theConf = new Configuration(conf); 35 | theConf.setBoolean(SOURCE_GIVEN_TARGET, isSourceGivenTarget); 36 | Job job = new Job(theConf, getName()); 37 | job.setJarByClass(WordLexicalProbabilityCalculator.class); 38 | job.setMapperClass(WordLexicalProbabilityCalculator.Map.class); 39 | job.setCombinerClass(IntSumReducer.class); 40 | 41 | job.setPartitionerClass(WordLexicalProbabilityCalculator.Partition.class); 42 | job.setReducerClass(WordLexicalProbabilityCalculator.Reduce.class); 43 | 44 | job.setMapOutputKeyClass(LongWritable.class); 45 | job.setMapOutputValueClass(IntWritable.class); 46 | 47 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS)); 48 | job.setNumReduceTasks(numReducers); 49 | 50 | job.setOutputKeyClass(LongWritable.class); 51 | job.setOutputValueClass(FloatWritable.class); 52 | 53 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 54 | 55 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.input-file"))); 56 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0); 57 | if (maxSplitSize != 0) { 58 | FileInputFormat.setMaxInputSplitSize(job, maxSplitSize); 59 | } 60 | return job; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/output/OutputReducer.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.output; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | import java.util.Map; 6 | import java.util.TreeMap; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.io.Writable; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | 14 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair; 15 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 16 | import edu.jhu.thrax.hadoop.features.SimpleFeature; 17 | import edu.jhu.thrax.hadoop.features.SimpleFeatureFactory; 18 | import edu.jhu.thrax.util.BackwardsCompatibility; 19 | import edu.jhu.thrax.util.FormatUtils; 20 | import edu.jhu.thrax.util.Vocabulary; 21 | 22 | public class OutputReducer extends Reducer { 23 | 24 | private boolean label; 25 | private boolean sparse; 26 | 27 | private List simpleFeatures; 28 | 29 | protected void setup(Context context) throws IOException, InterruptedException { 30 | Configuration conf = context.getConfiguration(); 31 | String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*"; 32 | Vocabulary.initialize(conf, vocabulary_path); 33 | 34 | label = conf.getBoolean("thrax.label-feature-scores", true); 35 | sparse = conf.getBoolean("thrax.sparse-feature-vectors", false); 36 | 37 | String features = 
BackwardsCompatibility.equivalent(conf.get("thrax.features", "")); 38 | simpleFeatures = SimpleFeatureFactory.getAll(features); 39 | } 40 | 41 | protected void reduce(RuleWritable key, Iterable values, Context context) 42 | throws IOException, InterruptedException { 43 | Map features = new TreeMap(); 44 | for (FeaturePair fp : values) 45 | features.put(Vocabulary.word(fp.key), fp.val.get()); 46 | for (SimpleFeature feature : simpleFeatures) 47 | features.put(feature.getName(), feature.score(key)); 48 | context.write(FormatUtils.ruleToText(key, features, label, sparse), NullWritable.get()); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/paraphrasing/AggregationCombiner.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.paraphrasing; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.mapreduce.Reducer; 9 | 10 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 12 | import edu.jhu.thrax.hadoop.features.annotation.AnnotationFeature; 13 | import edu.jhu.thrax.hadoop.features.annotation.AnnotationFeatureFactory; 14 | import edu.jhu.thrax.hadoop.features.pivot.PivotedAnnotationFeature; 15 | import edu.jhu.thrax.hadoop.features.pivot.PivotedFeature; 16 | import edu.jhu.thrax.hadoop.features.pivot.PivotedFeatureFactory; 17 | import edu.jhu.thrax.util.BackwardsCompatibility; 18 | import edu.jhu.thrax.util.FormatUtils; 19 | import edu.jhu.thrax.util.Vocabulary; 20 | 21 | public class AggregationCombiner 22 | extends Reducer { 23 | 24 | private List pivotedFeatures; 25 | 26 | protected void setup(Context context) throws IOException, InterruptedException { 27 | Configuration conf = context.getConfiguration(); 28 | String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*"; 29 | Vocabulary.initialize(conf, vocabulary_path); 30 | 31 | pivotedFeatures = new ArrayList(); 32 | List annotationFeatures = new ArrayList(); 33 | 34 | String features = BackwardsCompatibility.equivalent(conf.get("thrax.features", "")); 35 | for (String f_name : FormatUtils.P_COMMA_OR_SPACE.split(features)) { 36 | PivotedFeature pf = PivotedFeatureFactory.get(f_name); 37 | if (pf != null) { 38 | pivotedFeatures.add(pf); 39 | } else { 40 | AnnotationFeature af = AnnotationFeatureFactory.get(f_name); 41 | if (af != null) { 42 | annotationFeatures.add(af); 43 | } 44 | } 45 | } 46 | if (!annotationFeatures.isEmpty()) pivotedFeatures.add(new PivotedAnnotationFeature()); 47 | } 48 | 49 | protected void reduce(RuleWritable key, Iterable values, Context context) 50 | throws IOException, InterruptedException { 51 | FeatureMap merged = new FeatureMap(); 52 | 53 | for (PivotedFeature feature : pivotedFeatures) 54 | feature.initializeAggregation(); 55 | for (FeatureMap feature_map : values) { 56 | for (PivotedFeature feature : pivotedFeatures) { 57 | try { 58 | feature.aggregate(feature_map); 59 | } catch (Exception e) { 60 | throw new RuntimeException(key.toString() + " on " + feature.getName() + ": " 61 | + e.getMessage()); 62 | } 63 | } 64 | } 65 | for (PivotedFeature feature : pivotedFeatures) 66 | merged.put(feature.getName(), feature.finalizeAggregation()); 67 | context.write(key, merged); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- 
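AggregationCombiner above only orchestrates the initializeAggregation() / aggregate() / finalizeAggregation() calls; the arithmetic lives in the pivoted features themselves. A self-contained sketch of that arithmetic with made-up numbers, assuming NegLogMath.logAdd adds two probabilities stored as negative logs (this standalone class is illustrative only, not repository code):

// Illustrative arithmetic only; the values and this class are hypothetical.
public class PivotAggregationArithmetic {

  // Assumed semantics of NegLogMath.logAdd: add probabilities kept as -log p.
  static double logAdd(double a, double b) {
    return -Math.log(Math.exp(-a) + Math.exp(-b));
  }

  public static void main(String[] args) {
    // PivotedNegLogProbFeature: two pivot phrases produce -log p = 2.0 and 3.0
    // for the same paraphrase rule; aggregation sums the underlying probabilities.
    System.out.println(logAdd(2.0, 3.0));  // ~1.69, i.e. p ~ 0.185

    // PivotedRarityPenaltyFeature: rarity = exp(1 - count), so combining the
    // counts of two instances multiplies the rarities and renormalizes by exp(-1),
    // exactly as the comment in that class derives.
    double rarity1 = Math.exp(1 - 3);  // count 3
    double rarity2 = Math.exp(1 - 5);  // count 5
    double combined = rarity1 * rarity2 * Math.exp(-1);
    System.out.println(combined + " vs " + Math.exp(1 - 8));  // equal up to rounding: acts like count 8
  }
}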
/src/edu/jhu/thrax/hadoop/paraphrasing/AggregationMapper.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.paraphrasing; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 10 | import edu.jhu.thrax.util.Vocabulary; 11 | 12 | public class AggregationMapper extends Mapper { 13 | 14 | protected void setup(Context context) throws IOException, InterruptedException { 15 | Configuration conf = context.getConfiguration(); 16 | Vocabulary.initialize(conf); 17 | } 18 | 19 | protected void map(RuleWritable key, FeatureMap value, Context context) throws IOException, 20 | InterruptedException { 21 | context.write(key, value); 22 | context.progress(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/paraphrasing/FeatureCollectionReducer.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.paraphrasing; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair; 10 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 11 | import edu.jhu.thrax.util.Vocabulary; 12 | 13 | public class FeatureCollectionReducer 14 | extends Reducer { 15 | 16 | protected void setup(Context context) throws IOException, InterruptedException { 17 | Configuration conf = context.getConfiguration(); 18 | String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*"; 19 | Vocabulary.initialize(conf, vocabulary_path); 20 | } 21 | 22 | protected void reduce(RuleWritable key, Iterable values, Context context) 23 | throws IOException, InterruptedException { 24 | FeatureMap features = new FeatureMap(); 25 | for (FeaturePair fp : values) 26 | features.put(fp.key, fp.val.get()); 27 | context.write(key, features); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/paraphrasing/PivotingMapper.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.paraphrasing; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap; 9 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 10 | import edu.jhu.thrax.util.Vocabulary; 11 | 12 | public class PivotingMapper extends Mapper { 13 | 14 | protected void setup(Context context) throws IOException, InterruptedException { 15 | Configuration conf = context.getConfiguration(); 16 | Vocabulary.initialize(conf); 17 | } 18 | 19 | protected void map(RuleWritable key, FeatureMap value, Context context) throws IOException, 20 | InterruptedException { 21 | context.write(key, value); 22 | context.progress(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/tools/ExtractionTool.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.tools; 2 | 3 | import java.util.Map; 4 | 5 | import 
org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.conf.Configured; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer; 14 | import org.apache.hadoop.util.Tool; 15 | import org.apache.hadoop.util.ToolRunner; 16 | 17 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 18 | import edu.jhu.thrax.hadoop.extraction.ExtractionMapper; 19 | import edu.jhu.thrax.util.ConfFileParser; 20 | 21 | public class ExtractionTool extends Configured implements Tool 22 | { 23 | public int run(String [] argv) throws Exception 24 | { 25 | if (argv.length < 1) { 26 | System.err.println("USAGE: ExtractionTool "); 27 | return 1; 28 | } 29 | String thraxConf = argv[0]; 30 | Configuration conf = getConf(); 31 | 32 | Map options = ConfFileParser.parse(thraxConf); 33 | for (String opt : options.keySet()) { 34 | conf.set("thrax." + opt, options.get(opt)); 35 | } 36 | String inputPath = conf.get("thrax.input-file"); 37 | if (inputPath == null) { 38 | System.err.println("Set input-file key in conf file " + thraxConf + "!"); 39 | return 1; 40 | } 41 | String workDir = conf.get("thrax.work-dir"); 42 | if (workDir == null) { 43 | System.err.println("Set work-dir key in conf file " + thraxConf + "!"); 44 | return 1; 45 | } 46 | 47 | Job job = new Job(conf, "thrax"); 48 | job.setJarByClass(ExtractionMapper.class); 49 | job.setMapperClass(ExtractionMapper.class); 50 | job.setCombinerClass(IntSumReducer.class); 51 | job.setReducerClass(IntSumReducer.class); 52 | 53 | job.setMapOutputKeyClass(RuleWritable.class); 54 | job.setMapOutputValueClass(IntWritable.class); 55 | 56 | job.setOutputKeyClass(RuleWritable.class); 57 | job.setOutputValueClass(IntWritable.class); 58 | 59 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 60 | 61 | FileInputFormat.setInputPaths(job, new Path(inputPath)); 62 | if (!workDir.endsWith(Path.SEPARATOR)) 63 | workDir += Path.SEPARATOR; 64 | FileOutputFormat.setOutputPath(job, new Path(workDir + "rules")); 65 | 66 | job.submit(); 67 | return 0; 68 | } 69 | 70 | public static void main(String [] argv) throws Exception 71 | { 72 | int exit_code = ToolRunner.run(null, new ExtractionTool(), argv); 73 | System.exit(exit_code); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/tools/FeatureTool.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.tools; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.conf.Configured; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 14 | import org.apache.hadoop.util.Tool; 15 | import org.apache.hadoop.util.ToolRunner; 16 | 17 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 18 | import 
edu.jhu.thrax.hadoop.features.mapred.MapReduceFeature; 19 | import edu.jhu.thrax.hadoop.features.mapred.MapReduceFeatureFactory; 20 | import edu.jhu.thrax.util.ConfFileParser; 21 | 22 | public class FeatureTool extends Configured implements Tool 23 | { 24 | public int run(String [] argv) throws Exception 25 | { 26 | if (argv.length < 2) { 27 | System.err.println("usage: FeatureTool "); 28 | return 1; 29 | } 30 | String confFile = argv[0]; 31 | String featureName = argv[1]; 32 | MapReduceFeature f = MapReduceFeatureFactory.get(featureName); 33 | if (!(f instanceof MapReduceFeature)) { 34 | System.err.println("Not a MapReduceFeature: " + featureName); 35 | return 1; 36 | } 37 | Configuration conf = getConf(); 38 | Map options = ConfFileParser.parse(confFile); 39 | for (String opt : options.keySet()) { 40 | conf.set("thrax." + opt, options.get(opt)); 41 | } 42 | String workDir = conf.get("thrax.work-dir"); 43 | if (workDir == null) { 44 | System.err.println("set work-dir key in conf file " + confFile + "!"); 45 | return 1; 46 | } 47 | if (!workDir.endsWith(Path.SEPARATOR)) { 48 | workDir += Path.SEPARATOR; 49 | conf.set("thrax.work-dir", workDir); 50 | } 51 | Job job = new Job(conf, String.format("thrax-%s", featureName)); 52 | 53 | job.setJarByClass(f.getClass()); 54 | job.setMapperClass(f.mapperClass()); 55 | job.setCombinerClass(f.combinerClass()); 56 | job.setSortComparatorClass(f.sortComparatorClass()); 57 | job.setPartitionerClass(f.partitionerClass()); 58 | job.setReducerClass(f.reducerClass()); 59 | 60 | job.setInputFormatClass(SequenceFileInputFormat.class); 61 | 62 | job.setMapOutputKeyClass(RuleWritable.class); 63 | job.setMapOutputValueClass(IntWritable.class); 64 | 65 | job.setOutputKeyClass(RuleWritable.class); 66 | job.setOutputValueClass(IntWritable.class); 67 | 68 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 69 | 70 | FileInputFormat.setInputPaths(job, new Path(workDir + "rules")); 71 | FileOutputFormat.setOutputPath(job, new Path(workDir + featureName)); 72 | 73 | job.submit(); 74 | return 0; 75 | } 76 | 77 | public static void main(String [] argv) throws Exception 78 | { 79 | int exit_code = ToolRunner.run(null, new FeatureTool(), argv); 80 | System.exit(exit_code); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/tools/OutputTool.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.tools; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.conf.Configured; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.NullWritable; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.util.Tool; 15 | import org.apache.hadoop.util.ToolRunner; 16 | 17 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable; 18 | import edu.jhu.thrax.hadoop.features.mapred.MapReduceFeature; 19 | import edu.jhu.thrax.hadoop.features.mapred.MapReduceFeatureFactory; 20 | import edu.jhu.thrax.hadoop.output.OutputReducer; 21 | import edu.jhu.thrax.util.BackwardsCompatibility; 22 | import edu.jhu.thrax.util.ConfFileParser; 23 | import edu.jhu.thrax.util.FormatUtils; 24 | 25 | public class OutputTool 
extends Configured implements Tool 26 | { 27 | public int run(String [] argv) throws Exception 28 | { 29 | if (argv.length < 1) { 30 | System.err.println("usage: OutputTool "); 31 | return 1; 32 | } 33 | String confFile = argv[0]; 34 | Map options = ConfFileParser.parse(confFile); 35 | Configuration conf = getConf(); 36 | for (String opt : options.keySet()) { 37 | conf.set("thrax." + opt, options.get(opt)); 38 | } 39 | String workDir = conf.get("thrax.work-dir"); 40 | if (workDir == null) { 41 | System.err.println("Set work-dir key in conf file " + confFile + "!"); 42 | return 1; 43 | } 44 | if (!workDir.endsWith(Path.SEPARATOR)) { 45 | workDir += Path.SEPARATOR; 46 | conf.set("thrax.work-dir", workDir); 47 | } 48 | Job job = new Job(conf, "thrax-collect"); 49 | job.setJarByClass(OutputReducer.class); 50 | 51 | job.setMapperClass(Mapper.class); 52 | job.setReducerClass(OutputReducer.class); 53 | 54 | job.setInputFormatClass(SequenceFileInputFormat.class); 55 | 56 | job.setMapOutputKeyClass(RuleWritable.class); 57 | job.setMapOutputValueClass(NullWritable.class); 58 | 59 | job.setOutputKeyClass(RuleWritable.class); 60 | job.setOutputValueClass(NullWritable.class); 61 | 62 | String features = BackwardsCompatibility.equivalent(conf.get("thrax.features", "")); 63 | for (String feature : FormatUtils.P_SPACE.split(features)) { 64 | if (MapReduceFeatureFactory.get(feature) instanceof MapReduceFeature) { 65 | FileInputFormat.addInputPath(job, new Path(workDir + feature)); 66 | } 67 | } 68 | if (FileInputFormat.getInputPaths(job).length == 0) 69 | FileInputFormat.addInputPath(job, new Path(workDir + "rules")); 70 | FileOutputFormat.setOutputPath(job, new Path(workDir + "final")); 71 | 72 | job.submit(); 73 | return 0; 74 | } 75 | 76 | public static void main(String [] argv) throws Exception 77 | { 78 | int exit_code = ToolRunner.run(null, new OutputTool(), argv); 79 | System.exit(exit_code); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/tools/SourceWordGivenTargetWordProbabilityTool.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.tools; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.conf.Configured; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.FloatWritable; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer; 15 | import org.apache.hadoop.util.Tool; 16 | import org.apache.hadoop.util.ToolRunner; 17 | 18 | import edu.jhu.thrax.hadoop.datatypes.TextPair; 19 | import edu.jhu.thrax.hadoop.features.WordLexicalProbabilityCalculator; 20 | import edu.jhu.thrax.hadoop.jobs.WordLexprobJob; 21 | import edu.jhu.thrax.util.ConfFileParser; 22 | 23 | public class SourceWordGivenTargetWordProbabilityTool extends Configured implements Tool 24 | { 25 | public int run(String [] argv) throws Exception 26 | { 27 | if (argv.length < 1) { 28 | System.err.println("usage: SourceWordGivenTargetWordProbabilityTool "); 29 | return 1; 30 | } 31 | String confFile = argv[0]; 32 | Configuration conf = getConf(); 33 | Map options = ConfFileParser.parse(confFile); 34 | for (String 
opt : options.keySet()) { 35 | conf.set("thrax." + opt, options.get(opt)); 36 | } 37 | String input = conf.get("thrax.input-file"); 38 | if (input == null) { 39 | System.err.println("set input-file key in conf file " + confFile + "!"); 40 | return 1; 41 | } 42 | String workDir = conf.get("thrax.work-dir"); 43 | if (workDir == null) { 44 | System.err.println("set work-dir key in conf file " + confFile + "!"); 45 | return 1; 46 | } 47 | if (!workDir.endsWith(Path.SEPARATOR)) { 48 | workDir += Path.SEPARATOR; 49 | conf.set("thrax.work-dir", workDir); 50 | } 51 | conf.setBoolean(WordLexprobJob.SOURCE_GIVEN_TARGET, true); 52 | Job job = new Job(conf, "thrax-sgt-word-lexprob"); 53 | 54 | job.setJarByClass(WordLexicalProbabilityCalculator.class); 55 | job.setMapperClass(WordLexicalProbabilityCalculator.Map.class); 56 | job.setCombinerClass(IntSumReducer.class); 57 | job.setSortComparatorClass(TextPair.SndMarginalComparator.class); 58 | job.setPartitionerClass(WordLexicalProbabilityCalculator.Partition.class); 59 | job.setReducerClass(WordLexicalProbabilityCalculator.Reduce.class); 60 | 61 | job.setMapOutputKeyClass(TextPair.class); 62 | job.setMapOutputValueClass(IntWritable.class); 63 | 64 | job.setOutputKeyClass(TextPair.class); 65 | job.setOutputValueClass(FloatWritable.class); 66 | 67 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 68 | 69 | FileInputFormat.setInputPaths(job, new Path(input)); 70 | FileOutputFormat.setOutputPath(job, new Path(workDir + "lexprobs_sgt")); 71 | 72 | job.submit(); 73 | return 0; 74 | } 75 | 76 | public static void main(String [] argv) throws Exception 77 | { 78 | int exit_code = ToolRunner.run(null, new SourceWordGivenTargetWordProbabilityTool(), argv); 79 | System.exit(exit_code); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/hadoop/tools/TargetWordGivenSourceWordProbabilityTool.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.hadoop.tools; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.conf.Configured; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.FloatWritable; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer; 15 | import org.apache.hadoop.util.Tool; 16 | import org.apache.hadoop.util.ToolRunner; 17 | 18 | import edu.jhu.thrax.hadoop.datatypes.TextPair; 19 | import edu.jhu.thrax.hadoop.features.WordLexicalProbabilityCalculator; 20 | import edu.jhu.thrax.hadoop.jobs.WordLexprobJob; 21 | import edu.jhu.thrax.util.ConfFileParser; 22 | 23 | public class TargetWordGivenSourceWordProbabilityTool extends Configured implements Tool 24 | { 25 | public int run(String [] argv) throws Exception 26 | { 27 | if (argv.length < 1) { 28 | System.err.println("usage: TargetWordGivenSourceWordProbabilityTool "); 29 | return 1; 30 | } 31 | String confFile = argv[0]; 32 | Configuration conf = getConf(); 33 | Map options = ConfFileParser.parse(confFile); 34 | for (String opt : options.keySet()) { 35 | conf.set("thrax." 
+ opt, options.get(opt)); 36 | } 37 | String input = conf.get("thrax.input-file"); 38 | if (input == null) { 39 | System.err.println("set input-file key in conf file " + confFile + "!"); 40 | return 1; 41 | } 42 | String workDir = conf.get("thrax.work-dir"); 43 | if (workDir == null) { 44 | System.err.println("set work-dir key in conf file " + confFile + "!"); 45 | return 1; 46 | } 47 | if (!workDir.endsWith(Path.SEPARATOR)) { 48 | workDir += Path.SEPARATOR; 49 | conf.set("thrax.work-dir", workDir); 50 | } 51 | conf.setBoolean(WordLexprobJob.SOURCE_GIVEN_TARGET, false); 52 | Job job = new Job(conf, "thrax-tgs-word-lexprob"); 53 | 54 | job.setJarByClass(WordLexicalProbabilityCalculator.class); 55 | job.setMapperClass(WordLexicalProbabilityCalculator.Map.class); 56 | job.setCombinerClass(IntSumReducer.class); 57 | job.setSortComparatorClass(TextPair.SndMarginalComparator.class); 58 | job.setPartitionerClass(WordLexicalProbabilityCalculator.Partition.class); 59 | job.setReducerClass(WordLexicalProbabilityCalculator.Reduce.class); 60 | 61 | job.setMapOutputKeyClass(TextPair.class); 62 | job.setMapOutputValueClass(IntWritable.class); 63 | 64 | job.setOutputKeyClass(TextPair.class); 65 | job.setOutputValueClass(FloatWritable.class); 66 | 67 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 68 | 69 | FileInputFormat.setInputPaths(job, new Path(input)); 70 | FileOutputFormat.setOutputPath(job, new Path(workDir + "lexprobs_tgs")); 71 | 72 | job.submit(); 73 | return 0; 74 | } 75 | 76 | public static void main(String [] argv) throws Exception 77 | { 78 | int exit_code = ToolRunner.run(null, new TargetWordGivenSourceWordProbabilityTool(), argv); 79 | System.exit(exit_code); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/lexprob/HashMapLexprobTable.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.lexprob; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | 8 | public class HashMapLexprobTable extends SequenceFileLexprobTable { 9 | private HashMap table; 10 | 11 | public HashMapLexprobTable(Configuration conf, String fileGlob) throws IOException { 12 | super(conf, fileGlob); 13 | Iterable entries = getSequenceFileIterator(fs, conf, files); 14 | initialize(entries); 15 | } 16 | 17 | public void initialize(Iterable entries) { 18 | table = new HashMap(); 19 | for (TableEntry te : entries) { 20 | table.put((((long) te.car << 32) | te.cdr), te.probability); 21 | if (table.size() % 1000 == 0) System.err.printf("[%d]\n", table.size()); 22 | } 23 | } 24 | 25 | public float get(int car, int cdr) { 26 | long pair = (((long) car << 32) | cdr); 27 | if (table.containsKey(pair)) return table.get(pair); 28 | return -1.0f; 29 | } 30 | 31 | public boolean contains(int car, int cdr) { 32 | long pair = (((long) car << 32) | cdr); 33 | return table.containsKey(pair); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/lexprob/LexicalProbabilityTable.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.lexprob; 2 | 3 | 4 | /** 5 | * A data structure holding word-level lexical probabilities. The table only needs to support two 6 | * operations: determining whether a particular pair is present in the table, and returning the 7 | * probability associated with the pair. 
8 | */ 9 | public interface LexicalProbabilityTable { 10 | /** 11 | * Return the lexical probability of a source language word given a target language word. 12 | * 13 | * @param source the source language word 14 | * @param target the target language word 15 | * @return the probability p(source|target) if present, -1 otherwise 16 | */ 17 | public float logpSourceGivenTarget(int source, int target); 18 | 19 | // TODO: these don't actually return -logp, they return p. 20 | 21 | /** 22 | * Return the lexical probability of a target language word given a source language word. 23 | * 24 | * @param source the source language word 25 | * @param target the target language word 26 | * @return the probability p(target|source) is present, -1 otherwise 27 | */ 28 | public float logpTargetGivenSource(int source, int target); 29 | } 30 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/lexprob/LexprobTest.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.lexprob; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.conf.Configured; 5 | import org.apache.hadoop.util.Tool; 6 | import org.apache.hadoop.util.ToolRunner; 7 | 8 | public class LexprobTest extends Configured implements Tool { 9 | public int run(String[] argv) throws Exception { 10 | if (argv.length < 1) { 11 | System.err.println("usage: LexprobTest "); 12 | return 1; 13 | } 14 | 15 | Configuration conf = getConf(); 16 | HashMapLexprobTable t = new HashMapLexprobTable(conf, argv[0]); 17 | System.err.println("HashMap populated: " + t.toString()); 18 | TrieLexprobTable trie = new TrieLexprobTable(conf, argv[0]); 19 | System.err.println("Trie populated: " + trie.toString()); 20 | return 0; 21 | } 22 | 23 | public static void main(String[] argv) throws Exception { 24 | ToolRunner.run(null, new LexprobTest(), argv); 25 | return; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/lexprob/TableEntry.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.lexprob; 2 | 3 | import org.apache.hadoop.io.FloatWritable; 4 | import org.apache.hadoop.io.LongWritable; 5 | 6 | import edu.jhu.thrax.util.Vocabulary; 7 | 8 | public class TableEntry { 9 | 10 | public final int car; 11 | public final int cdr; 12 | public final float probability; 13 | 14 | public TableEntry(LongWritable pair, FloatWritable d) { 15 | int first = (int) (pair.get() >> 32); 16 | car = (first < 0 ? 
Vocabulary.getUnknownId() : first); 17 | cdr = (int) pair.get(); 18 | probability = d.get(); 19 | } 20 | 21 | public String toString() { 22 | return String.format("(%s,%s):%.4f", car, cdr, probability); 23 | } 24 | 25 | public boolean equals(Object o) { 26 | if (this == o) return true; 27 | if (!(o instanceof TableEntry)) return false; 28 | TableEntry te = (TableEntry) o; 29 | return car == te.car && cdr == te.cdr && probability == te.probability; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/syntax/ParseLattice.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.syntax; 2 | 3 | import java.util.Collection; 4 | 5 | public interface ParseLattice { 6 | 7 | public Collection getConstituentLabels(int from, int to); 8 | 9 | public Collection getConcatenatedLabels(int from, int to); 10 | 11 | public Collection getCcgLabels(int from, int to); 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/tools/JudgeParaphrases.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.tools; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.IOException; 5 | import java.util.Scanner; 6 | import java.util.logging.Logger; 7 | 8 | import edu.jhu.jerboa.util.FileManager; 9 | import edu.jhu.thrax.util.io.LineReader; 10 | 11 | public class JudgeParaphrases { 12 | 13 | private static final Logger logger = Logger.getLogger(JudgeParaphrases.class.getName()); 14 | 15 | public static void main(String[] args) { 16 | 17 | String input = null; 18 | String output = null; 19 | 20 | for (int i = 0; i < args.length; i++) { 21 | if ("-i".equals(args[i]) && (i < args.length - 1)) { 22 | input = args[++i]; 23 | } else if ("-o".equals(args[i]) && (i < args.length - 1)) { 24 | output = args[++i]; 25 | } 26 | } 27 | 28 | if (input == null) { 29 | logger.severe("No input file specified."); 30 | return; 31 | } 32 | if (output == null) { 33 | logger.severe("No output file specified."); 34 | return; 35 | } 36 | 37 | LineReader reader = null; 38 | BufferedWriter writer = null; 39 | Scanner user = null; 40 | try { 41 | reader = new LineReader(input); 42 | writer = FileManager.getWriter(output); 43 | user = new Scanner(System.in); 44 | while (reader.hasNext()) { 45 | String pp = reader.next().trim(); 46 | System.out.print(pp + "\t"); 47 | String score = user.next().trim(); 48 | if (score.toLowerCase().equals("quit") || score.toLowerCase().equals("exit")) 49 | break; 50 | writer.write(score + "\t" + pp + "\n"); 51 | } 52 | reader.close(); 53 | writer.close(); 54 | } catch (IOException e) { 55 | logger.severe(e.getMessage()); 56 | } 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/tools/SequenceToGrammar.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.tools; 2 | 3 | import java.io.BufferedWriter; 4 | import java.util.logging.Logger; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.SequenceFile; 10 | import org.apache.hadoop.io.Text; 11 | 12 | import edu.jhu.jerboa.util.FileManager; 13 | 14 | public class SequenceToGrammar { 15 | 16 | private static final Logger logger = Logger.getLogger(SequenceToGrammar.class.getName()); 17 | 18 | 
private static void usage() { 19 | System.err.println("Usage: java edu.jhu.thrax.tools.SequenceToGrammar"); 20 | System.err.println("\t -i sequence_file \t Sequence file from Thrax grammar extraction."); 21 | System.err.println("\t -o output_file \t Output grammar file name."); 22 | System.err.println(); 23 | } 24 | 25 | public static void main(String[] args) throws Exception { 26 | String input_file = null; 27 | String output_file = null; 28 | 29 | if (args.length < 4 || args[0].toLowerCase().equals("-h")) { 30 | usage(); 31 | System.exit(0); 32 | } 33 | for (int i = 0; i < args.length; i++) { 34 | if ("-i".equals(args[i]) && (i < args.length - 1)) { 35 | input_file = args[++i]; 36 | } else if ("-o".equals(args[i]) && (i < args.length - 1)) { 37 | output_file = args[++i]; 38 | } 39 | } 40 | if (input_file == null) { 41 | logger.severe("No input file specified."); 42 | usage(); 43 | System.exit(0); 44 | } 45 | if (output_file == null) { 46 | logger.severe("No output file specified."); 47 | usage(); 48 | System.exit(0); 49 | } 50 | 51 | Text rule_string = new Text(); 52 | Configuration config = new Configuration(); 53 | Path path = new Path(input_file); 54 | SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(config), path, config); 55 | 56 | BufferedWriter grammar_writer = FileManager.getWriter(output_file); 57 | long rule_count = 0; 58 | while (reader.next(rule_string)) { 59 | grammar_writer.write(rule_string.toString()); 60 | grammar_writer.newLine(); 61 | rule_count++; 62 | } 63 | reader.close(); 64 | grammar_writer.close(); 65 | System.err.println("Merged " + rule_count + " rules."); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/BackwardsCompatibility.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import edu.jhu.thrax.hadoop.features.annotation.SourceGivenTargetLexicalProbabilityFeature; 4 | import edu.jhu.thrax.hadoop.features.annotation.TargetGivenSourceLexicalProbabilityFeature; 5 | import edu.jhu.thrax.hadoop.features.annotation.UnalignedSourceCounterFeature; 6 | import edu.jhu.thrax.hadoop.features.annotation.UnalignedTargetCounterFeature; 7 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenTargetFeature; 8 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenSourceFeature; 9 | 10 | public class BackwardsCompatibility { 11 | 12 | public static String equivalent(String features) { 13 | features = features.replace("e2fphrase", SourcePhraseGivenTargetFeature.NAME); 14 | features = features.replace("f2ephrase", TargetPhraseGivenSourceFeature.NAME); 15 | 16 | features = features.replace("lexprob_tgs", TargetGivenSourceLexicalProbabilityFeature.NAME); 17 | features = features.replace("lexprob_sgt", SourceGivenTargetLexicalProbabilityFeature.NAME); 18 | 19 | features = 20 | features.replace("lexprob", TargetGivenSourceLexicalProbabilityFeature.NAME + " " 21 | + SourceGivenTargetLexicalProbabilityFeature.NAME); 22 | 23 | features = 24 | features.replace("unaligned-count", UnalignedSourceCounterFeature.NAME + " " 25 | + UnalignedTargetCounterFeature.NAME); 26 | 27 | return features; 28 | } 29 | 30 | public static String defaultLabelPolicy(boolean allow_nonlexical_x) { 31 | if (allow_nonlexical_x) { 32 | return "always"; 33 | } else { 34 | return "phrases"; 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- 
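OutputTool above runs the configured feature string through BackwardsCompatibility.equivalent before deciding which feature directories to collect, so conf files written for older Thrax versions keep working. The following sketch is not part of the repository; it only illustrates that rewriting on a sample legacy string (the replacement strings themselves come from the corresponding *.NAME constants and are not spelled out here).

import edu.jhu.thrax.util.BackwardsCompatibility;

public class BackwardsCompatibilityDemo {
  public static void main(String[] args) {
    // Legacy names from older conf files; "lexprob" and "unaligned-count"
    // each expand to a pair of direction-specific feature names.
    String legacy = "e2fphrase f2ephrase lexprob unaligned-count";
    // Prints the equivalent space-separated list of current feature names.
    System.out.println(BackwardsCompatibility.equivalent(legacy));
  }
}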
/src/edu/jhu/thrax/util/ConfFileParser.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import java.net.URI; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | import java.util.Scanner; 7 | 8 | import edu.jhu.thrax.util.amazon.AmazonConfigFileLoader; 9 | 10 | /** 11 | * This class parses conf files of a standard format. The '#' character is used 12 | * to indicate comments, and non-comment lines have a key and a value separated 13 | * by whitespace. 14 | */ 15 | public class ConfFileParser { 16 | 17 | public static Map parse(String confName) 18 | { 19 | Map opts = new HashMap(); 20 | Scanner scanner; 21 | 22 | try { 23 | URI configURI = new URI(confName); 24 | String scheme = configURI.getScheme(); 25 | if (scheme != null && (scheme.equalsIgnoreCase("s3n") || scheme.equalsIgnoreCase("s3"))) { 26 | scanner = new Scanner(AmazonConfigFileLoader.getConfigStream(configURI)); 27 | } 28 | else { 29 | scanner = new Scanner(DefaultConfigFileLoader.getConfigStream(configURI)); 30 | } 31 | } catch (Exception e) { 32 | throw new IllegalArgumentException(e.toString()); 33 | } 34 | 35 | while (scanner.hasNextLine()) { 36 | String line = scanner.nextLine(); 37 | // strip comments 38 | if (line.indexOf("#") != -1) { 39 | line = line.substring(0, line.indexOf("#")).trim(); 40 | } 41 | if ("".equals(line)) 42 | continue; 43 | 44 | String [] keyVal = line.split("\\s+", 2); 45 | if (keyVal.length > 1) 46 | opts.put(keyVal[0].trim(), keyVal[1].trim()); 47 | } 48 | scanner.close(); 49 | return opts; 50 | } 51 | } 52 | 53 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/CreateGlueGrammar.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.HashSet; 6 | 7 | import edu.jhu.thrax.util.io.LineReader; 8 | 9 | public class CreateGlueGrammar { 10 | private static HashSet nts; 11 | 12 | // [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 13 | // [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 14 | // [GOAL] ||| ||| ||| 0 15 | 16 | private static final String R_START = "[%1$s] ||| ||| ||| 0"; 17 | private static final String R_TWO = "[%1$s] ||| [%1$s,1] [%2$s,2] ||| [%1$s,1] [%2$s,2] ||| -1"; 18 | private static final String R_END = "[%1$s] ||| [%1$s,1] ||| [%1$s,1] ||| 0"; 19 | 20 | // [GOAL] ||| [X,1] ||| [X,1] ||| 0 21 | private static final String R_TOP = "[%1$s] ||| [%2$s,1] ||| [%2$s,1] ||| 0"; 22 | 23 | private static String GOAL = "GOAL"; 24 | 25 | public static void main(String[] argv) throws IOException { 26 | String grammar_file_name = null; 27 | if (argv.length > 0) grammar_file_name = argv[0]; 28 | if (argv.length > 1) GOAL = argv[1]; 29 | 30 | if (grammar_file_name == null) { 31 | System.err.println("No grammar specified."); 32 | System.exit(1); 33 | } 34 | File grammar_file = new File(grammar_file_name); 35 | if (!grammar_file.exists()) { 36 | System.err.println("Grammar file doesn't exist: " + grammar_file_name); 37 | System.exit(1); 38 | } 39 | 40 | nts = new HashSet(); 41 | if (grammar_file.isDirectory()) { 42 | Vocabulary.read(grammar_file_name + File.separator + "vocabulary"); 43 | for (int i = 0; i < Vocabulary.size(); ++i) { 44 | String token = Vocabulary.word(i); 45 | if (Vocabulary.nt(token)) nts.add(token.substring(1, token.length() - 1)); 46 | } 47 | } else { 48 | LineReader reader = new 
LineReader(grammar_file_name); 49 | while (reader.hasNext()) { 50 | String line = reader.next(); 51 | int lhsStart = line.indexOf("[") + 1; 52 | int lhsEnd = line.indexOf("]"); 53 | if (lhsStart < 1 || lhsEnd < 0) { 54 | System.err.printf("malformed rule: %s\n", line); 55 | continue; 56 | } 57 | String lhs = line.substring(lhsStart, lhsEnd); 58 | nts.add(lhs); 59 | } 60 | } 61 | 62 | System.out.println(String.format(R_START, GOAL)); 63 | for (String nt : nts) 64 | System.out.println(String.format(R_TWO, GOAL, nt)); 65 | System.out.println(String.format(R_END, GOAL)); 66 | for (String nt : nts) 67 | System.out.println(String.format(R_TOP, GOAL, nt)); 68 | 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/DefaultConfigFileLoader.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.net.URI; 8 | 9 | public class DefaultConfigFileLoader 10 | { 11 | public static InputStream getConfigStream(URI configURI) throws IOException 12 | { 13 | return new FileInputStream(new File(configURI.getPath())); 14 | } 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/ExternalizableToUtf8.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import java.io.IOException; 4 | 5 | public interface ExternalizableToUtf8 { 6 | 7 | public void readExternalUtf8(String fileName) throws IOException; 8 | 9 | public void writeExternalUtf8(String fileName) throws IOException; 10 | 11 | } 12 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/GrammarComparison.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileOutputStream; 7 | import java.io.IOException; 8 | import java.io.PrintStream; 9 | import java.util.HashSet; 10 | import java.util.Scanner; 11 | import java.util.Set; 12 | import java.util.zip.GZIPInputStream; 13 | 14 | 15 | public class GrammarComparison { 16 | 17 | private static final String SEPARATOR = "|||"; 18 | private static final String USAGE = "usage: GrammarComparison "; 19 | 20 | public static void main(String [] argv) 21 | { 22 | if (argv.length < 3) { 23 | System.err.println(USAGE); 24 | return; 25 | } 26 | 27 | String file1 = argv[0]; 28 | String file2 = argv[1]; 29 | String outputBase = argv[2]; 30 | 31 | try { 32 | HashSet grammar1 = getRulesFromFile(file1); 33 | HashSet alsoGrammar1 = getRulesFromFile(file1); 34 | HashSet grammar2 = getRulesFromFile(file2); 35 | 36 | Set smaller = grammar1.size() < grammar2.size() 37 | ? grammar1 38 | : grammar2; 39 | Set larger = smaller == grammar1 ? 
grammar2 : grammar1; 40 | 41 | Set intersection = new HashSet(); 42 | for (String s : smaller) { 43 | if (larger.contains(s)) 44 | intersection.add(s); 45 | } 46 | alsoGrammar1.removeAll(grammar2); 47 | grammar2.removeAll(grammar1); 48 | 49 | printRules(alsoGrammar1, outputBase + ".1"); 50 | printRules(grammar2, outputBase + ".2"); 51 | printRules(intersection, outputBase + ".both"); 52 | } 53 | catch (Exception e) { 54 | e.printStackTrace(); 55 | } 56 | return; 57 | } 58 | 59 | private static void printRules(Set rules, String filename) throws FileNotFoundException, SecurityException { 60 | PrintStream ps = new PrintStream(new FileOutputStream(filename)); 61 | for (String s : rules) 62 | ps.println(s); 63 | ps.close(); 64 | return; 65 | } 66 | 67 | private static HashSet getRulesFromFile(String filename) throws IOException 68 | { 69 | Scanner scanner; 70 | if (filename.endsWith(".gz")) { 71 | scanner = new Scanner(new GZIPInputStream(new FileInputStream(new File(filename))), "UTF-8"); 72 | } 73 | else { 74 | scanner = new Scanner(new File(filename), "UTF-8"); 75 | } 76 | 77 | HashSet ret = new HashSet(); 78 | while (scanner.hasNextLine()) { 79 | String line = scanner.nextLine(); 80 | String rule = line.substring(0, line.lastIndexOf(SEPARATOR)); 81 | ret.add(rule); 82 | } 83 | scanner.close(); 84 | return ret; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/HdfsUtils.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | package edu.jhu.thrax.util; 3 | 4 | import java.io.IOException; 5 | import java.io.ObjectInputStream; 6 | import java.io.ObjectOutputStream; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FSDataInputStream; 10 | import org.apache.hadoop.fs.FSDataOutputStream; 11 | import org.apache.hadoop.fs.FileSystem; 12 | import org.apache.hadoop.fs.Path; 13 | 14 | public class HdfsUtils { 15 | 16 | private HdfsUtils() {}; 17 | 18 | public static void writeObjectToFs(Configuration conf, E object, Path outPath) throws IOException { 19 | FileSystem hdfs = FileSystem.get(conf); 20 | 21 | ObjectOutputStream oos = null; 22 | try { 23 | FSDataOutputStream out = hdfs.create(outPath); 24 | oos = new ObjectOutputStream(out); 25 | oos.writeObject(object); 26 | } finally { 27 | if (oos != null) { 28 | oos.close(); 29 | } 30 | } 31 | } 32 | 33 | public static E readObjectFromFs(Configuration conf, Path inPath) throws IOException,ClassNotFoundException { 34 | FileSystem hdfs = FileSystem.get(conf); 35 | 36 | ObjectInputStream ois = null; 37 | try { 38 | FSDataInputStream in = hdfs.open(inPath); 39 | ois = new ObjectInputStream(in); 40 | @SuppressWarnings("unchecked") 41 | E object = (E) ois.readObject(); 42 | return object; 43 | } finally { 44 | if (ois != null) { 45 | ois.close(); 46 | } 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/Intersect.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileOutputStream; 6 | import java.io.IOException; 7 | import java.io.PrintStream; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.Scanner; 11 | import java.util.zip.GZIPInputStream; 12 | 13 | public class Intersect 14 
| { 15 | private static HashMap> rules; 16 | private static boolean ignoreNTs; 17 | public static void main(String [] argv) throws Exception 18 | { 19 | String file1; 20 | String file2; 21 | String outputPrefix; 22 | if (argv[0].equals("-X")) { 23 | file1 = argv[1]; 24 | file2 = argv[2]; 25 | outputPrefix = argv[3]; 26 | ignoreNTs = true; 27 | } 28 | else { 29 | file1 = argv[0]; 30 | file2 = argv[1]; 31 | outputPrefix = argv[2]; 32 | ignoreNTs = false; 33 | } 34 | getRulesFromFile(file1); 35 | 36 | Scanner scanner; 37 | if (file2.endsWith(".gz")) 38 | scanner = new Scanner(new GZIPInputStream(new FileInputStream(new File(file2))), "UTF-8"); 39 | else 40 | scanner = new Scanner(new File(file2), "UTF-8"); 41 | PrintStream firstGrammar = new PrintStream(new FileOutputStream(outputPrefix + ".1")); 42 | PrintStream secondGrammar = new PrintStream(new FileOutputStream(outputPrefix + ".2")); 43 | while (scanner.hasNextLine()) { 44 | String s = scanner.nextLine(); 45 | String r = repr(s); 46 | if (rules.containsKey(r)) { 47 | secondGrammar.println(s); 48 | for (String x : rules.get(r)) 49 | firstGrammar.println(x); 50 | rules.get(r).clear(); 51 | } 52 | } 53 | scanner.close(); 54 | firstGrammar.close(); 55 | secondGrammar.close(); 56 | return; 57 | } 58 | 59 | private static String repr(String s) 60 | { 61 | String r = s.substring(0, s.lastIndexOf("|||")); 62 | if (ignoreNTs) 63 | r = r.replaceAll("\\[[^]]+?\\]", "[X]"); 64 | return r; 65 | } 66 | 67 | private static void getRulesFromFile(String filename) throws IOException 68 | { 69 | rules = new HashMap>(); 70 | Scanner scanner; 71 | if (filename.endsWith(".gz")) { 72 | scanner = new Scanner(new GZIPInputStream(new FileInputStream(new File(filename))), "UTF-8"); 73 | } 74 | else { 75 | scanner = new Scanner(new File(filename), "UTF-8"); 76 | } 77 | while (scanner.hasNextLine()) { 78 | String s = scanner.nextLine(); 79 | String r = repr(s); 80 | if (rules.containsKey(r)) 81 | rules.get(r).add(s); 82 | else { 83 | ArrayList al = new ArrayList(); 84 | al.add(s); 85 | rules.put(r, al); 86 | } 87 | } 88 | scanner.close(); 89 | return; 90 | } 91 | } 92 | 93 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/MalformedInput.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | public enum MalformedInput 4 | { 5 | NOT_ENOUGH_FIELDS, 6 | EMPTY_SENTENCE, 7 | MALFORMED_PARSE, 8 | EMPTY_ALIGNMENT, 9 | INCONSISTENT_ALIGNMENT, 10 | UNKNOWN 11 | } 12 | 13 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/MalformedInput.properties: -------------------------------------------------------------------------------- 1 | CounterGroupName=Malformed Inputs 2 | NOT_ENOUGH_FIELDS.name=Not enough fields 3 | EMPTY_SENTENCE.name=Empty sentences 4 | MALFORMED_PARSE.name=Malformed parses 5 | EMPTY_ALIGNMENT.name=Empty alignments 6 | INCONSISTENT_ALIGNMENT.name=Inconsistent alignments 7 | UNKNOWN.name=Unknown errors 8 | 9 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/NegLogMath.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | public class NegLogMath { 4 | 5 | // Number of entries in the table. 6 | private static final int LOG_ADD_TABLE_SIZE = 640000; 7 | // Smallest value for nlog_a - nlog_b. 
8 | private static final float LOG_ADD_MIN = -64.0f; 9 | private static final float AS_GOOD_AS_ZERO = 1e-10f; 10 | private static final float logAddInc = -LOG_ADD_MIN / LOG_ADD_TABLE_SIZE; 11 | private static final float invLogAddInc = LOG_ADD_TABLE_SIZE / -LOG_ADD_MIN; 12 | private static final float[] logAddTable = new float[LOG_ADD_TABLE_SIZE + 1]; 13 | 14 | static { 15 | for (int i = 0; i <= LOG_ADD_TABLE_SIZE; i++) { 16 | logAddTable[i] = (float) -Math.log1p(Math.exp((i * logAddInc) + LOG_ADD_MIN)); 17 | } 18 | } 19 | 20 | public static float logAdd(float nlog_a, float nlog_b) { 21 | if (nlog_b < nlog_a) { 22 | float temp = nlog_a; 23 | nlog_a = nlog_b; 24 | nlog_b = temp; 25 | } 26 | float neg_diff = (nlog_a - nlog_b) - LOG_ADD_MIN; 27 | if (neg_diff < AS_GOOD_AS_ZERO) { 28 | return nlog_a; 29 | } 30 | return nlog_a + logAddTable[(int) (neg_diff * invLogAddInc)]; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/SequenceFileCreator.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util; 2 | 3 | import java.net.URI; 4 | import java.util.Scanner; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.SequenceFile; 11 | import org.apache.hadoop.io.Text; 12 | 13 | public class SequenceFileCreator 14 | { 15 | public static void main(String [] argv) throws Exception 16 | { 17 | LongWritable k = new LongWritable(); 18 | Text v = new Text(); 19 | 20 | URI uri = URI.create(argv[0]); 21 | Configuration conf = new Configuration(); 22 | FileSystem fs = FileSystem.get(uri, conf); 23 | Path path = new Path(argv[0]); 24 | SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path, LongWritable.class, Text.class); 25 | 26 | long current = 0; 27 | Scanner scanner = new Scanner(System.in, "UTF-8"); 28 | while (scanner.hasNextLine()) { 29 | String line = scanner.nextLine(); 30 | k.set(current); 31 | v.set(line); 32 | writer.append(k, v); 33 | current++; 34 | } 35 | scanner.close(); 36 | writer.close(); 37 | return; 38 | } 39 | } 40 | 41 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/amazon/AmazonConfigFileLoader.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.amazon; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.net.URI; 6 | 7 | import com.amazonaws.auth.PropertiesCredentials; 8 | import com.amazonaws.services.s3.AmazonS3; 9 | import com.amazonaws.services.s3.AmazonS3Client; 10 | import com.amazonaws.services.s3.model.GetObjectRequest; 11 | 12 | public class AmazonConfigFileLoader { 13 | protected static final String CRED_PROPS = "AwsCredentials.properties"; 14 | 15 | public static InputStream getConfigStream(URI configURI) throws IOException { 16 | InputStream resStream = AmazonConfigFileLoader.class.getResourceAsStream(CRED_PROPS); 17 | 18 | if (resStream == null) { 19 | resStream = AmazonConfigFileLoader.class.getResourceAsStream("/" + CRED_PROPS); 20 | } 21 | 22 | if (resStream == null) { 23 | throw new IllegalArgumentException("Could not locate " + CRED_PROPS); 24 | } 25 | 26 | AmazonS3 s3 = new AmazonS3Client(new PropertiesCredentials(resStream)); 27 | return s3.getObject( 28 | new GetObjectRequest(configURI.getHost(), 
configURI.getPath().replaceFirst("/+", ""))) 29 | .getObjectContent(); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/exceptions/ConfigurationException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.exceptions; 2 | 3 | public class ConfigurationException extends Exception 4 | { 5 | private static final long serialVersionUID = 3040L; 6 | 7 | public ConfigurationException(String message) 8 | { 9 | super(message); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/exceptions/EmptyAlignmentException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.exceptions; 2 | 3 | public class EmptyAlignmentException extends MalformedInputException 4 | { 5 | private static final long serialVersionUID = 4556L; 6 | } 7 | 8 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/exceptions/EmptySentenceException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.exceptions; 2 | 3 | public class EmptySentenceException extends MalformedInputException 4 | { 5 | private static final long serialVersionUID = 8132L; 6 | } 7 | 8 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/exceptions/InconsistentAlignmentException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.exceptions; 2 | 3 | public class InconsistentAlignmentException extends MalformedInputException 4 | { 5 | private static final long serialVersionUID = 33L; 6 | 7 | public InconsistentAlignmentException(String alignment) 8 | { 9 | super(alignment); 10 | } 11 | } 12 | 13 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/exceptions/MalformedInputException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.exceptions; 2 | 3 | public class MalformedInputException extends Exception 4 | { 5 | private static final long serialVersionUID = 5544L; 6 | 7 | public MalformedInputException() 8 | { 9 | super(); 10 | } 11 | 12 | public MalformedInputException(String input) 13 | { 14 | super(input); 15 | } 16 | } 17 | 18 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/exceptions/MalformedParseException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.exceptions; 2 | 3 | public class MalformedParseException extends MalformedInputException 4 | { 5 | private static final long serialVersionUID = 1095L; 6 | 7 | public MalformedParseException(String parse) 8 | { 9 | super(parse); 10 | } 11 | } 12 | 13 | -------------------------------------------------------------------------------- /src/edu/jhu/thrax/util/exceptions/NotEnoughFieldsException.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.util.exceptions; 2 | 3 | public class NotEnoughFieldsException extends MalformedInputException 4 | { 5 | private static final long serialVersionUID = 9988L; 6 | } 7 | 8 | -------------------------------------------------------------------------------- 
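The tool classes above (ExtractionTool, FeatureTool, OutputTool, and the two word lexical probability tools) all follow the same pattern: parse a conf file with ConfFileParser, copy every option into the Hadoop Configuration under the "thrax." prefix, then build and submit a Job. The sketch below is not part of the repository; it writes a small conf file in the format ConfFileParser accepts ('#' starts a comment, key and value are separated by whitespace) and prints the resulting keys. The class name, paths, and values are purely illustrative.

import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;

import edu.jhu.thrax.util.ConfFileParser;

public class ConfFileParserDemo {
  public static void main(String[] args) throws Exception {
    // Write a throwaway conf file: '#' comments, whitespace-separated key/value.
    Path conf = Files.createTempFile("thrax-demo", ".conf");
    try (PrintWriter out = new PrintWriter(conf.toFile(), "UTF-8")) {
      out.println("# placeholder paths; inline comments are stripped by the parser");
      out.println("input-file  /path/to/corpus.unified");
      out.println("work-dir    /path/to/thrax-work");
    }
    // The tools copy each entry into the Hadoop Configuration as "thrax." + key.
    Map<String, String> options = ConfFileParser.parse(conf.toString());
    for (Map.Entry<String, String> e : options.entrySet())
      System.out.println("thrax." + e.getKey() + " = " + e.getValue());
  }
}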
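NegLogMath above adds two probabilities stored as negative log values, using a precomputed table of -log1p(exp(x)) over x in [-64, 0] instead of calling log and exp on every addition. The identity it tabulates is -log(a + b) = nlog_a - log1p(exp(nlog_a - nlog_b)) for nlog_a <= nlog_b. A small check, not part of the repository, comparing the table lookup against the exact value:

import edu.jhu.thrax.util.NegLogMath;

public class NegLogMathDemo {
  public static void main(String[] args) {
    double a = 0.3, b = 0.05;                       // probabilities to add
    float nlogA = (float) -Math.log(a);
    float nlogB = (float) -Math.log(b);
    float table = NegLogMath.logAdd(nlogA, nlogB);  // table-driven -log(a + b)
    float exact = (float) -Math.log(a + b);         // direct computation
    System.out.printf("table=%.6f exact=%.6f%n", table, exact);
  }
}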
/src/edu/jhu/thrax/util/io/Reader.java: -------------------------------------------------------------------------------- 1 | /* This file is part of the Joshua Machine Translation System. 2 | * 3 | * Joshua is free software; you can redistribute it and/or modify 4 | * it under the terms of the GNU Lesser General Public License as 5 | * published by the Free Software Foundation; either version 2.1 6 | * of the License, or (at your option) any later version. 7 | * 8 | * This library is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 | * Lesser General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU Lesser General Public 14 | * License along with this library; if not, write to the Free 15 | * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, 16 | * MA 02111-1307 USA 17 | */ 18 | package edu.jhu.thrax.util.io; 19 | 20 | import java.io.IOException; 21 | import java.util.Iterator; 22 | 23 | /** 24 | * Common interface for Reader type objects. 25 | * 26 | * @author wren ng thornton 27 | * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $ 28 | */ 29 | public interface Reader extends Iterable, Iterator { 30 | 31 | /** Close the reader, freeing all resources. */ 32 | void close() throws IOException; 33 | 34 | /** Determine if the reader is ready to read a line. */ 35 | boolean ready() throws IOException; 36 | 37 | /** Read a "line" and return an object representing it. */ 38 | E readLine() throws IOException; 39 | } 40 | -------------------------------------------------------------------------------- /test/edu/jhu/thrax/extraction/SAMTLabelerTest.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.extraction; 2 | 3 | import org.testng.Assert; 4 | import org.testng.annotations.Test; 5 | 6 | import edu.jhu.thrax.util.Vocabulary; 7 | 8 | public class SAMTLabelerTest { 9 | 10 | private final int defaultLabel = Vocabulary.id("X"); 11 | 12 | @Test 13 | public void getLabel_MalformedTree_isDefault() { 14 | SAMTLabeler labeler = 15 | new SAMTLabeler("(A b))", true, true, true, true, "top", defaultLabel); 16 | Assert.assertEquals(labeler.getLabel(0, 1), defaultLabel); 17 | } 18 | 19 | @Test 20 | public void getLabel_SpanOutOfBounds_isDefault() { 21 | SAMTLabeler labeler = new SAMTLabeler("(A b)", true, true, true, true, "top", defaultLabel); 22 | Assert.assertEquals(labeler.getLabel(0, 3), defaultLabel); 23 | Assert.assertEquals(labeler.getLabel(-2, 1), defaultLabel); 24 | } 25 | 26 | @Test 27 | public void getLabel_UnaryChain_Top() { 28 | SAMTLabeler labeler = new SAMTLabeler("(A (B c))", true, true, true, true, "top", defaultLabel); 29 | Assert.assertEquals(labeler.getLabel(0, 1), "A"); 30 | } 31 | 32 | @Test 33 | public void getLabel_UnaryChain_Bottom() { 34 | SAMTLabeler labeler = new SAMTLabeler("(A (B c))", true, true, true, true, "bottom", defaultLabel); 35 | Assert.assertEquals(labeler.getLabel(0, 1), "B"); 36 | } 37 | 38 | @Test 39 | public void getLabel_UnaryChain_All() { 40 | SAMTLabeler labeler = new SAMTLabeler("(A (B c))", true, true, true, true, "all", defaultLabel); 41 | Assert.assertEquals(labeler.getLabel(0, 1), "A:B"); 42 | } 43 | 44 | @Test 45 | public void getLabel_NoConst_returnCat() { 46 | SAMTLabeler labeler = new SAMTLabeler("(A (B c) (D e))", false, true, true, true, "all", defaultLabel); 47 | 
Assert.assertEquals(labeler.getLabel(0, 2), "B+D"); 48 | } 49 | 50 | @Test 51 | public void getLabel_NoConstCat_noCCG_returnDefault() { 52 | SAMTLabeler labeler = new SAMTLabeler("(A (B c) (D e))", false, true, false, true, "all", defaultLabel); 53 | Assert.assertEquals(labeler.getLabel(0, 2), defaultLabel); 54 | } 55 | 56 | @Test 57 | public void getLabel_NoConstCat_returnCCG() { 58 | SAMTLabeler labeler = new SAMTLabeler("(A (B c) (D e))", false, true, false, true, "all", defaultLabel); 59 | Assert.assertEquals(labeler.getLabel(0, 1), "A/D"); 60 | Assert.assertEquals(labeler.getLabel(1, 2), "A\\B"); 61 | } 62 | 63 | @Test 64 | public void getLabel_NoConstCatCCG_returnDoubleCat() { 65 | SAMTLabeler labeler = 66 | new SAMTLabeler("(A (B c) (D e) (F g))", false, false, false, true, "all", defaultLabel); 67 | Assert.assertEquals(labeler.getLabel(0, 3), "B+D+F"); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /test/edu/jhu/thrax/syntax/ParseTreeTest.java: -------------------------------------------------------------------------------- 1 | package edu.jhu.thrax.syntax; 2 | 3 | import java.util.Iterator; 4 | import java.util.List; 5 | 6 | import org.testng.Assert; 7 | import org.testng.annotations.Test; 8 | 9 | public class ParseTreeTest 10 | { 11 | @Test 12 | public void numLeaves_Leaf_isOne() 13 | { 14 | ParseTree pt = ParseTree.fromPennFormat("a"); 15 | Assert.assertEquals(pt.numLeaves(), 1); 16 | } 17 | 18 | @Test 19 | public void numNodes_Leaf_isOne() 20 | { 21 | ParseTree pt = ParseTree.fromPennFormat("a"); 22 | Assert.assertEquals(pt.numNodes(), 1); 23 | } 24 | 25 | @Test 26 | public void numLeaves_Tree() 27 | { 28 | ParseTree pt = ParseTree.fromPennFormat("(A (B c d))"); 29 | Assert.assertEquals(pt.numLeaves(), 2); 30 | } 31 | 32 | @Test 33 | public void numNodes_Tree() 34 | { 35 | ParseTree pt = ParseTree.fromPennFormat("(A (B c d))"); 36 | Assert.assertEquals(pt.numNodes(), 4); 37 | } 38 | 39 | @Test 40 | void internalNodesWithSpan_Single() 41 | { 42 | ParseTree pt = ParseTree.fromPennFormat("(A (B c d) e)"); 43 | List list = pt.internalNodesWithSpan(0, 2); 44 | Assert.assertEquals(list.size(), 1); 45 | ParseTree.Node node = list.get(0); 46 | Assert.assertEquals(node.label(), "B"); 47 | Assert.assertEquals(node.spanStart(), 0); 48 | Assert.assertEquals(node.spanEnd(), 2); 49 | Assert.assertFalse(node.numChildren() == 0); 50 | } 51 | 52 | @Test 53 | public void internalNodesWithSpan_unaryChain() 54 | { 55 | ParseTree pt = ParseTree.fromPennFormat("(A (B c))"); 56 | List list = pt.internalNodesWithSpan(0, 1); 57 | Assert.assertEquals(list.size(), 2); 58 | ParseTree.Node node = list.get(0); 59 | Assert.assertEquals(node.label(), "A"); 60 | Assert.assertEquals(node.spanStart(), 0); 61 | Assert.assertEquals(node.spanEnd(), 1); 62 | Assert.assertFalse(node.numChildren() == 0); 63 | node = list.get(1); 64 | Assert.assertEquals(node.label(), "B"); 65 | Assert.assertEquals(node.spanStart(), 0); 66 | Assert.assertEquals(node.spanEnd(), 1); 67 | Assert.assertFalse(node.numChildren() == 0); 68 | } 69 | 70 | @Test 71 | public void leaf_ChildIterator_isEmpty() 72 | { 73 | ParseTree pt = ParseTree.fromPennFormat("a"); 74 | ParseTree.Node node = pt.root(); 75 | Assert.assertFalse(node.children().hasNext()); 76 | } 77 | 78 | @Test 79 | public void tree_ChildIterator() 80 | { 81 | ParseTree pt = ParseTree.fromPennFormat("(A b c)"); 82 | ParseTree.Node node = pt.root(); 83 | Iterator iter = node.children(); 84 | Assert.assertTrue(iter.hasNext()); 85 | 
node = iter.next(); 86 | Assert.assertEquals(node.label(), "b"); 87 | Assert.assertEquals(node.spanStart(), 0); 88 | Assert.assertEquals(node.spanEnd(), 1); 89 | Assert.assertFalse(node.children().hasNext()); 90 | Assert.assertTrue(iter.hasNext()); 91 | node = iter.next(); 92 | Assert.assertEquals(node.label(), "c"); 93 | Assert.assertEquals(node.spanStart(), 1); 94 | Assert.assertEquals(node.spanEnd(), 2); 95 | Assert.assertFalse(node.children().hasNext()); 96 | Assert.assertFalse(iter.hasNext()); 97 | } 98 | } 99 | 100 | --------------------------------------------------------------------------------
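The ParseTreeTest cases above exercise edu.jhu.thrax.syntax.ParseTree: a parse read from Penn-style bracketing can be queried for leaf and node counts and for the internal nodes covering a word span. The standalone sketch below is not part of the test suite; it mirrors those assertions, and the List element type is assumed to be ParseTree.Node, as in the tests.

import java.util.List;

import edu.jhu.thrax.syntax.ParseTree;

public class ParseTreeDemo {
  public static void main(String[] args) {
    // "(A (B c d))" has two leaves (c, d) and four nodes in total (A, B, c, d).
    ParseTree pt = ParseTree.fromPennFormat("(A (B c d))");
    System.out.println(pt.numLeaves() + " leaves, " + pt.numNodes() + " nodes");

    // Internal nodes spanning words [0, 2) of "(A (B c d) e)"; just B here.
    ParseTree tree = ParseTree.fromPennFormat("(A (B c d) e)");
    List<ParseTree.Node> nodes = tree.internalNodesWithSpan(0, 2);
    for (ParseTree.Node n : nodes)
      System.out.println(n.label() + " covers [" + n.spanStart() + ", " + n.spanEnd() + ")");
  }
}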