├── .classpath
├── .gitignore
├── .project
├── AwsCredentials.properties
├── LICENSE.txt
├── README
├── build.xml
├── example
│ ├── counts
│ │ ├── corpus.a
│ │ ├── corpus.en
│ │ ├── corpus.es
│ │ └── thrax-phrase.conf
│ ├── europarl.unified.1
│ ├── hiero.conf
│ ├── nist09.unified.1
│ └── samt.conf
├── lib
│ ├── aws-java-sdk-1.1.3.jar
│ ├── commons-lang3-3.1.jar
│ ├── hadoop-common-2.5.2.jar
│ ├── hadoop-mapreduce-client-core-2.5.2.jar
│ ├── jerboa.jar
│ └── testng-5.8-jdk15.jar
├── scripts
│ ├── berant_to_reference.py
│ ├── create_glue_grammar.sh
│ ├── filter_rules.sh
│ └── run_on_amazon.sh
├── src
└── edu
│ └── jhu
│ └── thrax
│ ├── Thrax.java
│ ├── datatypes
│ ├── AlignedSentencePair.java
│ ├── Alignment.java
│ ├── ArrayAlignment.java
│ ├── HierarchicalRule.java
│ ├── IntPair.java
│ └── PhrasePair.java
│ ├── distributional
│ ├── ContextPhrase.java
│ ├── ContextPhraseExtractor.java
│ ├── FeatureClass.java
│ ├── FeatureEncoder.java
│ ├── FeatureSet.java
│ └── FeatureTypes.java
│ ├── extraction
│ ├── HierarchicalRuleExtractor.java
│ ├── HieroLabeler.java
│ ├── LabelCache.java
│ ├── Labeling.java
│ ├── ManualSpanLabeler.java
│ ├── SAMTLabeler.java
│ └── SpanLabeler.java
│ ├── hadoop
│ ├── comparators
│ │ ├── FieldComparator.java
│ │ ├── PrimitiveArrayMarginalComparator.java
│ │ └── TextMarginalComparator.java
│ ├── datatypes
│ │ ├── AlignedRuleWritable.java
│ │ ├── AlignmentWritable.java
│ │ ├── Annotation.java
│ │ ├── FeatureMap.java
│ │ ├── FeaturePair.java
│ │ ├── FeatureValue.java
│ │ ├── IntPair.java
│ │ ├── PrimitiveUtils.java
│ │ ├── RuleWritable.java
│ │ └── TextPair.java
│ ├── distributional
│ │ ├── CommonLSH.java
│ │ ├── ContextWritable.java
│ │ ├── DistributionalContextCombiner.java
│ │ ├── DistributionalContextMapper.java
│ │ ├── DistributionalContextReducer.java
│ │ └── SignatureWritable.java
│ ├── extraction
│ │ ├── ExtractionCombiner.java
│ │ ├── ExtractionMapper.java
│ │ ├── ExtractionReducer.java
│ │ ├── HierarchicalRuleWritableExtractor.java
│ │ ├── RuleWritableExtractor.java
│ │ └── RuleWritableExtractorFactory.java
│ ├── features
│ │ ├── AbstractnessFeature.java
│ │ ├── AdjacentNonTerminalsFeature.java
│ │ ├── CharacterCompressionRatioFeature.java
│ │ ├── CharacterCountDifferenceFeature.java
│ │ ├── ConsumeSourceTerminalsFeature.java
│ │ ├── Feature.java
│ │ ├── GlueRuleFeature.java
│ │ ├── IdentityFeature.java
│ │ ├── LexicalityFeature.java
│ │ ├── MonotonicFeature.java
│ │ ├── PhrasePenaltyFeature.java
│ │ ├── ProduceTargetTerminalsFeature.java
│ │ ├── SimpleFeature.java
│ │ ├── SimpleFeatureFactory.java
│ │ ├── SourceWordCounterFeature.java
│ │ ├── TargetWordCounterFeature.java
│ │ ├── WordCompressionRatioFeature.java
│ │ ├── WordCountDifferenceFeature.java
│ │ ├── WordLengthDifferenceFeature.java
│ │ ├── WordLexicalProbabilityCalculator.java
│ │ ├── XRuleFeature.java
│ │ ├── annotation
│ │ │ ├── AlignmentFeature.java
│ │ │ ├── AnnotationFeature.java
│ │ │ ├── AnnotationFeatureFactory.java
│ │ │ ├── AnnotationFeatureJob.java
│ │ │ ├── AnnotationPassthroughFeature.java
│ │ │ ├── AnnotationReducer.java
│ │ │ ├── CountFeature.java
│ │ │ ├── LogCountFeature.java
│ │ │ ├── RarityPenaltyFeature.java
│ │ │ ├── SourceGivenTargetLexicalProbabilityFeature.java
│ │ │ ├── TargetGivenSourceLexicalProbabilityFeature.java
│ │ │ ├── UnalignedSourceCounterFeature.java
│ │ │ └── UnalignedTargetCounterFeature.java
│ │ ├── mapred
│ │ │ ├── CountOfRuleCountsEstimationJob.java
│ │ │ ├── GoodTuringSmoothedSourcePhraseGivenTargetFeature.java
│ │ │ ├── GoodTuringSmoothedTargetPhraseGivenSourceFeature.java
│ │ │ ├── LhsGivenSourcePhraseFeature.java
│ │ │ ├── LhsGivenTargetPhraseFeature.java
│ │ │ ├── MapReduceFeature.java
│ │ │ ├── MapReduceFeatureFactory.java
│ │ │ ├── SourceCountFeature.java
│ │ │ ├── SourcePhraseGivenLHSFeature.java
│ │ │ ├── SourcePhraseGivenTargetFeature.java
│ │ │ ├── SourcePhraseGivenTargetandLHSFeature.java
│ │ │ ├── TargetCountFeature.java
│ │ │ ├── TargetPhraseGivenLHSFeature.java
│ │ │ ├── TargetPhraseGivenSourceFeature.java
│ │ │ ├── TargetPhraseGivenSourceandLHSFeature.java
│ │ │ └── coc
│ │ │ │ ├── CountOfCountsEstimator.java
│ │ │ │ └── GoodTuringSmoother.java
│ │ └── pivot
│ │ │ ├── NonAggregatingPivotedFeature.java
│ │ │ ├── PivotedAnnotationFeature.java
│ │ │ ├── PivotedFeature.java
│ │ │ ├── PivotedFeatureFactory.java
│ │ │ ├── PivotedLexicalSourceGivenTargetFeature.java
│ │ │ ├── PivotedLexicalTargetGivenSourceFeature.java
│ │ │ ├── PivotedLhsGivenSourcePhraseFeature.java
│ │ │ ├── PivotedLhsGivenTargetPhraseFeature.java
│ │ │ ├── PivotedNegLogProbFeature.java
│ │ │ ├── PivotedRarityPenaltyFeature.java
│ │ │ ├── PivotedSourcePhraseGivenLHSFeature.java
│ │ │ ├── PivotedSourcePhraseGivenTargetAndLHSFeature.java
│ │ │ ├── PivotedSourcePhraseGivenTargetFeature.java
│ │ │ ├── PivotedTargetPhraseGivenLHSFeature.java
│ │ │ ├── PivotedTargetPhraseGivenSourceAndLHSFeature.java
│ │ │ └── PivotedTargetPhraseGivenSourceFeature.java
│ ├── jobs
│ │ ├── DefaultValues.java
│ │ ├── DistributionalContextExtractionJob.java
│ │ ├── DistributionalContextSortingJob.java
│ │ ├── ExtractionJob.java
│ │ ├── FeatureCollectionJob.java
│ │ ├── JobState.java
│ │ ├── OutputJob.java
│ │ ├── ParaphraseAggregationJob.java
│ │ ├── ParaphrasePivotingJob.java
│ │ ├── Scheduler.java
│ │ ├── SchedulerException.java
│ │ ├── SourceWordGivenTargetWordProbabilityJob.java
│ │ ├── TargetWordGivenSourceWordProbabilityJob.java
│ │ ├── ThraxJob.java
│ │ ├── VocabularyJob.java
│ │ └── WordLexprobJob.java
│ ├── output
│ │ └── OutputReducer.java
│ ├── paraphrasing
│ │ ├── AggregationCombiner.java
│ │ ├── AggregationMapper.java
│ │ ├── AggregationReducer.java
│ │ ├── FeatureCollectionReducer.java
│ │ ├── PivotingMapper.java
│ │ └── PivotingReducer.java
│ └── tools
│ │ ├── ExtractionTool.java
│ │ ├── FeatureTool.java
│ │ ├── OutputTool.java
│ │ ├── SourceWordGivenTargetWordProbabilityTool.java
│ │ └── TargetWordGivenSourceWordProbabilityTool.java
│ ├── lexprob
│ ├── HashMapLexprobTable.java
│ ├── LexicalProbabilityTable.java
│ ├── LexprobTest.java
│ ├── SequenceFileLexprobTable.java
│ ├── TableEntry.java
│ └── TrieLexprobTable.java
│ ├── syntax
│ ├── LatticeArray.java
│ ├── ParseLattice.java
│ └── ParseTree.java
│ ├── tools
│ ├── ExtractPropbankRules.java
│ ├── JudgeParaphrases.java
│ ├── ParaphraseCoverage.java
│ ├── ParaphraseIntersect.java
│ ├── ParaphraseOverlap.java
│ ├── ParaphraseScore.java
│ ├── ParaphraseWordNet.java
│ ├── SequenceToGrammar.java
│ ├── SequenceToSignatures.java
│ └── SplitAndFilter.java
│ └── util
│ ├── BackwardsCompatibility.java
│ ├── ConfFileParser.java
│ ├── CreateGlueGrammar.java
│ ├── DefaultConfigFileLoader.java
│ ├── ExternalizableToUtf8.java
│ ├── FormatUtils.java
│ ├── GrammarComparison.java
│ ├── HdfsUtils.java
│ ├── Intersect.java
│ ├── MalformedInput.java
│ ├── MalformedInput.properties
│ ├── MurmurHash.java
│ ├── NegLogMath.java
│ ├── SequenceFileCreator.java
│ ├── TestSetFilter.java
│ ├── Vocabulary.java
│ ├── amazon
│ └── AmazonConfigFileLoader.java
│ ├── exceptions
│ ├── ConfigurationException.java
│ ├── EmptyAlignmentException.java
│ ├── EmptySentenceException.java
│ ├── InconsistentAlignmentException.java
│ ├── MalformedInputException.java
│ ├── MalformedParseException.java
│ └── NotEnoughFieldsException.java
│ └── io
│ ├── InputUtilities.java
│ ├── LineReader.java
│ └── Reader.java
├── test
└── edu
│ └── jhu
│ └── thrax
│ ├── datatypes
│ └── ArrayAlignmentTest.java
│ ├── extraction
│ └── SAMTLabelerTest.java
│ ├── hadoop
│ └── features
│ │ └── mapred
│ │ └── coc
│ │ └── CountOfCountsEstimatorTest.java
│ ├── syntax
│ └── ParseTreeTest.java
│ └── util
│ └── io
│ └── InputUtilitiesTest.java
└── testng.xml
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | bin/
2 | test-output/
3 | doc/
4 | AwsCredentials.properties
5 |
6 | .DS_Store
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 | <name>Thrax</name>
 4 | <comment></comment>
 5 | <projects>
 6 | </projects>
 7 | <buildSpec>
 8 | <buildCommand>
 9 | <name>org.eclipse.jdt.core.javabuilder</name>
10 | <arguments>
11 | </arguments>
12 | </buildCommand>
13 | </buildSpec>
14 | <natures>
15 | <nature>org.eclipse.jdt.core.javanature</nature>
16 | </natures>
17 | </projectDescription>
18 | 
--------------------------------------------------------------------------------
/AwsCredentials.properties:
--------------------------------------------------------------------------------
1 | accessKey=
2 | secretKey=
3 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2010-13 the Thrax team
2 | Jonny Weese
3 | Juri Ganitkevitch
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | Thrax uses Apache Hadoop (an open-source implementation of MapReduce) to
2 | efficiently extract a synchronous context-free grammar translation model
3 | for use in modern machine translation systems.
4 |
5 | Thrax currently has support for both Hiero-style grammars (with a single
6 | non-terminal symbol) and SAMT-style grammars (where non-terminal symbols are
7 | calculated by projecting onto the span from a target-side parse tree).
8 |
9 | COMPILING:
10 |
11 | First, you need to set two environment variables:
12 | $HADOOP should point to the directory where Hadoop is installed.
13 | $AWS_SDK should point to the directory where the Amazon Web Services SDK
14 | is installed.
15 |
16 | To compile, type
17 |
18 | ant
19 |
20 | This will compile all classes and package them into a jar for use on a
21 | Hadoop cluster.
22 |
23 | At the end of the compilation, ant should report that the build was successful.
24 |
25 | RUNNING THRAX:
26 | Thrax can be invoked with
27 |
28 | hadoop jar $THRAX/bin/thrax.jar
29 |
30 | Some example configuration files have been included with this distribution:
31 |
32 | example/hiero.conf
33 | example/samt.conf
34 |
35 | COPYRIGHT AND LICENSE:
36 | Copyright (c) 2010-13 by the Thrax team:
37 | Jonny Weese
38 | Juri Ganitkevitch
39 |
40 | See LICENSE.txt (included with this distribution) for the complete terms.
41 |
--------------------------------------------------------------------------------
/build.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
--------------------------------------------------------------------------------
/example/counts/thrax-phrase.conf:
--------------------------------------------------------------------------------
1 | # this is an example Thrax configuration file
2 | # <- this symbol indicates a comment
3 | # each line should be a key-value pair separated by whitespace
4 |
5 | ###
6 | ### GRAMMAR OPTIONS
7 | ###
8 |
9 | grammar hiero # or samt
10 | reverse false
11 | source-is-parsed false
12 | target-is-parsed false
13 | # default-nt X # X is the default anyway
14 |
15 | min-rule-count 1
16 |
17 | # the number of reducers
18 | reducers 16
19 |
20 | # Maximum length of initial phrase pairs. These are set to be shorter than
21 | # used by Hiero.
22 | initial-phrase-length 5
23 | lex-source-words 5
24 | lex-target-words 5
25 |
26 | # maximum number of NTs in a rule
27 | arity 0
28 |
29 | # minimum number of aligned terminals in a rule
30 | lexicality 1
31 |
32 | # allow adjacent nonterminals on source side
33 | adjacent-nts false
34 |
35 | # allow unaligned words at boundaries of phrases
36 | loose true
37 |
38 | allow-abstract-rules false
39 | allow-nonlexical-x false
40 | allow-full-sentence-rules false
41 |
42 | nonlex-source-length 5
43 | nonlex-target-length 5
44 | nonlex-source-words 5
45 | nonlex-target-words 5
46 |
47 | allow-double-plus false
48 |
49 | rule-span-limit 12
50 |
51 | phrase-penalty 2.718
52 |
53 | # a whitespace-separated list of features
54 | # in this example, the features are phrase translation probability,
55 | # lexical probability, and phrase penalty
56 | # features phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count
57 | features e_given_f_phrase f_given_e_phrase e_given_f_lex f_given_e_lex rarity phrase-penalty alignment count
58 |
59 | # joshua is currently the only option and the default; later we will want to add formats
60 | # for other decoders such as moses and cdec, if they use other formats
61 | output-format joshua
62 |
63 | # label feature scores? each score will be output as name=score
64 | label-feature-scores false
65 |
66 | amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero
67 | amazon-jar s3://edu.jhu.cs.jonny/thrax.jar
68 | amazon-num-instances 15
69 |
70 | max-split-size 8388608
71 |
72 | # the format should be:
73 | # foreign sentence ||| english sentence ||| alignment
74 | # where the english is either parsed or not depending on whether you want
75 | # SAMT or you want Hiero.
76 | #input-file s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en
77 | input-file pipeline-es-en-phrase-_export_projects_mpost_language-packs_es-en_1.3/input-file
78 |
--------------------------------------------------------------------------------
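
The configuration comments above describe two plain-text formats: each config line is a
whitespace-separated key-value pair (with "#" starting a comment), and each line of the input
corpus is "source sentence ||| target sentence ||| alignment". A minimal, self-contained Java
sketch of splitting both formats; this is purely illustrative and is not the repository's own
ConfFileParser or input reader:

    import java.util.HashMap;
    import java.util.Map;

    public class FormatSketch {
      // Split one config line of the form "key value", ignoring blank lines and # comments.
      static void addConfigLine(Map<String, String> conf, String line) {
        String stripped = line.replaceAll("#.*", "").trim();
        if (stripped.isEmpty()) return;
        String[] kv = stripped.split("\\s+", 2);
        if (kv.length == 2) conf.put(kv[0], kv[1]);
      }

      public static void main(String[] args) {
        Map<String, String> conf = new HashMap<String, String>();
        addConfigLine(conf, "grammar hiero   # or samt");
        addConfigLine(conf, "initial-phrase-length 5");
        System.out.println(conf.get("grammar"));  // hiero

        // One line of the unified input format: source ||| target ||| alignment
        String line = "declaro reanudado ... ||| i declare resumed ... ||| 0-0 0-1 1-1";
        String[] fields = line.split("\\s*\\|\\|\\|\\s*");
        System.out.println(fields[2]);  // 0-0 0-1 1-1
      }
    }
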
/example/europarl.unified.1:
--------------------------------------------------------------------------------
1 | declaro reanudado el período de sesiones del parlamento europeo , interrumpido el viernes 17 de diciembre pasado , y reitero a sus señorías mi deseo de que hayan tenido unas buenas vacaciones . ||| i declare resumed the session of the european parliament adjourned on friday 17 december 1999 , and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period . ||| 0-0 0-1 1-1 2-1 3-1 0-2 0-3 5-4 4-5 6-5 8-6 8-7 7-8 10-9 12-10 11-11 12-11 13-12 14-13 15-13 16-13 16-14 17-15 18-16 19-17 19-18 19-19 19-20 19-21 20-22 21-24 22-24 25-29 24-31 26-32 27-33 28-34 30-35 31-36 29-37 30-37 31-37 31-38 32-39
2 |
--------------------------------------------------------------------------------
/example/nist09.unified.1:
--------------------------------------------------------------------------------
1 | اس ملک کا مغربی صحرائے راجھستان بھی مسلسل اپنے پانچ سال سے سخت خشک سالی کی لپیٹ میں ہے . ||| (TOP (S (NP (NP (NP (DT The) (NN country) (POS 's)) (JJ western) (NN desert) (NN state)) (PP (IN of) (NP (NNP Rajasthan)))) (VP (VBZ is) (ADVP (RB also)) (VP (VBG bracing) (PP (IN for) (NP (NP (PRP$ its) (JJ fifth) (JJ straight) (NN year)) (PP (IN of) (NP (NN drought))))))) (. .))) ||| 0-0 15-16 10-15 11-16 13-17 14-17 8-12 18-8 4-10 5-10 19-18 6-9 9-13 1-1 2-2 3-3
2 |
--------------------------------------------------------------------------------
/lib/aws-java-sdk-1.1.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/aws-java-sdk-1.1.3.jar
--------------------------------------------------------------------------------
/lib/commons-lang3-3.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/commons-lang3-3.1.jar
--------------------------------------------------------------------------------
/lib/hadoop-common-2.5.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/hadoop-common-2.5.2.jar
--------------------------------------------------------------------------------
/lib/hadoop-mapreduce-client-core-2.5.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/hadoop-mapreduce-client-core-2.5.2.jar
--------------------------------------------------------------------------------
/lib/jerboa.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/jerboa.jar
--------------------------------------------------------------------------------
/lib/testng-5.8-jdk15.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joshua-decoder/thrax/0d766be2e7d0fbbad0734064e55699d56048a30c/lib/testng-5.8-jdk15.jar
--------------------------------------------------------------------------------
/scripts/berant_to_reference.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os, sys, codecs
4 |
5 | def main():
6 | #
7 | #
8 | for line in sys.stdin:
9 | (source, target) = line.lstrip().rstrip().split("\t")
10 | (s_phr, s1, s2) = source[1:-1].split("::")
11 | (t_phr, t1, t2) = target[1:-1].split("::")
12 | if (s1[-2:] == t1[-2:]):
13 | t1 = "[1]"
14 | t2 = "[2]"
15 | else:
16 | t1 = "[2]"
17 | t2 = "[1]"
18 | s1 = "[1]"
19 | s2 = "[2]"
20 | print s1 + " " + s_phr + " " + s2 + " ||| " + t1 + " " + t_phr + " " + t2
21 |
22 |
23 | if __name__ == "__main__":
24 | main()
25 |
26 |
27 |
--------------------------------------------------------------------------------
/scripts/create_glue_grammar.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # this script just wraps a java call
3 |
4 | if [[ -z "$THRAX" ]]
5 | then
 6 | THRAX="`dirname $0`/.."
7 | fi
8 |
9 | java -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $1
10 |
11 |
--------------------------------------------------------------------------------
/scripts/filter_rules.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if (($# < 1))
4 | then
5 | cat << END_USAGE
6 | usage: filter_rules.sh [-v|-p|-f] [test set ...]
7 | -v verbose mode
8 | -p parallel compatibility: print blank lines, don't buffer output
9 | -f fast mode: not as aggressive
10 | END_USAGE
11 | exit 1
12 | fi
13 |
14 | java -Dfile.encoding=utf8 -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.TestSetFilter $*
15 |
16 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/datatypes/AlignedSentencePair.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.datatypes;
2 |
3 | import java.util.Arrays;
4 |
5 | public class AlignedSentencePair {
6 | public final int[] source;
7 | public final int[] target;
8 | public final Alignment alignment;
9 |
10 | public AlignedSentencePair(int[] ss, int[] ts, Alignment a) {
11 | source = ss;
12 | target = ts;
13 | alignment = a;
14 | }
15 |
16 | public boolean equals(Object o) {
17 | if (o == this) return true;
18 | if (!(o instanceof AlignedSentencePair)) return false;
19 | AlignedSentencePair other = (AlignedSentencePair) o;
20 | return Arrays.equals(source, other.source) && Arrays.equals(target, other.target)
21 | && alignment.equals(other.alignment);
22 | }
23 |
24 | public int hashCode() {
25 | int result = 137;
26 | result = result * 67 + Arrays.hashCode(source);
27 | result = result * 67 + Arrays.hashCode(target);
28 | result = result * 67 + alignment.hashCode();
29 | return result;
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/datatypes/Alignment.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.datatypes;
2 |
3 | import java.util.Iterator;
4 |
5 | /**
6 | * This interface represents a word-level alignment of a sentence pair.
7 | */
8 | public interface Alignment {
9 |
10 | public boolean sourceIndexIsAligned(int i);
11 |
12 | public boolean targetIndexIsAligned(int i);
13 |
14 | public int numTargetWordsAlignedTo(int i);
15 |
16 | public int numSourceWordsAlignedTo(int i);
17 |
18 | public Iterator<Integer> targetIndicesAlignedTo(int i);
19 |
20 | public Iterator<Integer> sourceIndicesAlignedTo(int i);
21 |
22 | public boolean consistentWith(int sourceLength, int targetLength);
23 | }
24 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/datatypes/IntPair.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.datatypes;
2 |
3 | import edu.jhu.thrax.util.FormatUtils;
4 |
5 | /**
6 | * A class that represents a pair of integers.
7 | */
 8 | public class IntPair implements Comparable<IntPair> {
9 |
10 | /**
11 | * The first integer of the pair ("car").
12 | */
13 | public final int fst;
14 |
15 | /**
16 | * The second integer of the pair ("cdr").
17 | */
18 | public final int snd;
19 |
20 | /**
21 | * Constructor that sets the two ints of the pair.
22 | *
23 | * @param a the first int of the pair
24 | * @param b the second int of the pair
25 | */
26 | public IntPair(int a, int b)
27 | {
28 | fst = a;
29 | snd = b;
30 | }
31 |
32 | /**
33 | * Create a new IntPair that is the reverse of this pair; that is, puts
34 | * the second int first and the first int second.
35 | */
36 | public IntPair reverse()
37 | {
38 | return new IntPair(snd, fst);
39 | }
40 |
41 | /**
42 | * Builds a pair from the type of String that you would see in Berkeley
43 | * aligner output. For example, the String "3-4" would yield the pair
44 | * (3,4).
45 | *
46 | * @param s a string in Berkeley aligner format
47 | * @return a new IntPair representing that string
48 | */
49 | public static IntPair fromHyphenatedString(String s)
50 | {
51 | String [] nums = FormatUtils.P_DASH.split(s);
52 | if (nums.length != 2) {
53 | return null;
54 | }
55 | return new IntPair(Integer.parseInt(nums[0]), Integer.parseInt(nums[1]));
56 | }
57 |
58 | public String toString()
59 | {
60 | return String.format("(%d,%d)", fst, snd);
61 | }
62 |
63 | public boolean equals(Object o)
64 | {
65 | if (o instanceof IntPair) {
66 | IntPair ip = (IntPair) o;
67 | return this.fst == ip.fst && this.snd == ip.snd;
68 | }
69 | return false;
70 | }
71 |
72 | public int compareTo(IntPair ip)
73 | {
74 | if (this.fst == ip.fst) {
75 | return this.snd - ip.snd;
76 | }
77 | return this.fst - ip.fst;
78 | }
79 |
80 | public int hashCode()
81 | {
82 | return fst * 37 + snd;
83 | }
84 |
85 | }
86 |
--------------------------------------------------------------------------------
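
A small usage sketch for the Berkeley-aligner format described in the javadoc of
fromHyphenatedString; the alignment string below is made up and the snippet is not part of the
repository:

    import edu.jhu.thrax.datatypes.IntPair;

    public class IntPairSketch {
      public static void main(String[] args) {
        // One source-target link per token, e.g. "3-4" means source word 3 aligns to target word 4.
        for (String link : "0-0 1-2 3-4".split("\\s+")) {
          IntPair pair = IntPair.fromHyphenatedString(link);       // "3-4" -> (3,4)
          System.out.println(pair + " reversed: " + pair.reverse());  // (3,4) reversed: (4,3)
        }
      }
    }
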
/src/edu/jhu/thrax/distributional/ContextPhrase.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.distributional;
2 |
3 | import org.apache.hadoop.io.IntWritable;
4 | import org.apache.hadoop.io.MapWritable;
5 | import org.apache.hadoop.io.Text;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | public class ContextPhrase {
9 |
10 | private final Text phrase;
11 |
12 | private MapWritable features;
13 |
14 | public ContextPhrase(String phrase) {
15 | this.phrase = new Text(phrase);
16 | this.features = new MapWritable();
17 | }
18 |
19 | public void addFeature(String feature_name) {
20 | addFeature(feature_name, 1);
21 | }
22 |
23 | public void addFeature(String feature_name, int feature_value) {
24 | Text feature_text = new Text(feature_name);
25 | Writable current_value = features.get(feature_text);
26 | if (current_value != null)
27 | features.put(feature_text, new IntWritable(((IntWritable) current_value).get()
28 | + feature_value));
29 | else
30 | features.put(feature_text, new IntWritable(feature_value));
31 | }
32 |
33 | public Text getPhrase() {
34 | return phrase;
35 | }
36 |
37 | public MapWritable getFeatures() {
38 | return features;
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/distributional/FeatureClass.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.distributional;
2 |
3 | import edu.jhu.thrax.distributional.FeatureTypes.Label;
4 | import edu.jhu.thrax.distributional.FeatureTypes.Type;
5 |
6 | public class FeatureClass {
7 | public final Type type;
8 | public final Label label;
9 | public final int max_context;
10 | public final int max_gram;
11 |
12 | public FeatureClass(Type type, Label label) {
13 | this(type, label, -1, -1);
14 | }
15 |
16 | public FeatureClass(Type type, Label label, int max_context, int max_gram) {
17 | this.type = type;
18 | this.label = label;
19 | this.max_context = max_context;
20 | this.max_gram = max_gram;
21 | }
22 | }
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/distributional/FeatureEncoder.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.distributional;
2 |
3 | import edu.jhu.thrax.distributional.FeatureTypes.Directionality;
4 | import edu.jhu.thrax.distributional.FeatureTypes.Flavor;
5 | import edu.jhu.thrax.distributional.FeatureTypes.Label;
6 | import edu.jhu.thrax.distributional.FeatureTypes.Type;
7 |
8 | public class FeatureEncoder {
9 |
10 | public static long encode(Type type, Label label, Flavor flavor, Directionality directionality) {
11 | return 0;
12 | }
13 |
14 | public static String type(long coded) {
15 | int feature_code = (int) (coded >> 32);
16 |
17 | return Integer.toString(feature_code);
18 | }
19 |
20 | public static int feature(long coded) {
21 | return (int) (coded & 0x00000000FFFFFFFF);
22 | }
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/distributional/FeatureSet.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.distributional;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import edu.jhu.thrax.distributional.FeatureTypes.Label;
7 | import edu.jhu.thrax.distributional.FeatureTypes.Type;
8 | import edu.jhu.thrax.util.FormatUtils;
9 |
10 |
11 | public class FeatureSet {
12 |
13 | private Set<FeatureClass> features;
14 |
15 | private boolean active[][];
16 |
17 | public FeatureSet() {
18 | features = new HashSet<FeatureClass>();
19 | active = new boolean[Type.values().length][Label.values().length];
20 | }
21 |
22 | public void addFeatureClass(String entry) {
23 | String[] fields = FormatUtils.P_DASH.split(entry);
24 | for (String f : fields) {
25 | System.err.println(f);
26 | }
27 | }
28 |
29 | public void addFeatureSet(FeatureSet set) {
30 | for (FeatureClass fc : set.features)
31 | this.features.add(fc);
32 |
33 | for (int i = 0; i < active.length; ++i)
34 | for (int j = 0; j < active[i].length; ++j)
35 | active[i][j] = active[i][j] || set.active[i][j];
36 | }
37 |
38 | public boolean active(Type type, Label label) {
39 | return active[type.code][label.code];
40 | }
41 |
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/distributional/FeatureTypes.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.distributional;
2 |
3 | import java.util.HashMap;
4 | import java.util.Map;
5 |
6 | public class FeatureTypes {
7 |
8 | public enum Type {
9 | NGRAM(0, "ngram"), SYN(1, "syn"), DEP(2, "dep"), CDEP(3, "cdep"), CPDEP(4, "cpdep");
10 |
11 | private static Map<Integer, Type> map;
12 |
13 | static {
14 | map = new HashMap<Integer, Type>();
15 | for (Type t : Type.values())
16 | map.put(t.code, t);
17 | }
18 |
19 | public static Type get(int code) {
20 | return map.get(code);
21 | }
22 |
23 | public final int code;
24 | public final String name;
25 |
26 | Type(int code, String name) {
27 | this.code = code;
28 | this.name = name;
29 | }
30 | }
31 |
32 | public enum Label {
33 | NONE(0, "none"), LEX(1, "lex"), LEM(2, "lem"), POS(3, "pos"), NER(4, "ner");
34 |
35 | public final int code;
36 | public final String name;
37 |
38 | Label(int code, String name) {
39 | this.code = code;
40 | this.name = name;
41 | }
42 | }
43 |
44 | public enum Directionality {
45 | NONE(0, "none"), LEFT(1, "left"), RIGHT(2, "right"), CENTER(3, "center");
46 |
47 | public final int code;
48 | public final String name;
49 |
50 | Directionality(int code, String name) {
51 | this.code = code;
52 | this.name = name;
53 | }
54 | }
55 |
56 | public enum Flavor {
57 | NONE(0, "none"), GOV(1, "gov"), DEP(2, "dep"), HEAD(3, "head");
58 |
59 | public final int code;
60 | public final String name;
61 |
62 | Flavor(int code, String name) {
63 | this.code = code;
64 | this.name = name;
65 | }
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/extraction/HieroLabeler.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.extraction;
2 |
3 | public class HieroLabeler implements SpanLabeler
4 | {
5 | private final int label;
6 |
7 | public HieroLabeler(int s)
8 | {
9 | label = s;
10 | }
11 |
12 | public int getLabel(int start, int end)
13 | {
14 | return label;
15 | }
16 | }
17 |
18 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/extraction/LabelCache.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.extraction;
2 |
3 | import java.util.HashMap;
4 |
5 | import edu.jhu.thrax.util.Vocabulary;
6 |
7 | public enum LabelCache {
8 | SLASH("/"), BACKSLASH("\\"), PLUS("+");
9 |
10 | private HashMap<Long, Integer> cache = new HashMap<Long, Integer>();
11 | private String glue;
12 |
13 | private LabelCache(String g) {
14 | glue = g;
15 | }
16 |
17 | public final int get(int left, int right) {
18 | long key = ((long) left << 32) | ((long) right & 0x00000000FFFFFFFFL);
19 | Integer val = cache.get(key);
20 | if (val == null) {
21 | val = join(left, right, glue);
22 | cache.put(key, val);
23 | }
24 | return val;
25 | }
26 |
27 | private static final int join(int a, int b, String glue) {
28 | String word_a = Vocabulary.word(a);
29 | String word_b = Vocabulary.word(b);
30 | return Vocabulary.id(word_a.substring(0, word_a.length() - 1) + glue
31 | + word_b.substring(1));
32 | }
33 | }
--------------------------------------------------------------------------------
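
LabelCache keys its memo table on a single long that packs the two label ids. A tiny sketch of
that packing and of recovering both ints, shown here only to make the bit arithmetic concrete
(not repository code):

    public class KeyPackingSketch {
      public static void main(String[] args) {
        int left = 42, right = -7;
        // Left id in the high 32 bits, right id in the low 32 bits; the mask keeps a
        // negative right id from smearing sign bits into the high half.
        long key = ((long) left << 32) | ((long) right & 0x00000000FFFFFFFFL);
        int unpackedLeft = (int) (key >> 32);
        int unpackedRight = (int) key;
        System.out.println(unpackedLeft + " " + unpackedRight);  // 42 -7
      }
    }
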
/src/edu/jhu/thrax/extraction/Labeling.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.extraction;
2 |
3 | public enum Labeling {
4 | HIERO, SYNTAX, MANUAL;
5 | }
6 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/extraction/ManualSpanLabeler.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.extraction;
2 |
3 | public class ManualSpanLabeler implements SpanLabeler
4 | {
5 | private final int [] labels;
6 | private final int defaultLabel;
7 | private final int sentenceLength;
8 |
9 | public ManualSpanLabeler(int[] ls, int def)
10 | {
11 | labels = ls;
12 | defaultLabel = def;
13 | sentenceLength = getSentenceLength(labels.length);
14 | }
15 |
16 | public int getLabel(int from, int to)
17 | {
18 | int idx = getLabelIndex(from, to, sentenceLength);
19 | if (idx >= labels.length || idx < 0) {
20 | return defaultLabel;
21 | }
22 | else {
23 | return labels[idx];
24 | }
25 | }
26 |
27 | private static int getSentenceLength(int numLabels)
28 | {
29 | if (numLabels < 0)
30 | return 0;
31 | // 0 labels => sentence length 0
32 | // 1 label => 1
33 | // 3 labels => 2
34 | // T_n labels => n, where T_n is the nth triangle number
35 | int result = 0;
36 | int triangle = 0;
37 | while (triangle != numLabels) {
38 | result++;
39 | triangle += result;
40 | }
41 | return result;
42 | }
43 |
44 | private static int getLabelIndex(int from, int to, int length)
45 | {
46 | // let the length of the target sentence be L
47 | // the first L labels are for spans (0,1) ... (0,L)
48 | // the next L - 1 are for (1,2) ... (1,L)
49 | // and so on
50 | int result = 0;
51 | int offset = length;
52 | for (int i = 0; i < from; i++) {
53 | result += offset;
54 | offset--;
55 | }
56 | int difference = to - from - 1;
57 | result += difference;
58 | return result;
59 | }
60 | }
61 |
62 |
--------------------------------------------------------------------------------
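
The comments in getSentenceLength and getLabelIndex describe a triangular layout: a target
sentence of length L has L + (L-1) + ... + 1 labeled spans, stored row by row as (0,1)...(0,L),
then (1,2)...(1,L), and so on. A hedged worked example for L = 3 (the label values are arbitrary
and the snippet is not part of the repository):

    import edu.jhu.thrax.extraction.ManualSpanLabeler;

    public class SpanLabelSketch {
      public static void main(String[] args) {
        // Six labels for a length-3 sentence, laid out as
        // (0,1) (0,2) (0,3) (1,2) (1,3) (2,3)  ->  indices 0..5
        int[] labels = {10, 11, 12, 13, 14, 15};
        ManualSpanLabeler labeler = new ManualSpanLabeler(labels, -1);
        System.out.println(labeler.getLabel(0, 3));  // 12 (index 2)
        System.out.println(labeler.getLabel(1, 2));  // 13 (index 3)
        System.out.println(labeler.getLabel(2, 3));  // 15 (index 5)
      }
    }
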
/src/edu/jhu/thrax/extraction/SpanLabeler.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.extraction;
2 |
3 | public interface SpanLabeler
4 | {
5 | public int getLabel(int start, int end);
6 | }
7 |
8 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/comparators/FieldComparator.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.comparators;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.io.WritableComparator;
6 | import org.apache.hadoop.io.WritableUtils;
7 |
8 | public class FieldComparator {
9 | private final int fieldNumber;
10 | private final WritableComparator comparator;
11 |
12 | public int offset;
13 |
14 | public FieldComparator(int field, WritableComparator comparator) {
15 | if (field < 0)
16 | throw new IllegalArgumentException("TextFieldComparator: cannot compare field of index "
17 | + field);
18 | fieldNumber = field;
19 | this.comparator = comparator;
20 | }
21 |
22 | public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) throws IOException {
23 | int start1 = getFieldStart(fieldNumber, b1, s1);
24 | int start2 = getFieldStart(fieldNumber, b2, s2);
25 |
26 | int length1 = getFieldLength(b1, start1);
27 | int length2 = getFieldLength(b2, start2);
28 |
29 | // TODO: l1 and l2 may need to be adjusted to reflect offset.
30 | return comparator.compare(b1, start1, length1, b2, start2, length2);
31 | }
32 |
33 | private final int getFieldStart(int field, byte[] bytes, int start) throws IOException {
34 | // if we want the first field, just return current start
35 | if (field == 0) return start;
36 | // otherwise, find out how long this field is ...
37 | int fieldLength = getFieldLength(bytes, start);
38 | // then decrement the field number and find the next start
39 | return getFieldStart(field - 1, bytes, start + fieldLength);
40 | }
41 |
42 | private static final int getFieldLength(byte[] bytes, int start) throws IOException {
43 | // Text is serialized as vInt (the length) plus that many bytes
44 | int vint_size = WritableUtils.decodeVIntSize(bytes[start]);
45 | int field_length = WritableComparator.readVInt(bytes, start);
46 | return vint_size + field_length;
47 | }
48 |
49 | public int fieldEndIndex(byte[] bytes, int start) throws IOException {
50 | int fieldStart = getFieldStart(fieldNumber, bytes, start);
51 | int fieldLength = getFieldLength(bytes, fieldStart);
52 | return fieldStart + fieldLength;
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
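
As the comments note, a Hadoop Text is serialized as a vInt length followed by that many bytes,
so FieldComparator can walk a record of concatenated Texts and hand a single field to the wrapped
comparator. A minimal sketch, assuming Hadoop's Text and DataOutputBuffer are on the classpath;
it is illustrative and not taken from the repository's tests:

    import org.apache.hadoop.io.DataOutputBuffer;
    import org.apache.hadoop.io.Text;

    import edu.jhu.thrax.hadoop.comparators.FieldComparator;

    public class FieldComparatorSketch {
      // Serialize two Texts back to back: each is written as a vInt length plus its bytes.
      static DataOutputBuffer record(String first, String second) throws Exception {
        DataOutputBuffer buf = new DataOutputBuffer();
        new Text(first).write(buf);
        new Text(second).write(buf);
        return buf;
      }

      public static void main(String[] args) throws Exception {
        DataOutputBuffer r1 = record("same-key", "apple");
        DataOutputBuffer r2 = record("same-key", "banana");
        // Compare field 1 (the second Text) of the two serialized records.
        FieldComparator cmp = new FieldComparator(1, new Text.Comparator());
        int c = cmp.compare(r1.getData(), 0, r1.getLength(), r2.getData(), 0, r2.getLength());
        System.out.println(c < 0);  // true: "apple" sorts before "banana"
      }
    }
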
/src/edu/jhu/thrax/hadoop/comparators/PrimitiveArrayMarginalComparator.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.comparators;
2 |
3 | import org.apache.hadoop.io.Text;
4 | import org.apache.hadoop.io.WritableComparator;
5 | import org.apache.hadoop.io.WritableUtils;
6 |
7 | /**
8 | * Compares two primitive array objects lexicographically, except the zero-length array should be
9 | * sorted before any other.
10 | */
11 | public class PrimitiveArrayMarginalComparator extends WritableComparator {
12 |
13 | public static final int[] MARGINAL = new int[0];
14 |
15 | public PrimitiveArrayMarginalComparator() {
16 | super(Text.class);
17 | }
18 |
19 | public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
20 | int h1 = WritableUtils.decodeVIntSize(b1[s1]);
21 | int length1 = (h1 == 1 ? b1[s1] : -1);
22 |
23 | int h2 = WritableUtils.decodeVIntSize(b2[s2]);
24 | int length2 = (h2 == 1 ? b2[s2] : -1);
25 |
26 | if (length1 == 0 && length2 == 0) return 0;
27 | if (length1 == 0) return -1;
28 | if (length2 == 0) return 1;
29 | return WritableComparator.compareBytes(b1, s1 + h1, l1 - h1, b2, s2 + h2, l2 - h2);
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/comparators/TextMarginalComparator.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.comparators;
2 |
3 | import org.apache.hadoop.io.Text;
4 | import org.apache.hadoop.io.WritableComparator;
5 | import org.apache.hadoop.io.WritableUtils;
6 |
7 | /**
8 | * Compares two Text objects lexicographically, except the Text "/MARGINAL/"
9 | * should be sorted before any other string.
10 | */
11 | public class TextMarginalComparator extends WritableComparator
12 | {
13 | private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
14 |
15 | public static final Text MARGINAL = new Text("/MARGINAL/");
16 | private static final byte [] MARGINAL_BYTES = MARGINAL.getBytes();
17 | private static final int MARGINAL_LENGTH = MARGINAL.getLength();
18 |
19 | public TextMarginalComparator()
20 | {
21 | super(Text.class);
22 | }
23 |
24 | public int compare(byte [] b1, int s1, int l1,
25 | byte [] b2, int s2, int l2)
26 | {
27 | // if they're equal, return zero
28 | int cmp = TEXT_COMPARATOR.compare(b1, s1, l1, b2, s2, l2);
29 | if (cmp == 0) {
30 | return 0;
31 | }
32 | // else if the first string is "/MARGINAL/", return -1
33 | int vIntSize = WritableUtils.decodeVIntSize(b1[s1]);
34 | int cmpMarginal = compareBytes(b1, s1 + vIntSize, l1 - vIntSize,
35 | MARGINAL_BYTES, 0, MARGINAL_LENGTH);
36 | if (cmpMarginal == 0)
37 | return -1;
38 | // else if the second is "/MARGINAL/", return 1
39 | vIntSize = WritableUtils.decodeVIntSize(b2[s2]);
40 | cmpMarginal = compareBytes(b2, s2 + vIntSize, l2 - vIntSize,
41 | MARGINAL_BYTES, 0, MARGINAL_LENGTH);
42 | if (cmpMarginal == 0)
43 | return 1;
44 | // else, just return the result of the comparison
45 | return cmp;
46 | }
47 | }
48 |
49 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/datatypes/Annotation.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.datatypes;
2 |
3 | import java.io.DataInput;
4 | import java.io.DataOutput;
5 | import java.io.IOException;
6 |
7 | import org.apache.hadoop.io.Writable;
8 | import org.apache.hadoop.io.WritableUtils;
9 |
10 | public class Annotation implements Writable {
11 |
12 | // Source-to-target alignment.
13 | private AlignmentWritable f2e = null;
14 |
15 | // Rule occurrence count.
16 | private int count;
17 |
18 | public Annotation() {
19 | count = 0;
20 | }
21 |
22 | public Annotation(int c) {
23 | count = c;
24 | }
25 |
26 | public Annotation(Annotation a) {
27 | count = a.count;
28 | this.f2e = new AlignmentWritable(a.f2e);
29 | }
30 |
31 | public Annotation(AlignmentWritable f2e) {
32 | count = 1;
33 | this.f2e = f2e;
34 | }
35 |
36 | public void merge(Annotation that) {
37 | this.count += that.count;
38 | }
39 |
40 | @Override
41 | public void readFields(DataInput in) throws IOException {
42 | boolean has_alignments = false;
43 | count = WritableUtils.readVInt(in);
44 | if (count < 0) {
45 | count = -count;
46 | has_alignments = true;
47 | }
48 | if (has_alignments) {
49 | f2e = new AlignmentWritable();
50 | f2e.readFields(in);
51 | }
52 | }
53 |
54 | @Override
55 | public void write(DataOutput out) throws IOException {
56 | WritableUtils.writeVInt(out, (f2e != null ? -count : count));
57 | if (f2e != null) f2e.write(out);
58 | }
59 |
60 | public AlignmentWritable e2f() {
61 | return f2e.flip();
62 | }
63 |
64 | public AlignmentWritable f2e() {
65 | return f2e;
66 | }
67 |
68 | public void setAlignment(AlignmentWritable a) {
69 | f2e = a;
70 | }
71 |
72 | public int count() {
73 | return count;
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
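
Annotation avoids a separate boolean flag when it is serialized: the occurrence count is written
as a vInt whose sign records whether an AlignmentWritable follows. A small round-trip sketch of
the count-only case, assuming Hadoop's DataOutputBuffer and DataInputBuffer; illustrative only:

    import org.apache.hadoop.io.DataInputBuffer;
    import org.apache.hadoop.io.DataOutputBuffer;

    import edu.jhu.thrax.hadoop.datatypes.Annotation;

    public class AnnotationSketch {
      public static void main(String[] args) throws Exception {
        Annotation original = new Annotation(5);  // count 5, no alignment attached

        DataOutputBuffer out = new DataOutputBuffer();
        original.write(out);  // writes +5, since no alignment is present

        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        Annotation copy = new Annotation();
        copy.readFields(in);  // a negative vInt here would trigger reading an alignment

        System.out.println(copy.count());  // 5
      }
    }
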
/src/edu/jhu/thrax/hadoop/datatypes/FeatureMap.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.datatypes;
2 |
3 | import java.io.DataInput;
4 | import java.io.DataOutput;
5 | import java.io.IOException;
6 | import java.util.HashMap;
7 | import java.util.Map;
8 | import java.util.Set;
9 |
10 | import org.apache.hadoop.io.FloatWritable;
11 | import org.apache.hadoop.io.Writable;
12 | import org.apache.hadoop.io.WritableUtils;
13 |
14 | import edu.jhu.thrax.hadoop.features.annotation.AnnotationPassthroughFeature;
15 | import edu.jhu.thrax.util.Vocabulary;
16 |
17 | public class FeatureMap implements Writable {
18 |
19 | private Map<Integer, Writable> map;
20 |
21 | public FeatureMap() {
22 | map = new HashMap<Integer, Writable>();
23 | }
24 |
25 | public FeatureMap(FeatureMap fm) {
26 | this();
27 | for (int key : fm.map.keySet())
28 | this.map.put(key, fm.map.get(key));
29 | }
30 |
31 | public Writable get(int key) {
32 | return map.get(key);
33 | }
34 |
35 | public Writable get(String key) {
36 | return map.get(Vocabulary.id(key));
37 | }
38 |
39 | public void put(int key, Writable val) {
40 | map.put(key, val);
41 | }
42 |
43 | public void put(String key, Writable val) {
44 | map.put(Vocabulary.id(key), val);
45 | }
46 |
47 | public boolean containsKey(int key) {
48 | return map.containsKey(key);
49 | }
50 |
51 | public Set<Integer> keySet() {
52 | return map.keySet();
53 | }
54 |
55 | @Override
56 | public void readFields(DataInput in) throws IOException {
57 | map.clear();
58 | int size = WritableUtils.readVInt(in);
59 | for (int i = 0; i < size; ++i) {
60 | int key = 0;
61 | Writable val = null;
62 | key = WritableUtils.readVInt(in);
63 | if (key == Vocabulary.id(AnnotationPassthroughFeature.NAME)) {
64 | val = new Annotation();
65 | val.readFields(in);
66 | } else {
67 | val = new FloatWritable();
68 | val.readFields(in);
69 | }
70 | map.put(key, val);
71 | }
72 | }
73 |
74 | @Override
75 | public void write(DataOutput out) throws IOException {
76 | WritableUtils.writeVInt(out, map.size());
77 | for (int key : map.keySet()) {
78 | WritableUtils.writeVInt(out, key);
79 | if (key == Vocabulary.id(AnnotationPassthroughFeature.NAME)) {
80 | ((Annotation) this.get(key)).write(out);
81 | } else {
82 | ((FloatWritable) this.get(key)).write(out);
83 | }
84 | }
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/datatypes/FeaturePair.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.datatypes;
2 |
3 | import java.io.DataInput;
4 | import java.io.DataOutput;
5 | import java.io.IOException;
6 |
7 | import org.apache.hadoop.io.Writable;
8 | import org.apache.hadoop.io.WritableUtils;
9 |
10 | import edu.jhu.thrax.util.Vocabulary;
11 |
12 | public class FeaturePair implements Writable {
13 | public int key;
14 | public FeatureValue val;
15 |
16 | public FeaturePair() {
17 | key = 0;
18 | val = new FeatureValue();
19 | }
20 |
21 | public FeaturePair(int k, Writable v) {
22 | key = k;
23 | val = new FeatureValue(v);
24 | }
25 |
26 | public void write(DataOutput out) throws IOException {
27 | WritableUtils.writeVInt(out, key);
28 | val.write(out);
29 | }
30 |
31 | public void readFields(DataInput in) throws IOException {
32 | key = WritableUtils.readVInt(in);
33 | val.readFields(in);
34 | }
35 |
36 | public int hashCode() {
37 | return key * 163 + val.hashCode();
38 | }
39 |
40 | public boolean equals(Object o) {
41 | if (o instanceof FeaturePair) {
42 | FeaturePair that = (FeaturePair) o;
43 | return key == that.key && val.equals(that.val);
44 | }
45 | return false;
46 | }
47 |
48 | public String toString() {
49 | return Vocabulary.word(key) + "=" + val.toString();
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/datatypes/FeatureValue.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.datatypes;
2 |
3 | import org.apache.hadoop.io.FloatWritable;
4 | import org.apache.hadoop.io.GenericWritable;
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Text;
7 | import org.apache.hadoop.io.Writable;
8 |
9 | public class FeatureValue extends GenericWritable {
10 |
11 | @SuppressWarnings("rawtypes")
12 | private static Class[] TYPES = {FloatWritable.class, IntWritable.class, Text.class,
13 | Annotation.class, AlignmentWritable.class};
14 |
15 | FeatureValue() {}
16 |
17 | FeatureValue(Writable val) {
18 | this.set(val);
19 | }
20 |
21 | @SuppressWarnings("unchecked")
22 | @Override
23 | protected Class<? extends Writable>[] getTypes() {
24 | return TYPES;
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/datatypes/IntPair.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.datatypes;
2 |
3 | import java.io.DataInput;
4 | import java.io.DataOutput;
5 | import java.io.IOException;
6 |
7 | import org.apache.hadoop.io.WritableComparable;
8 | import org.apache.hadoop.io.WritableComparator;
9 |
10 | public class IntPair implements WritableComparable<IntPair> {
11 | public int fst;
12 | public int snd;
13 |
14 | public IntPair() {
15 | // do nothing
16 | }
17 |
18 | public IntPair(int car, int cdr) {
19 | fst = car;
20 | snd = cdr;
21 | }
22 |
23 | public void reverse() {
24 | int tmp = fst;
25 | fst = snd;
26 | snd = tmp;
27 | }
28 |
29 | public void write(DataOutput out) throws IOException {
30 | out.writeInt(fst);
31 | out.writeInt(snd);
32 | }
33 |
34 | public void readFields(DataInput in) throws IOException {
35 | fst = in.readInt();
36 | snd = in.readInt();
37 | }
38 |
39 | public int hashCode() {
40 | return fst * 163 + snd;
41 | }
42 |
43 | public boolean equals(Object o) {
44 | if (o instanceof IntPair) {
45 | IntPair ip = (IntPair) o;
46 | return fst == ip.fst && snd == ip.snd;
47 | }
48 | return false;
49 | }
50 |
51 | public String toString() {
52 | return fst + "\t" + snd;
53 | }
54 |
55 | public int compareTo(IntPair ip) {
56 | int cmp = ip.fst - fst;
57 | if (cmp != 0) {
58 | return cmp;
59 | }
60 | return ip.snd - snd;
61 | }
62 |
63 | public static class Comparator extends WritableComparator {
64 | public Comparator() {
65 | super(IntPair.class);
66 | }
67 |
68 | public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
69 | int fst1 = readInt(b1, s1);
70 | int fst2 = readInt(b2, s2);
71 | if (fst1 != fst2) {
72 | return fst2 - fst1;
73 | }
74 | int snd1 = readInt(b1, s1 + 4);
75 | int snd2 = readInt(b2, s2 + 4);
76 | return snd2 - snd1;
77 | }
78 | }
79 |
80 | static {
81 | WritableComparator.define(IntPair.class, new Comparator());
82 | }
83 |
84 | }
85 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/distributional/CommonLSH.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.distributional;
2 |
3 | import org.apache.hadoop.conf.Configuration;
4 |
5 | import edu.jhu.jerboa.sim.SLSH;
6 |
7 | public class CommonLSH {
8 |
9 | public static SLSH getSLSH(Configuration conf) {
10 | SLSH slsh = null;
11 | try {
12 | slsh = new SLSH();
13 | slsh.initialize(conf.getInt("thrax.lsh-num-bits", 256),
14 | conf.getInt("thrax.lsh-pool-size", 100000), conf.getInt("thrax.lsh-random-seed", 42));
15 | } catch (Exception e) {
16 | e.printStackTrace();
17 | System.exit(1);
18 | }
19 | return slsh;
20 | }
21 |
22 | }
23 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/distributional/DistributionalContextCombiner.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.distributional;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.io.Text;
7 | import org.apache.hadoop.mapreduce.Reducer;
8 |
9 | import edu.jhu.jerboa.sim.SLSH;
10 |
11 | public class DistributionalContextCombiner
12 | extends Reducer<Text, ContextWritable, Text, ContextWritable> {
13 |
14 | private SLSH slsh;
15 |
16 | public void setup(Context context) throws IOException, InterruptedException {
17 | Configuration conf = context.getConfiguration();
18 | slsh = CommonLSH.getSLSH(conf);
19 | }
20 |
21 | protected void reduce(Text key, Iterable<ContextWritable> values, Context context)
22 | throws IOException, InterruptedException {
23 | ContextWritable combined = new ContextWritable();
24 | for (ContextWritable input : values) {
25 | combined.merge(input, slsh);
26 | }
27 | if (!combined.compacted.get()) combined.compact(slsh);
28 | context.write(key, combined);
29 | return;
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/distributional/DistributionalContextMapper.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.distributional;
2 |
3 | import java.io.IOException;
4 | import java.util.List;
5 |
6 | import org.apache.hadoop.conf.Configuration;
7 | import org.apache.hadoop.io.LongWritable;
8 | import org.apache.hadoop.io.Text;
9 | import org.apache.hadoop.mapreduce.Mapper;
10 |
11 | import edu.jhu.thrax.distributional.ContextPhrase;
12 | import edu.jhu.thrax.distributional.ContextPhraseExtractor;
13 | import edu.jhu.thrax.util.MalformedInput;
14 | import edu.jhu.thrax.util.exceptions.EmptySentenceException;
15 | import edu.jhu.thrax.util.exceptions.MalformedInputException;
16 | import edu.jhu.thrax.util.exceptions.MalformedParseException;
17 | import edu.jhu.thrax.util.exceptions.NotEnoughFieldsException;
18 |
19 | public class DistributionalContextMapper extends Mapper<LongWritable, Text, Text, ContextWritable> {
20 |
21 | private ContextPhraseExtractor extractor;
22 |
23 | protected void setup(Context context) throws IOException, InterruptedException {
24 | Configuration conf = context.getConfiguration();
25 | extractor = new ContextPhraseExtractor(conf);
26 | }
27 |
28 | protected void map(LongWritable key, Text value, Context context) throws IOException,
29 | InterruptedException {
30 | if (extractor == null) return;
31 | String line = value.toString();
32 | try {
33 | List<ContextPhrase> phrases = extractor.extract(line);
34 | for (ContextPhrase cp : phrases) {
35 | context.write(cp.getPhrase(), new ContextWritable(1, cp.getFeatures()));
36 | }
37 | } catch (NotEnoughFieldsException e) {
38 | context.getCounter(MalformedInput.NOT_ENOUGH_FIELDS).increment(1);
39 | } catch (EmptySentenceException e) {
40 | context.getCounter(MalformedInput.EMPTY_SENTENCE).increment(1);
41 | } catch (MalformedParseException e) {
42 | context.getCounter(MalformedInput.MALFORMED_PARSE).increment(1);
43 | } catch (MalformedInputException e) {
44 | context.getCounter(MalformedInput.UNKNOWN).increment(1);
45 | }
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/distributional/DistributionalContextReducer.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.distributional;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.io.NullWritable;
7 | import org.apache.hadoop.io.Text;
8 | import org.apache.hadoop.mapreduce.Reducer;
9 |
10 | import edu.jhu.jerboa.sim.SLSH;
11 | import edu.jhu.jerboa.sim.Signature;
12 |
13 | public class DistributionalContextReducer
14 | extends Reducer<Text, ContextWritable, SignatureWritable, NullWritable> {
15 |
16 | private int minCount;
17 | private SLSH slsh;
18 |
19 | public void setup(Context context) throws IOException, InterruptedException {
20 | Configuration conf = context.getConfiguration();
21 | minCount = conf.getInt("thrax.min-phrase-count", 3);
22 | slsh = CommonLSH.getSLSH(conf);
23 | }
24 |
25 | protected void reduce(Text key, Iterable<ContextWritable> values, Context context)
26 | throws IOException, InterruptedException {
27 | ContextWritable reduced = new ContextWritable();
28 | for (ContextWritable input : values) {
29 | reduced.merge(input, slsh);
30 | }
31 | if (!reduced.compacted.get()) reduced.compact(slsh);
32 | if (reduced.strength.get() >= minCount) {
33 | Signature reduced_signature = new Signature();
34 | // TODO: double-check need for deep copy?
35 | reduced_signature.sums = reduced.sums;
36 | slsh.buildSignature(reduced_signature, false);
37 | context.write(new SignatureWritable(key, reduced_signature, reduced.strength.get()),
38 | NullWritable.get());
39 | }
40 | return;
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/distributional/SignatureWritable.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.distributional;
2 |
3 | import java.io.DataInput;
4 | import java.io.DataOutput;
5 | import java.io.IOException;
6 |
7 | import org.apache.hadoop.io.IntWritable;
8 | import org.apache.hadoop.io.Text;
9 | import org.apache.hadoop.io.Writable;
10 | import org.apache.hadoop.io.WritableComparable;
11 | import org.apache.hadoop.mapreduce.Partitioner;
12 |
13 | import edu.jhu.jerboa.sim.Signature;
14 | import edu.jhu.thrax.hadoop.datatypes.PrimitiveUtils;
15 |
16 | public class SignatureWritable implements WritableComparable<SignatureWritable> {
17 | public Text key;
18 | public byte[] bytes;
19 | public IntWritable strength;
20 |
21 | public SignatureWritable() {
22 | this.key = new Text();
23 | this.bytes = null;
24 | this.strength = new IntWritable();
25 | }
26 |
27 | public SignatureWritable(Text key, Signature signature, int strength) {
28 | this.key = new Text(key);
29 | // TODO: deep copy?
30 | this.bytes = signature.bytes;
31 | this.strength = new IntWritable(strength);
32 | }
33 |
34 | @Override
35 | public void readFields(DataInput in) throws IOException {
36 | key.readFields(in);
37 | bytes = PrimitiveUtils.readByteArray(in);
38 | strength.readFields(in);
39 | }
40 |
41 | @Override
42 | public void write(DataOutput out) throws IOException {
43 | key.write(out);
44 | PrimitiveUtils.writeByteArray(out, bytes);
45 | strength.write(out);
46 | }
47 |
48 | @Override
49 | public int compareTo(SignatureWritable that) {
50 | int cmp = strength.compareTo(that.strength);
51 | // Flip sign for descending sort order.
52 | if (cmp != 0) return -cmp;
53 | return key.compareTo(that.key);
54 | }
55 |
56 | public static class SignaturePartitioner extends Partitioner<SignatureWritable, Writable> {
57 | public int getPartition(SignatureWritable signature, Writable value, int num_partitions) {
58 | int hash = 163;
59 | hash = 37 * hash + signature.key.hashCode();
60 | hash = 37 * hash + signature.bytes.hashCode();
61 | hash = 37 * hash + signature.strength.hashCode();
62 | return (hash & Integer.MAX_VALUE) % num_partitions;
63 | }
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/extraction/ExtractionCombiner.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.extraction;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.mapreduce.Reducer;
6 |
7 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable;
8 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
9 |
10 | public class ExtractionCombiner extends Reducer<AlignedRuleWritable, Annotation, AlignedRuleWritable, Annotation> {
11 |
12 | protected void reduce(AlignedRuleWritable key, Iterable<Annotation> values, Context context)
13 | throws IOException, InterruptedException {
14 | context.progress();
15 | Annotation merged = new Annotation();
16 | for (Annotation a : values) merged.merge(a);
17 | context.write(key, merged);
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/extraction/ExtractionMapper.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.extraction;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.io.LongWritable;
7 | import org.apache.hadoop.io.Text;
8 | import org.apache.hadoop.mapreduce.Mapper;
9 |
10 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable;
11 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
12 | import edu.jhu.thrax.util.Vocabulary;
13 |
14 | public class ExtractionMapper extends Mapper<LongWritable, Text, AlignedRuleWritable, Annotation> {
15 | private RuleWritableExtractor extractor;
16 |
17 | protected void setup(Context context) throws IOException, InterruptedException {
18 | Configuration conf = context.getConfiguration();
19 | String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*";
20 | Vocabulary.initialize(conf, vocabulary_path);
21 |
22 | // TODO: static initializer call for what Annotation actually carries would go here.
23 | extractor = RuleWritableExtractorFactory.create(context);
24 | if (extractor == null) {
25 | System.err.println("WARNING: could not create rule extractor as configured!");
26 | }
27 | }
28 |
29 | protected void map(LongWritable key, Text value, Context context) throws IOException,
30 | InterruptedException {
31 | if (extractor == null) return;
32 | for (AnnotatedRule ar : extractor.extract(value))
33 | context.write(new AlignedRuleWritable(ar.rule, ar.f2e), ar.annotation);
34 | context.progress();
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/extraction/ExtractionReducer.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.extraction;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.mapreduce.Reducer;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable;
9 | import edu.jhu.thrax.hadoop.datatypes.AlignmentWritable;
10 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
12 | import edu.jhu.thrax.util.Vocabulary;
13 |
14 | public class ExtractionReducer
15 | extends Reducer<AlignedRuleWritable, Annotation, RuleWritable, Annotation> {
16 |
17 | private RuleWritable currentRule = null;
18 | private Annotation currentAnnotation = null;
19 | private AlignmentWritable maxAlignment = null;
20 | private int alignmentCount;
21 |
22 | private int minCount;
23 |
24 | protected void setup(Context context) throws IOException, InterruptedException {
25 | Configuration conf = context.getConfiguration();
26 | String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*";
27 | Vocabulary.initialize(conf, vocabulary_path);
28 | minCount = conf.getInt("thrax.min-rule-count", 1);
29 | }
30 |
31 | protected void reduce(AlignedRuleWritable key, Iterable<Annotation> values, Context context)
32 | throws IOException, InterruptedException {
33 | RuleWritable rule = key.getRule();
34 | AlignmentWritable alignment = key.getAlignment();
35 |
36 | Annotation merged = new Annotation();
37 | for (Annotation a : values)
38 | merged.merge(a);
39 |
40 | if (!rule.equals(currentRule)) {
41 | if (currentRule != null
42 | && (currentAnnotation.count() >= minCount || isUnigramRule(currentRule))) {
43 | currentAnnotation.setAlignment(maxAlignment);
44 | context.write(currentRule, currentAnnotation);
45 | context.progress();
46 | }
47 | currentRule = new RuleWritable(rule);
48 | currentAnnotation = new Annotation();
49 | alignmentCount = 0;
50 | maxAlignment = null;
51 | }
52 | currentAnnotation.merge(merged);
53 | if (alignmentCount < merged.count()) {
54 | maxAlignment = new AlignmentWritable(alignment);
55 | alignmentCount = merged.count();
56 | }
57 | }
58 |
59 | protected void cleanup(Context context) throws IOException, InterruptedException {
60 | if (currentRule != null) {
61 | if (currentAnnotation.count() >= minCount || isUnigramRule(currentRule)) {
62 | currentAnnotation.setAlignment(maxAlignment);
63 | context.write(currentRule, currentAnnotation);
64 | context.progress();
65 | }
66 | }
67 | }
68 |
69 | private static boolean isUnigramRule(RuleWritable rule) {
70 | if (rule.source.length == 1) return !Vocabulary.nt(rule.source[0]);
71 | return rule.target.length == 1 && !Vocabulary.nt(rule.target[0]);
72 | }
73 | }
74 |
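A note on the control flow in ExtractionReducer: each reduce() call covers a single (rule, alignment) key, so consecutive calls for the same rule are collapsed by hand. currentAnnotation accumulates the counts over all alignments of the current rule, maxAlignment keeps the most frequently observed alignment as the rule's representative, and cleanup() flushes the last rule once the input is exhausted. This only works if the extraction job partitions and sorts AlignedRuleWritable keys so that every alignment of a given rule reaches the same reducer consecutively; that job configuration is not part of this section and is assumed here.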
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/extraction/RuleWritableExtractor.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.extraction;
2 |
3 | import org.apache.hadoop.io.Text;
4 |
5 | import edu.jhu.thrax.hadoop.datatypes.AlignmentWritable;
6 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
7 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
8 |
9 | public interface RuleWritableExtractor {
10 | public Iterable<AnnotatedRule> extract(Text line);
11 | }
12 |
13 |
14 | class AnnotatedRule {
15 | public RuleWritable rule = null;
16 | public AlignmentWritable f2e = null;
17 | public Annotation annotation = null;
18 |
19 | public AnnotatedRule(RuleWritable r) {
20 | rule = r;
21 | }
22 |
23 | public AnnotatedRule(RuleWritable r, AlignmentWritable f2e, Annotation a) {
24 | this.rule = r;
25 | this.f2e = f2e;
26 | this.annotation = a;
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/extraction/RuleWritableExtractorFactory.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.extraction;
2 |
3 | import org.apache.hadoop.io.LongWritable;
4 | import org.apache.hadoop.io.Text;
5 | import org.apache.hadoop.mapreduce.Mapper;
6 |
7 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable;
8 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
9 |
10 | public class RuleWritableExtractorFactory {
11 | public static RuleWritableExtractor create(
12 | Mapper<LongWritable, Text, AlignedRuleWritable, Annotation>.Context context) {
13 | return new HierarchicalRuleWritableExtractor(context);
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/AbstractnessFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class AbstractnessFeature implements SimpleFeature {
12 |
13 | public static final String NAME = "abstract";
14 |
15 | private static final IntWritable ZERO = new IntWritable(0);
16 | private static final IntWritable ONE = new IntWritable(1);
17 |
18 | public Writable score(RuleWritable r) {
19 | for (int word : r.source) {
20 | if (!Vocabulary.nt(word)) {
21 | return ZERO;
22 | }
23 | }
24 | for (int word : r.target) {
25 | if (!Vocabulary.nt(word)) {
26 | return ZERO;
27 | }
28 | }
29 | return ONE;
30 | }
31 |
32 | public String getName() {
33 | return NAME;
34 | }
35 |
36 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
37 | map.put(Vocabulary.id(NAME), ONE);
38 | }
39 |
40 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
41 | map.put(Vocabulary.id(NAME), ONE);
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/AdjacentNonTerminalsFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class AdjacentNonTerminalsFeature implements SimpleFeature {
12 |
13 | public static final String NAME = "adjacent";
14 |
15 | private static final IntWritable ZERO = new IntWritable(0);
16 | private static final IntWritable ONE = new IntWritable(1);
17 |
18 | public Writable score(RuleWritable r) {
19 | for (int i = 0; i < r.source.length - 1; ++i)
20 | if (Vocabulary.nt(r.source[i])) {
21 | if (Vocabulary.nt(r.source[i + 1])) {
22 | return ONE;
23 | } else {
24 | i += 2;
25 | continue;
26 | }
27 | }
28 | return ZERO;
29 | }
30 |
31 | public String getName() {
32 | return NAME;
33 | }
34 |
35 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
36 | map.put(Vocabulary.id(NAME), ZERO);
37 | }
38 |
39 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
40 | map.put(Vocabulary.id(NAME), ONE);
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/CharacterCompressionRatioFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.FloatWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class CharacterCompressionRatioFeature implements SimpleFeature {
12 |
13 | private static final FloatWritable ZERO = new FloatWritable(0f);
14 |
15 | public static final String NAME = "char_cr";
16 |
17 | public Writable score(RuleWritable r) {
18 | int src_length = 0;
19 | for (int tok : r.source) {
20 | if (!Vocabulary.nt(tok)) {
21 | src_length += Vocabulary.word(tok).length();
22 | }
23 | }
24 | src_length += r.source.length - 1;
25 |
26 | int tgt_length = 0;
27 | for (int tok : r.target) {
28 | if (!Vocabulary.nt(tok)) {
29 | tgt_length += Vocabulary.word(tok).length();
30 | }
31 | }
32 | tgt_length += r.target.length - 1;
33 |
34 | if (src_length == 0 || tgt_length == 0)
35 | return ZERO;
36 | else
37 | return new FloatWritable((float) Math.log((float) tgt_length / src_length));
38 | }
39 |
40 | public String getName() {
41 | return NAME;
42 | }
43 |
44 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
45 | map.put(Vocabulary.id(NAME), ZERO);
46 | }
47 |
48 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
49 | map.put(Vocabulary.id(NAME), ZERO);
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/CharacterCountDifferenceFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class CharacterCountDifferenceFeature implements SimpleFeature {
12 |
13 | private static final IntWritable ZERO = new IntWritable(0);
14 |
15 | public static final String NAME = "char_count_difference";
16 |
17 | public Writable score(RuleWritable r) {
18 | int char_difference = 0;
19 | for (int tok : r.source) {
20 | if (!Vocabulary.nt(tok)) {
21 | char_difference -= Vocabulary.word(tok).length();
22 | }
23 | }
24 | char_difference -= r.source.length - 1;
25 |
26 | for (int tok : r.target) {
27 | if (!Vocabulary.nt(tok)) {
28 | char_difference += Vocabulary.word(tok).length();
29 | }
30 | }
31 | char_difference += r.target.length - 1;
32 | return new IntWritable(char_difference);
33 | }
34 |
35 | public String getName() {
36 | return NAME;
37 | }
38 |
39 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
40 | map.put(Vocabulary.id(NAME), ZERO);
41 | }
42 |
43 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
44 | map.put(Vocabulary.id(NAME), ZERO);
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/ConsumeSourceTerminalsFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class ConsumeSourceTerminalsFeature implements SimpleFeature {
12 |
13 | public static final String NAME = "source_terminals_without_target";
14 |
15 | private static final IntWritable ZERO = new IntWritable(0);
16 | private static final IntWritable ONE = new IntWritable(1);
17 |
18 | public Writable score(RuleWritable r) {
19 | for (int tok : r.target) {
20 | if (!Vocabulary.nt(tok)) {
21 | return ZERO;
22 | }
23 | }
24 | for (int tok : r.source) {
25 | if (!Vocabulary.nt(tok)) {
26 | return ONE;
27 | }
28 | }
29 | return ZERO;
30 | }
31 |
32 | public String getName() {
33 | return NAME;
34 | }
35 |
36 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
37 | map.put(Vocabulary.id(NAME), ZERO);
38 | }
39 |
40 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
41 | map.put(Vocabulary.id(NAME), ZERO);
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/Feature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.Writable;
6 |
7 | public interface Feature {
8 |
9 | public String getName();
10 |
11 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map);
12 |
13 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map);
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/GlueRuleFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class GlueRuleFeature implements SimpleFeature {
12 |
13 | public static final String NAME = "glue_rule";
14 |
15 | private static final IntWritable ZERO = new IntWritable(0);
16 | private static final IntWritable ONE = new IntWritable(1);
17 |
18 | public Writable score(RuleWritable r) {
19 | return ZERO;
20 | }
21 |
22 | public String getName() {
23 | return NAME;
24 | }
25 |
26 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
27 | map.put(Vocabulary.id(NAME), ONE);
28 | }
29 |
30 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
31 | map.put(Vocabulary.id(NAME), ONE);
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/IdentityFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Arrays;
4 | import java.util.Map;
5 |
6 | import org.apache.hadoop.io.IntWritable;
7 | import org.apache.hadoop.io.Writable;
8 |
9 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
10 | import edu.jhu.thrax.util.Vocabulary;
11 |
12 | public class IdentityFeature implements SimpleFeature {
13 |
14 | public static final String NAME = "identity";
15 |
16 | private static final IntWritable ZERO = new IntWritable(0);
17 | private static final IntWritable ONE = new IntWritable(1);
18 |
19 | public Writable score(RuleWritable r) {
20 | if (r.monotone && Arrays.equals(r.target, r.source))
21 | return ONE;
22 | else
23 | return ZERO;
24 | }
25 |
26 | public String getName() {
27 | return NAME;
28 | }
29 |
30 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
31 | map.put(Vocabulary.id(NAME), ZERO);
32 | }
33 |
34 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
35 | map.put(Vocabulary.id(NAME), ZERO);
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/LexicalityFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class LexicalityFeature implements SimpleFeature {
12 |
13 | public static final String NAME = "lexical";
14 |
15 | private static final IntWritable ZERO = new IntWritable(0);
16 | private static final IntWritable ONE = new IntWritable(1);
17 |
18 | public Writable score(RuleWritable r) {
19 | for (int tok : r.source)
20 | if (Vocabulary.nt(tok)) return ZERO;
21 | for (int tok : r.target)
22 | if (Vocabulary.nt(tok)) return ZERO;
23 | return ONE;
24 | }
25 |
26 | public String getName() {
27 | return NAME;
28 | }
29 |
30 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
31 | map.put(Vocabulary.id(NAME), ZERO);
32 | }
33 |
34 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
35 | map.put(Vocabulary.id(NAME), ZERO);
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/MonotonicFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class MonotonicFeature implements SimpleFeature {
12 |
13 | public static final String NAME = "monotonic";
14 |
15 | private static final IntWritable ZERO = new IntWritable(0);
16 | private static final IntWritable ONE = new IntWritable(1);
17 |
18 | public Writable score(RuleWritable r) {
19 | return (r.monotone ? ONE : ZERO);
20 | }
21 |
22 | public String getName() {
23 | return NAME;
24 | }
25 |
26 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
27 | map.put(Vocabulary.id(NAME), ONE);
28 | }
29 |
30 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
31 | map.put(Vocabulary.id(NAME), ONE);
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/PhrasePenaltyFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class PhrasePenaltyFeature implements SimpleFeature {
12 |
13 | public static final String NAME = "phrase_penalty";
14 |
15 | private static final IntWritable ONE = new IntWritable(1);
16 |
17 | public Writable score(RuleWritable r) {
18 | return ONE;
19 | }
20 |
21 | public String getName() {
22 | return NAME;
23 | }
24 |
25 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
26 | map.put(Vocabulary.id(NAME), ONE);
27 | }
28 |
29 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
30 | map.put(Vocabulary.id(NAME), ONE);
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/ProduceTargetTerminalsFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class ProduceTargetTerminalsFeature implements SimpleFeature {
12 |
13 | public static final String NAME = "target_terminals_without_source";
14 |
15 | private static final IntWritable ZERO = new IntWritable(0);
16 | private static final IntWritable ONE = new IntWritable(1);
17 |
18 | public Writable score(RuleWritable r) {
19 | for (int tok : r.source)
20 | if (!Vocabulary.nt(tok)) return ZERO;
21 | for (int tok : r.target)
22 | if (!Vocabulary.nt(tok)) return ONE;
23 | return ZERO;
24 | }
25 |
26 | public String getName() {
27 | return NAME;
28 | }
29 |
30 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
31 | map.put(Vocabulary.id(NAME), ZERO);
32 | }
33 |
34 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
35 | map.put(Vocabulary.id(NAME), ZERO);
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/SimpleFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import org.apache.hadoop.io.Writable;
4 |
5 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
6 |
7 | public interface SimpleFeature extends Feature {
8 |
9 | public Writable score(RuleWritable r);
10 |
11 | }
12 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/SimpleFeatureFactory.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import edu.jhu.thrax.util.FormatUtils;
7 |
8 | public class SimpleFeatureFactory {
9 |
10 | public static SimpleFeature get(String name) {
11 | if (name.equals(AbstractnessFeature.NAME))
12 | return new AbstractnessFeature();
13 | else if (name.equals(AdjacentNonTerminalsFeature.NAME))
14 | return new AdjacentNonTerminalsFeature();
15 | else if (name.equals(LexicalityFeature.NAME))
16 | return new LexicalityFeature();
17 | else if (name.equals(XRuleFeature.NAME))
18 | return new XRuleFeature();
19 | else if (name.equals(MonotonicFeature.NAME))
20 | return new MonotonicFeature();
21 | else if (name.equals(PhrasePenaltyFeature.NAME))
22 | return new PhrasePenaltyFeature();
23 | else if (name.equals(SourceWordCounterFeature.NAME))
24 | return new SourceWordCounterFeature();
25 | else if (name.equals(TargetWordCounterFeature.NAME))
26 | return new TargetWordCounterFeature();
27 | else if (name.equals(ConsumeSourceTerminalsFeature.NAME))
28 | return new ConsumeSourceTerminalsFeature();
29 | else if (name.equals(ProduceTargetTerminalsFeature.NAME))
30 | return new ProduceTargetTerminalsFeature();
31 | else if (name.equals(IdentityFeature.NAME))
32 | return new IdentityFeature();
33 | else if (name.equals(WordCountDifferenceFeature.NAME))
34 | return new WordCountDifferenceFeature();
35 | else if (name.equals(WordLengthDifferenceFeature.NAME))
36 | return new WordLengthDifferenceFeature();
37 | else if (name.equals(WordCompressionRatioFeature.NAME))
38 | return new WordCompressionRatioFeature();
39 | else if (name.equals(CharacterCountDifferenceFeature.NAME))
40 | return new CharacterCountDifferenceFeature();
41 | else if (name.equals(CharacterCompressionRatioFeature.NAME))
42 | return new CharacterCompressionRatioFeature();
43 | else if (name.equals(GlueRuleFeature.NAME)) return new GlueRuleFeature();
44 |
45 | return null;
46 | }
47 |
48 | public static List<SimpleFeature> getAll(String names) {
49 | String[] feature_names = FormatUtils.P_COMMA_OR_SPACE.split(names);
50 | List<SimpleFeature> features = new ArrayList<SimpleFeature>();
51 |
52 | for (String feature_name : feature_names) {
53 | SimpleFeature feature = get(feature_name);
54 | if (feature != null) features.add(feature);
55 | }
56 | return features;
57 | }
58 | }
59 |
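SimpleFeatureFactory maps the comma- or whitespace-separated feature names from the Thrax configuration onto the implementations above, silently skipping unknown names. A small illustrative fragment (the names are the NAME constants defined in this package):

    // "abstract lexical monotonic" yields AbstractnessFeature, LexicalityFeature, MonotonicFeature.
    List<SimpleFeature> features = SimpleFeatureFactory.getAll("abstract lexical monotonic");
    for (SimpleFeature feature : features)
      System.out.println(feature.getName());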
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/SourceWordCounterFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class SourceWordCounterFeature implements SimpleFeature {
12 |
13 | public static final String NAME = "source_word_count";
14 |
15 | private static final IntWritable ZERO = new IntWritable(0);
16 |
17 | public Writable score(RuleWritable r) {
18 | int words = 0;
19 | for (int word : r.source)
20 | if (!Vocabulary.nt(word)) words++;
21 | return new IntWritable(words);
22 | }
23 |
24 | public String getName() {
25 | return NAME;
26 | }
27 |
28 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
29 | map.put(Vocabulary.id(NAME), ZERO);
30 | }
31 |
32 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
33 | map.put(Vocabulary.id(NAME), ZERO);
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/TargetWordCounterFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class TargetWordCounterFeature implements SimpleFeature {
12 |
13 | public static final String NAME = "target_word_count";
14 |
15 | private static final IntWritable ZERO = new IntWritable(0);
16 |
17 | public Writable score(RuleWritable r) {
18 | int words = 0;
19 | for (int tok : r.target)
20 | if (!Vocabulary.nt(tok)) words++;
21 | return new IntWritable(words);
22 | }
23 |
24 | public String getName() {
25 | return NAME;
26 | }
27 |
28 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
29 | map.put(Vocabulary.id(NAME), ZERO);
30 | }
31 |
32 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
33 | map.put(Vocabulary.id(NAME), ZERO);
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/WordCompressionRatioFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.FloatWritable;
6 | import org.apache.hadoop.io.IntWritable;
7 | import org.apache.hadoop.io.Writable;
8 |
9 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
10 | import edu.jhu.thrax.util.Vocabulary;
11 |
12 | public class WordCompressionRatioFeature implements SimpleFeature {
13 |
14 | public static final String NAME = "word_cr";
15 |
16 | private static final IntWritable ZERO = new IntWritable(0);
17 |
18 | public Writable score(RuleWritable r) {
19 | int src_count = 0;
20 | for (int tok : r.source)
21 | if (!Vocabulary.nt(tok)) src_count++;
22 | int tgt_count = 0;
23 | for (int tok : r.target)
24 | if (!Vocabulary.nt(tok)) tgt_count++;
25 | if (src_count == 0 || tgt_count == 0) {
26 | return ZERO;
27 | } else {
28 | return new FloatWritable((float) Math.log((float) tgt_count / src_count));
29 | }
30 | }
31 |
32 | public String getName() {
33 | return NAME;
34 | }
35 |
36 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
37 | map.put(Vocabulary.id(NAME), ZERO);
38 | }
39 |
40 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
41 | map.put(Vocabulary.id(NAME), ZERO);
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/WordCountDifferenceFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class WordCountDifferenceFeature implements SimpleFeature {
12 |
13 | public static final String NAME = "word_count_difference";
14 |
15 | private static final IntWritable ZERO = new IntWritable(0);
16 |
17 | public Writable score(RuleWritable r) {
18 | int word_difference = 0;
19 | for (int tok : r.source)
20 | if (!Vocabulary.nt(tok)) word_difference--;
21 | for (int tok : r.target)
22 | if (!Vocabulary.nt(tok)) word_difference++;
23 | return new IntWritable(word_difference);
24 | }
25 |
26 | public String getName() {
27 | return NAME;
28 | }
29 |
30 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
31 | map.put(Vocabulary.id(NAME), ZERO);
32 | }
33 |
34 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
35 | map.put(Vocabulary.id(NAME), ZERO);
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/WordLengthDifferenceFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.FloatWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class WordLengthDifferenceFeature implements SimpleFeature {
12 |
13 | public static final String NAME = "word_length_difference";
14 |
15 | private static final FloatWritable ZERO = new FloatWritable(0);
16 |
17 | public Writable score(RuleWritable r) {
18 | int src_length = 0;
19 | int src_count = 0;
20 | for (int tok : r.source) {
21 | if (!Vocabulary.nt(tok)) {
22 | src_length += Vocabulary.word(tok).length();
23 | src_count++;
24 | }
25 | }
26 | int tgt_length = 0;
27 | int tgt_count = 0;
28 | for (int tok : r.target) {
29 | if (!Vocabulary.nt(tok)) {
30 | tgt_length += Vocabulary.word(tok).length();
31 | tgt_count++;
32 | }
33 | }
34 | if (src_count == 0 || tgt_count == 0) {
35 | return ZERO;
36 | } else {
37 | float avg_src_length = (float) src_length / src_count;
38 | float avg_tgt_length = (float) tgt_length / tgt_count;
39 | return new FloatWritable(avg_tgt_length - avg_src_length);
40 | }
41 | }
42 |
43 | public String getName() {
44 | return NAME;
45 | }
46 |
47 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
48 | map.put(Vocabulary.id(NAME), ZERO);
49 | }
50 |
51 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
52 | map.put(Vocabulary.id(NAME), ZERO);
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/XRuleFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public class XRuleFeature implements SimpleFeature {
12 |
13 | public static final String NAME = "x_rule";
14 |
15 | private static final IntWritable ZERO = new IntWritable(0);
16 | private static final IntWritable ONE = new IntWritable(1);
17 |
18 | // TODO: should be default nonterminal and not explicitly X.
19 | private final int PATTERN = Vocabulary.id("[X]");
20 |
21 | public Writable score(RuleWritable r) {
22 | return (r.lhs == PATTERN ? ONE : ZERO);
23 | }
24 |
25 | public String getName() {
26 | return NAME;
27 | }
28 |
29 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
30 | map.put(Vocabulary.id(NAME), ZERO);
31 | }
32 |
33 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
34 | map.put(Vocabulary.id(NAME), ZERO);
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/annotation/AlignmentFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.annotation;
2 |
3 | import java.io.IOException;
4 | import java.util.Map;
5 | import java.util.Set;
6 |
7 | import org.apache.hadoop.io.IntWritable;
8 | import org.apache.hadoop.io.Writable;
9 | import org.apache.hadoop.mapreduce.Reducer.Context;
10 |
11 | import edu.jhu.thrax.hadoop.datatypes.AlignmentWritable;
12 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
13 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
14 | import edu.jhu.thrax.hadoop.jobs.ThraxJob;
15 | import edu.jhu.thrax.util.Vocabulary;
16 |
17 | @SuppressWarnings("rawtypes")
18 | public class AlignmentFeature implements AnnotationFeature {
19 |
20 | public static final String NAME = "alignment";
21 |
22 | private static final IntWritable ZERO = new IntWritable(0);
23 |
24 | public String getName() {
25 | return NAME;
26 | }
27 |
28 | public AlignmentWritable score(RuleWritable r, Annotation annotation) {
29 | return annotation.f2e();
30 | }
31 |
32 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
33 | map.put(Vocabulary.id(NAME), ZERO);
34 | }
35 |
36 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
37 | map.put(Vocabulary.id(NAME), ZERO);
38 | }
39 |
40 | @Override
41 | public void init(Context context) throws IOException, InterruptedException {}
42 |
43 | @Override
44 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
45 | return null;
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/annotation/AnnotationFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.annotation;
2 |
3 | import java.io.IOException;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.io.Writable;
7 | import org.apache.hadoop.mapreduce.Reducer.Context;
8 |
9 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
10 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
11 | import edu.jhu.thrax.hadoop.features.Feature;
12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob;
13 |
14 | public interface AnnotationFeature extends Feature {
15 |
16 | @SuppressWarnings("rawtypes")
17 | public void init(Context context) throws IOException, InterruptedException;
18 |
19 | public Writable score(RuleWritable r, Annotation annotation);
20 |
21 | // TODO: move this into its own interface, have AF extend it.
22 | public Set<Class<? extends ThraxJob>> getPrerequisites();
23 | }
24 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/annotation/AnnotationFeatureFactory.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.annotation;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import edu.jhu.thrax.util.FormatUtils;
7 |
8 | public class AnnotationFeatureFactory {
9 |
10 | public static AnnotationFeature get(String name) {
11 | if (name.equals(UnalignedSourceCounterFeature.NAME))
12 | return new UnalignedSourceCounterFeature();
13 | else if (name.equals(UnalignedTargetCounterFeature.NAME))
14 | return new UnalignedTargetCounterFeature();
15 | else if (name.equals(RarityPenaltyFeature.NAME))
16 | return new RarityPenaltyFeature();
17 | else if (name.equals(CountFeature.NAME))
18 | return new CountFeature();
19 | else if (name.equals(LogCountFeature.NAME))
20 | return new LogCountFeature();
21 | else if (name.equals(SourceGivenTargetLexicalProbabilityFeature.NAME))
22 | return new SourceGivenTargetLexicalProbabilityFeature();
23 | else if (name.equals(TargetGivenSourceLexicalProbabilityFeature.NAME))
24 | return new TargetGivenSourceLexicalProbabilityFeature();
25 | else if (name.equals(AlignmentFeature.NAME))
26 | return new AlignmentFeature();
27 |
28 | return null;
29 | }
30 |
31 | public static List<AnnotationFeature> getAll(String names) {
32 | String[] feature_names = FormatUtils.P_COMMA_OR_SPACE.split(names);
33 | List<AnnotationFeature> features = new ArrayList<AnnotationFeature>();
34 |
35 | for (String feature_name : feature_names) {
36 | AnnotationFeature feature = get(feature_name);
37 | if (feature != null) features.add(feature);
38 | }
39 | return features;
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/annotation/AnnotationFeatureJob.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.annotation;
2 |
3 | import java.io.IOException;
4 | import java.util.HashSet;
5 | import java.util.Set;
6 |
7 | import org.apache.hadoop.conf.Configuration;
8 | import org.apache.hadoop.fs.Path;
9 | import org.apache.hadoop.mapreduce.Job;
10 | import org.apache.hadoop.mapreduce.Mapper;
11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
12 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
14 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
15 |
16 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
17 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair;
18 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
19 | import edu.jhu.thrax.hadoop.jobs.DefaultValues;
20 | import edu.jhu.thrax.hadoop.jobs.ExtractionJob;
21 | import edu.jhu.thrax.hadoop.jobs.ThraxJob;
22 |
23 | public class AnnotationFeatureJob implements ThraxJob {
24 |
25 | public AnnotationFeatureJob() {}
26 |
27 | protected static HashSet<Class<? extends ThraxJob>> prereqs =
28 | new HashSet<Class<? extends ThraxJob>>();
29 |
30 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
31 | prereqs.add(ExtractionJob.class);
32 | return prereqs;
33 | }
34 |
35 | public static void addPrerequisites(Iterable<Class<? extends ThraxJob>> cs) {
36 | if (cs != null)
37 | for (Class<? extends ThraxJob> c : cs)
38 | prereqs.add(c);
39 | }
40 |
41 | public static void addPrerequisite(Class<? extends ThraxJob> c) {
42 | prereqs.add(c);
43 | }
44 |
45 | public String getOutputSuffix() {
46 | return getName();
47 | }
48 |
49 | public Job getJob(Configuration conf) throws IOException {
50 | String name = getName();
51 | Job job = new Job(conf, name);
52 | job.setJarByClass(this.getClass());
53 |
54 | job.setMapperClass(Mapper.class);
55 | job.setPartitionerClass(RuleWritable.YieldPartitioner.class);
56 | job.setReducerClass(AnnotationReducer.class);
57 |
58 | job.setInputFormatClass(SequenceFileInputFormat.class);
59 | job.setMapOutputKeyClass(RuleWritable.class);
60 | job.setMapOutputValueClass(Annotation.class);
61 | job.setOutputKeyClass(RuleWritable.class);
62 | job.setOutputValueClass(FeaturePair.class);
63 | job.setOutputFormatClass(SequenceFileOutputFormat.class);
64 |
65 | int num_reducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS));
66 | job.setNumReduceTasks(num_reducers);
67 |
68 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "rules"));
69 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "annotation"));
70 | return job;
71 | }
72 |
73 | @Override
74 | public String getName() {
75 | return "annotation";
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/annotation/AnnotationPassthroughFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.annotation;
2 |
3 | import java.io.IOException;
4 | import java.util.Map;
5 | import java.util.Set;
6 |
7 | import org.apache.hadoop.io.Writable;
8 | import org.apache.hadoop.mapreduce.Reducer.Context;
9 |
10 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob;
13 |
14 | @SuppressWarnings("rawtypes")
15 | public class AnnotationPassthroughFeature implements AnnotationFeature {
16 |
17 | public static final String NAME = "annotation";
18 |
19 | public String getName() {
20 | return NAME;
21 | }
22 |
23 | public Annotation score(RuleWritable r, Annotation annotation) {
24 | return annotation;
25 | }
26 |
27 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
28 | }
29 |
30 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
31 | }
32 |
33 | @Override
34 | public void init(Context context) throws IOException, InterruptedException {}
35 |
36 | @Override
37 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
38 | return null;
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/annotation/AnnotationReducer.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.annotation;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.List;
6 |
7 | import org.apache.hadoop.conf.Configuration;
8 | import org.apache.hadoop.mapreduce.Reducer;
9 |
10 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
11 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair;
12 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
13 | import edu.jhu.thrax.util.BackwardsCompatibility;
14 | import edu.jhu.thrax.util.Vocabulary;
15 |
16 | public class AnnotationReducer extends Reducer<RuleWritable, Annotation, RuleWritable, FeaturePair> {
17 |
18 | private List<AnnotationFeature> annotationFeatures;
19 |
20 | public AnnotationReducer() {}
21 |
22 | protected void setup(Context context) throws IOException, InterruptedException {
23 | Configuration conf = context.getConfiguration();
24 | String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*";
25 | Vocabulary.initialize(conf, vocabulary_path);
26 |
27 | String features = BackwardsCompatibility.equivalent(conf.get("thrax.features", ""));
28 |
29 | // Paraphrasing only needs the annotation to be passed through.
30 | String type = conf.get("thrax.type", "translation");
31 | if ("paraphrasing".equals(type)) {
32 | annotationFeatures = new ArrayList<AnnotationFeature>();
33 | annotationFeatures.add(new AnnotationPassthroughFeature());
34 | } else {
35 | annotationFeatures = AnnotationFeatureFactory.getAll(features);
36 | }
37 |
38 | for (AnnotationFeature af : annotationFeatures)
39 | af.init(context);
40 | }
41 |
42 | protected void reduce(RuleWritable key, Iterable<Annotation> values, Context context)
43 | throws IOException, InterruptedException {
44 | for (Annotation annotation : values) {
45 | for (AnnotationFeature f : annotationFeatures) {
46 | context.write(key, new FeaturePair(Vocabulary.id(f.getName()), f.score(key, annotation)));
47 | }
48 | }
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/annotation/CountFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.annotation;
2 |
3 | import java.util.Map;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.io.IntWritable;
7 | import org.apache.hadoop.io.Writable;
8 | import org.apache.hadoop.mapreduce.Reducer.Context;
9 |
10 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob;
13 | import edu.jhu.thrax.util.Vocabulary;
14 |
15 | @SuppressWarnings("rawtypes")
16 | public class CountFeature implements AnnotationFeature {
17 |
18 | public static final String NAME = "count";
19 |
20 | private static final IntWritable ZERO = new IntWritable(0);
21 |
22 | public String getName() {
23 | return NAME;
24 | }
25 |
26 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
27 | map.put(Vocabulary.id(NAME), ZERO);
28 | }
29 |
30 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
31 | map.put(Vocabulary.id(NAME), ZERO);
32 | }
33 |
34 | @Override
35 | public Writable score(RuleWritable r, Annotation annotation) {
36 | return new IntWritable(annotation.count());
37 | }
38 |
39 | @Override
40 | public void init(Context context) {}
41 |
42 | @Override
43 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
44 | return null;
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/annotation/LogCountFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.annotation;
2 |
3 | import java.util.Map;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.io.FloatWritable;
7 | import org.apache.hadoop.io.Writable;
8 | import org.apache.hadoop.mapreduce.Reducer.Context;
9 |
10 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob;
13 | import edu.jhu.thrax.util.Vocabulary;
14 |
15 | @SuppressWarnings("rawtypes")
16 | public class LogCountFeature implements AnnotationFeature {
17 |
18 | public static final String NAME = "logcount";
19 |
20 | private static final FloatWritable ZERO = new FloatWritable(0);
21 |
22 | public String getName() {
23 | return NAME;
24 | }
25 |
26 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
27 | map.put(Vocabulary.id(NAME), ZERO);
28 | }
29 |
30 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
31 | map.put(Vocabulary.id(NAME), ZERO);
32 | }
33 |
34 | @Override
35 | public Writable score(RuleWritable r, Annotation annotation) {
36 | return new FloatWritable((float) Math.log(annotation.count()));
37 | }
38 |
39 | @Override
40 | public void init(Context context) {}
41 |
42 | @Override
43 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
44 | return null;
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/annotation/RarityPenaltyFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.annotation;
2 |
3 | import java.util.Map;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.io.FloatWritable;
7 | import org.apache.hadoop.io.Writable;
8 | import org.apache.hadoop.mapreduce.Reducer.Context;
9 |
10 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
12 | import edu.jhu.thrax.hadoop.jobs.ThraxJob;
13 | import edu.jhu.thrax.util.Vocabulary;
14 |
15 | @SuppressWarnings("rawtypes")
16 | public class RarityPenaltyFeature implements AnnotationFeature {
17 |
18 | public static final String NAME = "rarity";
19 |
20 | private static final FloatWritable ZERO = new FloatWritable(0.0f);
21 |
22 | public String getName() {
23 | return NAME;
24 | }
25 |
26 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
27 | map.put(Vocabulary.id(NAME), ZERO);
28 | }
29 |
30 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
31 | map.put(Vocabulary.id(NAME), ZERO);
32 | }
33 |
34 | @Override
35 | public Writable score(RuleWritable r, Annotation annotation) {
36 | return new FloatWritable((float) Math.exp(1 - annotation.count()));
37 | }
38 |
39 | @Override
40 | public void init(Context context) {}
41 |
42 | @Override
43 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
44 | return null;
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/annotation/UnalignedSourceCounterFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.annotation;
2 |
3 | import java.io.IOException;
4 | import java.util.Map;
5 | import java.util.Set;
6 |
7 | import org.apache.hadoop.io.IntWritable;
8 | import org.apache.hadoop.io.Writable;
9 | import org.apache.hadoop.mapreduce.Reducer.Context;
10 |
11 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
12 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
13 | import edu.jhu.thrax.hadoop.jobs.ThraxJob;
14 | import edu.jhu.thrax.util.Vocabulary;
15 |
16 | @SuppressWarnings("rawtypes")
17 | public class UnalignedSourceCounterFeature implements AnnotationFeature {
18 |
19 | public static final String NAME = "unaligned_source";
20 |
21 | private static final IntWritable ZERO = new IntWritable(0);
22 |
23 | public String getName() {
24 | return NAME;
25 | }
26 |
27 | public IntWritable score(RuleWritable r, Annotation annotation) {
28 | byte[] f2e = annotation.f2e().points;
29 | int[] src = r.source;
30 |
31 | int count = 0;
32 | int i = 0, j = 0;
33 | for (i = 0; i < src.length; ++i) {
34 | if (Vocabulary.nt(src[i])) continue;
35 | if (j >= f2e.length || i != f2e[j]) count++;
36 | while (j < f2e.length && f2e[j] <= i)
37 | j += 2;
38 | }
39 | return new IntWritable(count);
40 | }
41 |
42 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
43 | map.put(Vocabulary.id(NAME), ZERO);
44 | }
45 |
46 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
47 | map.put(Vocabulary.id(NAME), ZERO);
48 | }
49 |
50 | @Override
51 | public void init(Context context) throws IOException, InterruptedException {}
52 |
53 | @Override
54 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
55 | return null;
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/annotation/UnalignedTargetCounterFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.annotation;
2 |
3 | import java.io.IOException;
4 | import java.util.Map;
5 | import java.util.Set;
6 |
7 | import org.apache.hadoop.io.IntWritable;
8 | import org.apache.hadoop.io.Writable;
9 | import org.apache.hadoop.mapreduce.Reducer.Context;
10 |
11 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
12 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
13 | import edu.jhu.thrax.hadoop.jobs.ThraxJob;
14 | import edu.jhu.thrax.util.Vocabulary;
15 |
16 | @SuppressWarnings("rawtypes")
17 | public class UnalignedTargetCounterFeature implements AnnotationFeature {
18 |
19 | public static final String NAME = "unaligned_target";
20 |
21 | private static final IntWritable ZERO = new IntWritable(0);
22 |
23 | public String getName() {
24 | return NAME;
25 | }
26 |
27 | public IntWritable score(RuleWritable r, Annotation annotation) {
28 | byte[] e2f = annotation.e2f().points;
29 | int[] tgt = r.target;
30 |
31 | int count = 0;
32 | int i = 0, j = 0;
33 | for (i = 0; i < tgt.length; ++i) {
34 | if (Vocabulary.nt(tgt[i])) continue;
35 | if (j >= e2f.length || i != e2f[j]) count++;
36 | while (j < e2f.length && e2f[j] <= i)
37 | j += 2;
38 | }
39 | return new IntWritable(count);
40 | }
41 |
42 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
43 | map.put(Vocabulary.id(NAME), ZERO);
44 | }
45 |
46 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
47 | map.put(Vocabulary.id(NAME), ZERO);
48 | }
49 |
50 | @Override
51 | public void init(Context context) throws IOException, InterruptedException {}
52 |
53 | @Override
54 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
55 | return null;
56 | }
57 | }
58 |
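Both unaligned-word counters above assume that AlignmentWritable keeps its links in points as a flat byte array of index pairs whose first entry is the word position on the side being counted (source positions in f2e, target positions in e2f); that is why the scan steps j by two and compares f2e[j] or e2f[j] against the current word index. Under that assumed layout, the links 0-0 and 1-2 would appear in f2e as {0, 0, 1, 2}. AlignmentWritable itself is outside this section, so the layout is inferred from these loops rather than from its definition.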
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/mapred/MapReduceFeatureFactory.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.mapred;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import edu.jhu.thrax.hadoop.features.annotation.CountFeature;
7 | import edu.jhu.thrax.util.FormatUtils;
8 |
9 | public class MapReduceFeatureFactory {
10 |
11 | public static MapReduceFeature get(String name) {
12 | if (name.equals(SourcePhraseGivenTargetFeature.NAME))
13 | return new SourcePhraseGivenTargetFeature();
14 | else if (name.equals(TargetPhraseGivenSourceFeature.NAME))
15 | return new TargetPhraseGivenSourceFeature();
16 | else if (name.equals(GoodTuringSmoothedTargetPhraseGivenSourceFeature.NAME))
17 | return new GoodTuringSmoothedTargetPhraseGivenSourceFeature();
18 | else if (name.equals(GoodTuringSmoothedSourcePhraseGivenTargetFeature.NAME))
19 | return new GoodTuringSmoothedSourcePhraseGivenTargetFeature();
20 | else if (name.equals(SourcePhraseGivenLHSFeature.NAME))
21 | return new SourcePhraseGivenLHSFeature();
22 | else if (name.equals(LhsGivenSourcePhraseFeature.NAME))
23 | return new LhsGivenSourcePhraseFeature();
24 | else if (name.equals(SourcePhraseGivenTargetandLHSFeature.NAME))
25 | return new SourcePhraseGivenTargetandLHSFeature();
26 | else if (name.equals(TargetPhraseGivenSourceandLHSFeature.NAME))
27 | return new TargetPhraseGivenSourceandLHSFeature();
28 | else if (name.equals(TargetPhraseGivenLHSFeature.NAME))
29 | return new TargetPhraseGivenLHSFeature();
30 | else if (name.equals(LhsGivenTargetPhraseFeature.NAME))
31 | return new LhsGivenTargetPhraseFeature();
32 | else if (name.equals(SourceCountFeature.NAME))
33 | return new SourceCountFeature();
34 | else if (name.equals(TargetCountFeature.NAME))
35 | return new TargetCountFeature();
36 |
37 | return null;
38 | }
39 |
40 | public static List<MapReduceFeature> getAll(String names) {
41 | String[] feature_names = FormatUtils.P_COMMA_OR_SPACE.split(names);
42 | List<MapReduceFeature> features = new ArrayList<MapReduceFeature>();
43 |
44 | for (String feature_name : feature_names) {
45 | MapReduceFeature feature = get(feature_name);
46 | if (feature != null) features.add(feature);
47 | }
48 | return features;
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/mapred/coc/GoodTuringSmoother.java:
--------------------------------------------------------------------------------
1 | // Copyright 2015 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | package edu.jhu.thrax.hadoop.features.mapred.coc;
3 |
4 | public class GoodTuringSmoother {
5 | private CountOfCountsEstimator estimator;
6 |
7 | public GoodTuringSmoother(CountOfCountsEstimator estimator) {
8 | this.estimator = estimator;
9 | }
10 |
11 | public double smoothedCount(int count) {
12 | double turingFraction = estimator.getEstimatedCountOfCount(count + 1) / estimator.getEstimatedCountOfCount(count);
13 | return (count + 1) * turingFraction;
14 | }
15 | }
16 |
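smoothedCount implements the standard Good-Turing discount c* = (c + 1) * N(c + 1) / N(c), where N(c) is the smoothed number of distinct events observed exactly c times, as supplied by the CountOfCountsEstimator. A quick worked example with made-up counts-of-counts, purely for illustration:

    // Suppose the fitted estimator reports N(1) = 1000 and N(2) = 400.
    // A rule observed once then receives the discounted count (1 + 1) * 400 / 1000 = 0.8.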
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/NonAggregatingPivotedFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.FloatWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.util.Vocabulary;
10 |
11 | public abstract class NonAggregatingPivotedFeature implements PivotedFeature {
12 |
13 | private static final FloatWritable ZERO = new FloatWritable(0.0f);
14 |
15 | private float value;
16 |
17 | public void initializeAggregation() {
18 | value = Float.MAX_VALUE;
19 | }
20 |
21 | public void aggregate(FeatureMap features) {
22 | FloatWritable val = (FloatWritable) features.get(getName());
23 | if (value == Float.MAX_VALUE) {
24 | value = val.get();
25 | } else {
26 | if (value != val.get()) {
27 | throw new RuntimeException("Diverging values in pseudo-aggregation: " + value + " versus "
28 | + val.get());
29 | }
30 | }
31 | }
32 |
33 | public FloatWritable finalizeAggregation() {
34 | return new FloatWritable(value);
35 | }
36 |
37 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
38 | map.put(Vocabulary.id(getName()), ZERO);
39 | }
40 |
41 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
42 | map.put(Vocabulary.id(getName()), ZERO);
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedAnnotationFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.HashSet;
4 | import java.util.Map;
5 | import java.util.Set;
6 |
7 | import org.apache.hadoop.io.Writable;
8 |
9 | import edu.jhu.thrax.hadoop.datatypes.AlignmentWritable;
10 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
11 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
12 | import edu.jhu.thrax.hadoop.features.annotation.AlignmentFeature;
13 | import edu.jhu.thrax.hadoop.features.annotation.AnnotationPassthroughFeature;
14 |
15 | public class PivotedAnnotationFeature implements PivotedFeature {
16 |
17 | public static final String NAME = "annotation";
18 |
19 | private Annotation aggregated = null;
20 |
21 | public String getName() {
22 | return NAME;
23 | }
24 |
25 | public Set<String> getPrerequisites() {
26 | Set<String> prereqs = new HashSet<String>();
27 | prereqs.add(AlignmentFeature.NAME);
28 | return prereqs;
29 | }
30 |
31 | public Annotation pivot(FeatureMap src, FeatureMap tgt) {
32 | AlignmentWritable src_f2e = ((AlignmentWritable) src.get(AlignmentFeature.NAME));
33 | AlignmentWritable tgt_f2e = ((AlignmentWritable) tgt.get(AlignmentFeature.NAME));
34 |
35 | return new Annotation(src_f2e.join(tgt_f2e));
36 | }
37 |
38 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {}
39 |
40 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {}
41 |
42 | public void initializeAggregation() {
43 | aggregated = null;
44 | }
45 |
46 | public void aggregate(FeatureMap a) {
47 | Annotation annotation = (Annotation) a.get(AnnotationPassthroughFeature.NAME);
48 | if (aggregated == null) {
49 | aggregated = new Annotation(annotation);
50 | } else {
51 | aggregated.setAlignment(aggregated.f2e().intersect(annotation.f2e()));
52 | aggregated.merge(annotation);
53 | }
54 | }
55 |
56 | public Annotation finalizeAggregation() {
57 | return aggregated;
58 | }
59 |
60 | @Override
61 | public Set<String> getLowerBoundLabels() {
62 | return null;
63 | }
64 |
65 | @Override
66 | public Set<String> getUpperBoundLabels() {
67 | return null;
68 | }
69 | }
70 |
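Pivoting pairs two rules that share the same source side f, so the alignment of the pivoted rule e1 -> e2 has to be composed through f; that is what src_f2e.join(tgt_f2e) appears to compute. A hedged sketch of that composition over raw alignment points (the pair representation and the semantics assumed for join are illustrative, not AlignmentWritable's actual implementation):

import java.util.ArrayList;
import java.util.List;

// Illustrative composition through the shared source side: (i, j) in srcF2E and
// (i, k) in tgtF2E yields (j, k) in the pivoted alignment. This is an assumption
// about what AlignmentWritable.join does, not its actual code.
public class AlignmentPivotSketch {
  static List<int[]> compose(List<int[]> srcF2E, List<int[]> tgtF2E) {
    List<int[]> pivoted = new ArrayList<int[]>();
    for (int[] a : srcF2E)
      for (int[] b : tgtF2E)
        if (a[0] == b[0]) pivoted.add(new int[] {a[1], b[1]});
    return pivoted;
  }

  public static void main(String[] args) {
    List<int[]> srcF2E = new ArrayList<int[]>();
    srcF2E.add(new int[] {0, 0});
    srcF2E.add(new int[] {1, 1});
    List<int[]> tgtF2E = new ArrayList<int[]>();
    tgtF2E.add(new int[] {0, 1});
    tgtF2E.add(new int[] {1, 0});
    for (int[] p : compose(srcF2E, tgtF2E))
      System.out.println(p[0] + "-" + p[1]); // prints 0-1 and 1-0
  }
}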
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.Set;
4 |
5 | import org.apache.hadoop.io.Writable;
6 |
7 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
8 | import edu.jhu.thrax.hadoop.features.Feature;
9 |
10 | public interface PivotedFeature extends Feature {
11 |
12 | public Set<String> getPrerequisites();
13 |
14 | public Writable pivot(FeatureMap src, FeatureMap tgt);
15 |
16 | public void initializeAggregation();
17 |
18 | public void aggregate(FeatureMap a);
19 |
20 | public Writable finalizeAggregation();
21 |
22 | public Set<String> getLowerBoundLabels();
23 |
24 | public Set<String> getUpperBoundLabels();
25 | }
26 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedFeatureFactory.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import edu.jhu.thrax.util.FormatUtils;
7 |
8 | public class PivotedFeatureFactory {
9 |
10 | public static PivotedFeature get(String name) {
11 | if (name.equals(PivotedTargetPhraseGivenSourceFeature.NAME))
12 | return new PivotedTargetPhraseGivenSourceFeature();
13 | else if (name.equals(PivotedSourcePhraseGivenTargetFeature.NAME))
14 | return new PivotedSourcePhraseGivenTargetFeature();
15 | else if (name.equals(PivotedRarityPenaltyFeature.NAME))
16 | return new PivotedRarityPenaltyFeature();
17 | else if (name.equals(PivotedLexicalSourceGivenTargetFeature.NAME))
18 | return new PivotedLexicalSourceGivenTargetFeature();
19 | else if (name.equals(PivotedLexicalTargetGivenSourceFeature.NAME))
20 | return new PivotedLexicalTargetGivenSourceFeature();
21 | else if (name.equals(PivotedSourcePhraseGivenLHSFeature.NAME))
22 | return new PivotedSourcePhraseGivenLHSFeature();
23 | else if (name.equals(PivotedLhsGivenSourcePhraseFeature.NAME))
24 | return new PivotedLhsGivenSourcePhraseFeature();
25 | else if (name.equals(PivotedSourcePhraseGivenTargetAndLHSFeature.NAME))
26 | return new PivotedSourcePhraseGivenTargetAndLHSFeature();
27 | else if (name.equals(PivotedTargetPhraseGivenLHSFeature.NAME))
28 | return new PivotedTargetPhraseGivenLHSFeature();
29 | else if (name.equals(PivotedLhsGivenTargetPhraseFeature.NAME))
30 | return new PivotedLhsGivenTargetPhraseFeature();
31 | else if (name.equals(PivotedTargetPhraseGivenSourceAndLHSFeature.NAME))
32 | return new PivotedTargetPhraseGivenSourceAndLHSFeature();
33 |
34 | return null;
35 | }
36 |
37 | public static List<PivotedFeature> getAll(String names) {
38 | String[] feature_names = FormatUtils.P_COMMA_OR_SPACE.split(names);
39 | List<PivotedFeature> features = new ArrayList<PivotedFeature>();
40 |
41 | for (String feature_name : feature_names) {
42 | PivotedFeature feature = get(feature_name);
43 | if (feature != null) features.add(feature);
44 | }
45 | return features;
46 | }
47 | }
48 |
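getAll takes a comma- or whitespace-separated feature string and silently skips names the factory does not recognize, so callers can hand it the full thrax.features value and get back only the pivoted subset. A hypothetical usage sketch (the feature names in the string are placeholders; the real names are the NAME constants of the individual classes):

import java.util.List;

import edu.jhu.thrax.hadoop.features.pivot.PivotedFeature;
import edu.jhu.thrax.hadoop.features.pivot.PivotedFeatureFactory;

public class PivotedFeatureFactoryUsage {
  public static void main(String[] args) {
    // Placeholder names; unrecognized names (e.g. non-pivoted features) are simply dropped.
    List<PivotedFeature> pivoted = PivotedFeatureFactory.getAll("rarity-penalty, lexprob unknown-feature");
    for (PivotedFeature f : pivoted)
      System.out.println(f.getName());
  }
}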
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedLexicalSourceGivenTargetFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.io.FloatWritable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.hadoop.features.annotation.SourceGivenTargetLexicalProbabilityFeature;
10 | import edu.jhu.thrax.hadoop.features.annotation.TargetGivenSourceLexicalProbabilityFeature;
11 |
12 | public class PivotedLexicalSourceGivenTargetFeature extends PivotedNegLogProbFeature {
13 |
14 | public static final String NAME = SourceGivenTargetLexicalProbabilityFeature.NAME;
15 |
16 | public String getName() {
17 | return NAME;
18 | }
19 |
20 | public Set<String> getPrerequisites() {
21 | Set<String> prereqs = new HashSet<String>();
22 | prereqs.add(SourceGivenTargetLexicalProbabilityFeature.NAME);
23 | prereqs.add(TargetGivenSourceLexicalProbabilityFeature.NAME);
24 | return prereqs;
25 | }
26 |
27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) {
28 | float egf = ((FloatWritable) tgt.get(TargetGivenSourceLexicalProbabilityFeature.NAME)).get();
29 | float fge = ((FloatWritable) src.get(SourceGivenTargetLexicalProbabilityFeature.NAME)).get();
30 |
31 | return new FloatWritable(egf + fge);
32 | }
33 |
34 | @Override
35 | public Set<String> getLowerBoundLabels() {
36 | Set<String> lower_bound_labels = new HashSet<String>();
37 | lower_bound_labels.add(TargetGivenSourceLexicalProbabilityFeature.NAME);
38 | lower_bound_labels.add(SourceGivenTargetLexicalProbabilityFeature.NAME);
39 | return lower_bound_labels;
40 | }
41 |
42 | @Override
43 | public Set<String> getUpperBoundLabels() {
44 | return null;
45 | }
46 | }
47 |
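Because these features are stored as negative log probabilities, pivoting through a shared source phrase g turns the product p(f|g) * p(g|e) into the simple addition egf + fge seen above; summing over all pivot phrases is then left to PivotedNegLogProbFeature's log-space aggregation. A small worked example with illustrative values:

public class PivotedLexprobSketch {
  public static void main(String[] args) {
    double fGivenG = -Math.log(0.25); // -log p(f|g), illustrative value
    double gGivenE = -Math.log(0.5);  // -log p(g|e), illustrative value
    double pivoted = fGivenG + gGivenE;     // adding -log values multiplies the probabilities
    System.out.println(Math.exp(-pivoted)); // ~0.125 = 0.25 * 0.5
  }
}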
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedLexicalTargetGivenSourceFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.io.FloatWritable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.hadoop.features.annotation.SourceGivenTargetLexicalProbabilityFeature;
10 | import edu.jhu.thrax.hadoop.features.annotation.TargetGivenSourceLexicalProbabilityFeature;
11 |
12 | public class PivotedLexicalTargetGivenSourceFeature extends PivotedNegLogProbFeature {
13 |
14 | public static final String NAME = TargetGivenSourceLexicalProbabilityFeature.NAME;
15 |
16 | public String getName() {
17 | return NAME;
18 | }
19 |
20 | public Set<String> getPrerequisites() {
21 | Set<String> prereqs = new HashSet<String>();
22 | prereqs.add(TargetGivenSourceLexicalProbabilityFeature.NAME);
23 | prereqs.add(SourceGivenTargetLexicalProbabilityFeature.NAME);
24 | return prereqs;
25 | }
26 |
27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) {
28 | float egf = ((FloatWritable) src.get(TargetGivenSourceLexicalProbabilityFeature.NAME)).get();
29 | float fge = ((FloatWritable) tgt.get(SourceGivenTargetLexicalProbabilityFeature.NAME)).get();
30 |
31 | return new FloatWritable(egf + fge);
32 | }
33 |
34 | @Override
35 | public Set<String> getLowerBoundLabels() {
36 | Set<String> lower_bound_labels = new HashSet<String>();
37 | lower_bound_labels.add(TargetGivenSourceLexicalProbabilityFeature.NAME);
38 | lower_bound_labels.add(SourceGivenTargetLexicalProbabilityFeature.NAME);
39 | return lower_bound_labels;
40 | }
41 |
42 | @Override
43 | public Set<String> getUpperBoundLabels() {
44 | return null;
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedLhsGivenSourcePhraseFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.io.FloatWritable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.hadoop.features.mapred.LhsGivenSourcePhraseFeature;
10 | import edu.jhu.thrax.hadoop.features.mapred.LhsGivenTargetPhraseFeature;
11 |
12 | public class PivotedLhsGivenSourcePhraseFeature extends NonAggregatingPivotedFeature {
13 |
14 | public static final String NAME = LhsGivenSourcePhraseFeature.NAME;
15 |
16 | public String getName() {
17 | return NAME;
18 | }
19 |
20 | public Set<String> getPrerequisites() {
21 | Set<String> prereqs = new HashSet<String>();
22 | prereqs.add(LhsGivenTargetPhraseFeature.NAME);
23 | return prereqs;
24 | }
25 |
26 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) {
27 | return new FloatWritable(((FloatWritable) src.get(LhsGivenTargetPhraseFeature.NAME)).get());
28 | }
29 |
30 | @Override
31 | public Set<String> getLowerBoundLabels() {
32 | Set<String> lower_bound_labels = new HashSet<String>();
33 | lower_bound_labels.add(LhsGivenTargetPhraseFeature.NAME);
34 | return lower_bound_labels;
35 | }
36 |
37 | @Override
38 | public Set<String> getUpperBoundLabels() {
39 | return null;
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedLhsGivenTargetPhraseFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.io.FloatWritable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.hadoop.features.mapred.LhsGivenTargetPhraseFeature;
10 |
11 | public class PivotedLhsGivenTargetPhraseFeature extends NonAggregatingPivotedFeature {
12 |
13 | public static final String NAME = LhsGivenTargetPhraseFeature.NAME;
14 |
15 | public String getName() {
16 | return NAME;
17 | }
18 |
19 | public Set<String> getPrerequisites() {
20 | Set<String> prereqs = new HashSet<String>();
21 | prereqs.add(LhsGivenTargetPhraseFeature.NAME);
22 | return prereqs;
23 | }
24 |
25 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) {
26 | return new FloatWritable(((FloatWritable) tgt.get(LhsGivenTargetPhraseFeature.NAME)).get());
27 | }
28 |
29 | @Override
30 | public Set<String> getLowerBoundLabels() {
31 | Set<String> lower_bound_labels = new HashSet<String>();
32 | lower_bound_labels.add(LhsGivenTargetPhraseFeature.NAME);
33 | return lower_bound_labels;
34 | }
35 |
36 | @Override
37 | public Set<String> getUpperBoundLabels() {
38 | return null;
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedNegLogProbFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.io.FloatWritable;
6 | import org.apache.hadoop.io.Writable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.util.NegLogMath;
10 | import edu.jhu.thrax.util.Vocabulary;
11 |
12 | public abstract class PivotedNegLogProbFeature implements PivotedFeature {
13 |
14 | private static final FloatWritable ONE_PROB = new FloatWritable(0.0f);
15 |
16 | private float aggregated;
17 |
18 | public void initializeAggregation() {
19 | aggregated = 64.0f;
20 | }
21 |
22 | public void aggregate(FeatureMap features) {
23 | FloatWritable val = (FloatWritable) features.get(getName());
24 | aggregated = NegLogMath.logAdd(aggregated, val.get());
25 | }
26 |
27 | public FloatWritable finalizeAggregation() {
28 | return new FloatWritable(aggregated);
29 | }
30 |
31 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
32 | map.put(Vocabulary.id(getName()), ONE_PROB);
33 | }
34 |
35 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
36 | map.put(Vocabulary.id(getName()), ONE_PROB);
37 | }
38 | }
39 |
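Aggregation here sums probabilities over pivot phrases without leaving negative-log space: the accumulator starts at 64.0f, which stands in for probability zero, and each value is folded in with NegLogMath.logAdd. A standalone re-implementation of that log-space addition (the textbook identity, not necessarily NegLogMath's exact code):

public class NegLogAddSketch {
  // Given a = -log p and b = -log q, returns -log(p + q) without leaving log space.
  static float logAdd(float a, float b) {
    float min = Math.min(a, b); // the more probable of the two terms
    float max = Math.max(a, b);
    return (float) (min - Math.log1p(Math.exp(min - max)));
  }

  public static void main(String[] args) {
    float a = (float) -Math.log(0.25);
    float b = (float) -Math.log(0.25);
    System.out.println(Math.exp(-logAdd(a, b))); // ~0.5 = 0.25 + 0.25
  }
}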
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedRarityPenaltyFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.HashSet;
4 | import java.util.Map;
5 | import java.util.Set;
6 |
7 | import org.apache.hadoop.io.FloatWritable;
8 | import org.apache.hadoop.io.Writable;
9 |
10 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
11 | import edu.jhu.thrax.hadoop.features.annotation.RarityPenaltyFeature;
12 | import edu.jhu.thrax.util.Vocabulary;
13 |
14 | public class PivotedRarityPenaltyFeature implements PivotedFeature {
15 |
16 | public static final String NAME = RarityPenaltyFeature.NAME;
17 |
18 | private static final FloatWritable ZERO = new FloatWritable(0.0f);
19 |
20 | private static final float RENORMALIZE = (float) Math.exp(-1);
21 |
22 | private float aggregated_rp;
23 |
24 | public String getName() {
25 | return NAME;
26 | }
27 |
28 | public Set<String> getPrerequisites() {
29 | Set<String> prereqs = new HashSet<String>();
30 | prereqs.add(RarityPenaltyFeature.NAME);
31 | return prereqs;
32 | }
33 |
34 | public FloatWritable pivot(FeatureMap a, FeatureMap b) {
35 | float a_rp = ((FloatWritable) a.get(RarityPenaltyFeature.NAME)).get();
36 | float b_rp = ((FloatWritable) b.get(RarityPenaltyFeature.NAME)).get();
37 | return new FloatWritable(Math.max(a_rp, b_rp));
38 | }
39 |
40 | public void unaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
41 | map.put(Vocabulary.id(NAME), ZERO);
42 | }
43 |
44 | public void binaryGlueRuleScore(int nt, Map<Integer, Writable> map) {
45 | map.put(Vocabulary.id(NAME), ZERO);
46 | }
47 |
48 | public void initializeAggregation() {
49 | aggregated_rp = -1;
50 | }
51 |
52 | public void aggregate(FeatureMap a) {
53 | float rp = ((FloatWritable) a.get(NAME)).get();
54 | if (aggregated_rp == -1) {
55 | aggregated_rp = rp;
56 | } else {
57 | // Rarity is exp(1 - count). To compute rarity over a sum of counts:
58 | // rarity_{1+2} = exp(1 - (count_1 + count_2)) = exp(1 - count_1) * exp(-count_2) =
59 | // = exp(1 - count_1) * exp(1 - count_2) * exp(-1) = rarity_1 * rarity_2 * exp(-1)
60 | aggregated_rp *= rp * RENORMALIZE;
61 | }
62 | }
63 |
64 | public FloatWritable finalizeAggregation() {
65 | return new FloatWritable(aggregated_rp);
66 | }
67 |
68 | @Override
69 | public Set<String> getLowerBoundLabels() {
70 | Set<String> lower_bound_labels = new HashSet<String>();
71 | lower_bound_labels.add(RarityPenaltyFeature.NAME);
72 | return lower_bound_labels;
73 | }
74 |
75 | @Override
76 | public Set<String> getUpperBoundLabels() {
77 | return null;
78 | }
79 | }
80 |
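The comment in aggregate() can be verified numerically: with rarity defined as exp(1 - count), combining counts count_1 and count_2 must yield exp(1 - (count_1 + count_2)), which the code reproduces by multiplying the two rarities and one RENORMALIZE factor of exp(-1). A quick check with illustrative counts:

public class RaritySketch {
  public static void main(String[] args) {
    int count1 = 2, count2 = 3; // illustrative rule counts from the two pivoted grammars
    double rarity1 = Math.exp(1 - count1);
    double rarity2 = Math.exp(1 - count2);
    double combined = rarity1 * rarity2 * Math.exp(-1); // the RENORMALIZE factor
    System.out.println(combined);                        // exp(-4)
    System.out.println(Math.exp(1 - (count1 + count2))); // exp(-4), identical
  }
}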
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedSourcePhraseGivenLHSFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.io.FloatWritable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenLHSFeature;
10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenLHSFeature;
11 |
12 | public class PivotedSourcePhraseGivenLHSFeature extends NonAggregatingPivotedFeature {
13 |
14 | public static final String NAME = SourcePhraseGivenLHSFeature.NAME;
15 |
16 | public String getName() {
17 | return NAME;
18 | }
19 |
20 | public Set<String> getPrerequisites() {
21 | Set<String> prereqs = new HashSet<String>();
22 | prereqs.add(TargetPhraseGivenLHSFeature.NAME);
23 | return prereqs;
24 | }
25 |
26 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) {
27 | return new FloatWritable(((FloatWritable) src.get(TargetPhraseGivenLHSFeature.NAME)).get());
28 | }
29 |
30 | @Override
31 | public Set<String> getLowerBoundLabels() {
32 | Set<String> lower_bound_labels = new HashSet<String>();
33 | lower_bound_labels.add(TargetPhraseGivenLHSFeature.NAME);
34 | return lower_bound_labels;
35 | }
36 |
37 | @Override
38 | public Set<String> getUpperBoundLabels() {
39 | return null;
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedSourcePhraseGivenTargetAndLHSFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.io.FloatWritable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenTargetandLHSFeature;
10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenSourceandLHSFeature;
11 |
12 | public class PivotedSourcePhraseGivenTargetAndLHSFeature extends PivotedNegLogProbFeature {
13 |
14 | public static final String NAME = SourcePhraseGivenTargetandLHSFeature.NAME;
15 |
16 | public String getName() {
17 | return NAME;
18 | }
19 |
20 | public Set<String> getPrerequisites() {
21 | Set<String> prereqs = new HashSet<String>();
22 | prereqs.add(SourcePhraseGivenTargetandLHSFeature.NAME);
23 | prereqs.add(TargetPhraseGivenSourceandLHSFeature.NAME);
24 | return prereqs;
25 | }
26 |
27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) {
28 | float fge = ((FloatWritable) src.get(TargetPhraseGivenSourceandLHSFeature.NAME)).get();
29 | float egf = ((FloatWritable) tgt.get(SourcePhraseGivenTargetandLHSFeature.NAME)).get();
30 |
31 | return new FloatWritable(egf + fge);
32 | }
33 |
34 | @Override
35 | public Set<String> getLowerBoundLabels() {
36 | Set<String> lower_bound_labels = new HashSet<String>();
37 | lower_bound_labels.add(TargetPhraseGivenSourceandLHSFeature.NAME);
38 | lower_bound_labels.add(SourcePhraseGivenTargetandLHSFeature.NAME);
39 | return lower_bound_labels;
40 | }
41 |
42 | @Override
43 | public Set<String> getUpperBoundLabels() {
44 | return null;
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedSourcePhraseGivenTargetFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.io.FloatWritable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenTargetFeature;
10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenSourceFeature;
11 |
12 | public class PivotedSourcePhraseGivenTargetFeature extends PivotedNegLogProbFeature {
13 |
14 | public static final String NAME = SourcePhraseGivenTargetFeature.NAME;
15 |
16 | public String getName() {
17 | return NAME;
18 | }
19 |
20 | public Set<String> getPrerequisites() {
21 | Set<String> prereqs = new HashSet<String>();
22 | prereqs.add(TargetPhraseGivenSourceFeature.NAME);
23 | prereqs.add(SourcePhraseGivenTargetFeature.NAME);
24 | return prereqs;
25 | }
26 |
27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) {
28 | float src_f = ((FloatWritable) src.get(TargetPhraseGivenSourceFeature.NAME)).get();
29 | float f_tgt = ((FloatWritable) tgt.get(SourcePhraseGivenTargetFeature.NAME)).get();
30 |
31 | return new FloatWritable(src_f + f_tgt);
32 | }
33 |
34 | @Override
35 | public Set<String> getLowerBoundLabels() {
36 | Set<String> lower_bound_labels = new HashSet<String>();
37 | lower_bound_labels.add(TargetPhraseGivenSourceFeature.NAME);
38 | lower_bound_labels.add(SourcePhraseGivenTargetFeature.NAME);
39 | return lower_bound_labels;
40 | }
41 |
42 | @Override
43 | public Set<String> getUpperBoundLabels() {
44 | return null;
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedTargetPhraseGivenLHSFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.io.FloatWritable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenLHSFeature;
10 |
11 | public class PivotedTargetPhraseGivenLHSFeature extends NonAggregatingPivotedFeature {
12 |
13 | public static final String NAME = TargetPhraseGivenLHSFeature.NAME;
14 |
15 | public String getName() {
16 | return NAME;
17 | }
18 |
19 | public Set<String> getPrerequisites() {
20 | Set<String> prereqs = new HashSet<String>();
21 | prereqs.add(TargetPhraseGivenLHSFeature.NAME);
22 | return prereqs;
23 | }
24 |
25 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) {
26 | return new FloatWritable(((FloatWritable) tgt.get(TargetPhraseGivenLHSFeature.NAME)).get());
27 | }
28 |
29 | @Override
30 | public Set<String> getLowerBoundLabels() {
31 | Set<String> lower_bound_labels = new HashSet<String>();
32 | lower_bound_labels.add(TargetPhraseGivenLHSFeature.NAME);
33 | return lower_bound_labels;
34 | }
35 |
36 | @Override
37 | public Set<String> getUpperBoundLabels() {
38 | return null;
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedTargetPhraseGivenSourceAndLHSFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.io.FloatWritable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenTargetandLHSFeature;
10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenSourceandLHSFeature;
11 |
12 | public class PivotedTargetPhraseGivenSourceAndLHSFeature extends PivotedNegLogProbFeature {
13 |
14 | public static final String NAME = TargetPhraseGivenSourceandLHSFeature.NAME;
15 |
16 | public String getName() {
17 | return NAME;
18 | }
19 |
20 | public Set<String> getPrerequisites() {
21 | Set<String> prereqs = new HashSet<String>();
22 | prereqs.add(TargetPhraseGivenSourceandLHSFeature.NAME);
23 | prereqs.add(SourcePhraseGivenTargetandLHSFeature.NAME);
24 | return prereqs;
25 | }
26 |
27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) {
28 | float fge = ((FloatWritable) tgt.get(TargetPhraseGivenSourceandLHSFeature.NAME)).get();
29 | float egf = ((FloatWritable) src.get(SourcePhraseGivenTargetandLHSFeature.NAME)).get();
30 |
31 | return new FloatWritable(egf + fge);
32 | }
33 |
34 | @Override
35 | public Set<String> getLowerBoundLabels() {
36 | Set<String> lower_bound_labels = new HashSet<String>();
37 | lower_bound_labels.add(TargetPhraseGivenSourceandLHSFeature.NAME);
38 | lower_bound_labels.add(SourcePhraseGivenTargetandLHSFeature.NAME);
39 | return lower_bound_labels;
40 | }
41 |
42 | @Override
43 | public Set<String> getUpperBoundLabels() {
44 | return null;
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/features/pivot/PivotedTargetPhraseGivenSourceFeature.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.features.pivot;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.io.FloatWritable;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.hadoop.features.mapred.SourcePhraseGivenTargetFeature;
10 | import edu.jhu.thrax.hadoop.features.mapred.TargetPhraseGivenSourceFeature;
11 |
12 | public class PivotedTargetPhraseGivenSourceFeature extends PivotedNegLogProbFeature {
13 |
14 | public static final String NAME = TargetPhraseGivenSourceFeature.NAME;
15 |
16 | public String getName() {
17 | return NAME;
18 | }
19 |
20 | public Set<String> getPrerequisites() {
21 | Set<String> prereqs = new HashSet<String>();
22 | prereqs.add(TargetPhraseGivenSourceFeature.NAME);
23 | prereqs.add(SourcePhraseGivenTargetFeature.NAME);
24 | return prereqs;
25 | }
26 |
27 | public FloatWritable pivot(FeatureMap src, FeatureMap tgt) {
28 | float tgt_f = ((FloatWritable) tgt.get(TargetPhraseGivenSourceFeature.NAME)).get();
29 | float f_src = ((FloatWritable) src.get(SourcePhraseGivenTargetFeature.NAME)).get();
30 |
31 | return new FloatWritable(tgt_f + f_src);
32 | }
33 |
34 | @Override
35 | public Set<String> getLowerBoundLabels() {
36 | Set<String> lower_bound_labels = new HashSet<String>();
37 | lower_bound_labels.add(TargetPhraseGivenSourceFeature.NAME);
38 | lower_bound_labels.add(SourcePhraseGivenTargetFeature.NAME);
39 | return lower_bound_labels;
40 | }
41 |
42 | @Override
43 | public Set<String> getUpperBoundLabels() {
44 | return null;
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/jobs/DefaultValues.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.jobs;
2 |
3 | public class DefaultValues {
4 | public static int DEFAULT_NUM_REDUCERS = 4;
5 |
6 | private DefaultValues() {};
7 | }
8 |
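DEFAULT_NUM_REDUCERS is only a last resort: the jobs in this package resolve their reducer count as thrax.reducers if set, falling back to Hadoop's mapreduce.job.reduces, and only then to this constant. A minimal sketch of that lookup chain (the value 16 below is illustrative):

import org.apache.hadoop.conf.Configuration;

import edu.jhu.thrax.hadoop.jobs.DefaultValues;

public class ReducerCountSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    conf.setInt("mapreduce.job.reduces", 16); // illustrative cluster-level setting
    // Resolution order: thrax.reducers, then mapreduce.job.reduces, then the default of 4.
    int numReducers = conf.getInt("thrax.reducers",
        conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS));
    System.out.println(numReducers); // 16, since thrax.reducers is unset here
  }
}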
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/jobs/DistributionalContextExtractionJob.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.jobs;
2 |
3 | import java.io.IOException;
4 | import java.util.HashSet;
5 | import java.util.Set;
6 |
7 | import org.apache.hadoop.conf.Configuration;
8 | import org.apache.hadoop.fs.Path;
9 | import org.apache.hadoop.io.NullWritable;
10 | import org.apache.hadoop.io.Text;
11 | import org.apache.hadoop.mapreduce.Job;
12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
14 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
15 |
16 | import edu.jhu.thrax.hadoop.distributional.ContextWritable;
17 | import edu.jhu.thrax.hadoop.distributional.DistributionalContextCombiner;
18 | import edu.jhu.thrax.hadoop.distributional.DistributionalContextMapper;
19 | import edu.jhu.thrax.hadoop.distributional.DistributionalContextReducer;
20 | import edu.jhu.thrax.hadoop.distributional.SignatureWritable;
21 |
22 | public class DistributionalContextExtractionJob implements ThraxJob {
23 |
24 | public Job getJob(Configuration conf) throws IOException {
25 | Job job = new Job(conf, "distributional");
26 |
27 | job.setJarByClass(DistributionalContextMapper.class);
28 |
29 | job.setMapperClass(DistributionalContextMapper.class);
30 | job.setCombinerClass(DistributionalContextCombiner.class);
31 | job.setReducerClass(DistributionalContextReducer.class);
32 |
33 | job.setMapOutputKeyClass(Text.class);
34 | job.setMapOutputValueClass(ContextWritable.class);
35 |
36 | job.setOutputKeyClass(SignatureWritable.class);
37 | job.setOutputValueClass(NullWritable.class);
38 |
39 | job.setOutputFormatClass(SequenceFileOutputFormat.class);
40 |
41 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS));
42 | job.setNumReduceTasks(numReducers);
43 |
44 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.input-file")));
45 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "signatures"));
46 |
47 | int max_split_size = conf.getInt("thrax.max-split-size", 0);
48 | if (max_split_size != 0) FileInputFormat.setMaxInputSplitSize(job, max_split_size);
49 |
50 | return job;
51 | }
52 |
53 | public String getName() {
54 | return "distributional";
55 | }
56 |
57 | public String getOutputSuffix() {
58 | return null;
59 | }
60 |
61 | @Override
62 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
63 | return new HashSet<Class<? extends ThraxJob>>();
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/jobs/DistributionalContextSortingJob.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.jobs;
2 |
3 | import java.io.IOException;
4 | import java.util.HashSet;
5 | import java.util.Set;
6 |
7 | import org.apache.hadoop.conf.Configuration;
8 | import org.apache.hadoop.fs.Path;
9 | import org.apache.hadoop.io.NullWritable;
10 | import org.apache.hadoop.mapreduce.Job;
11 | import org.apache.hadoop.mapreduce.Mapper;
12 | import org.apache.hadoop.mapreduce.Reducer;
13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
14 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
16 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
17 |
18 | import edu.jhu.thrax.hadoop.distributional.DistributionalContextMapper;
19 | import edu.jhu.thrax.hadoop.distributional.SignatureWritable;
20 |
21 | public class DistributionalContextSortingJob implements ThraxJob {
22 |
23 | private static HashSet<Class<? extends ThraxJob>> prereqs =
24 | new HashSet<Class<? extends ThraxJob>>();
25 |
26 | public Job getJob(Configuration conf) throws IOException {
27 | Job job = new Job(conf, "sorting");
28 |
29 | job.setJarByClass(DistributionalContextMapper.class);
30 |
31 | job.setMapperClass(Mapper.class);
32 | job.setReducerClass(Reducer.class);
33 |
34 | job.setInputFormatClass(SequenceFileInputFormat.class);
35 |
36 | job.setOutputKeyClass(SignatureWritable.class);
37 | job.setOutputValueClass(NullWritable.class);
38 |
39 | job.setOutputFormatClass(SequenceFileOutputFormat.class);
40 |
41 | // TODO: Figure out how to make this workable with multiple reducers. Currently -getmerge-ing
42 | // multiple sequence file outputs from several reducers yields a broken file.
43 | job.setNumReduceTasks(1);
44 |
45 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "signatures"));
46 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.outputPath", "")));
47 |
48 | int max_split_size = conf.getInt("thrax.max-split-size", 0);
49 | if (max_split_size != 0) FileInputFormat.setMaxInputSplitSize(job, max_split_size);
50 |
51 | return job;
52 | }
53 |
54 | public String getName() {
55 | return "sorting";
56 | }
57 |
58 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
59 | prereqs.add(DistributionalContextExtractionJob.class);
60 | return prereqs;
61 | }
62 |
63 | public String getOutputSuffix() {
64 | return null;
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/jobs/ExtractionJob.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.jobs;
2 |
3 | import java.io.IOException;
4 | import java.util.HashSet;
5 | import java.util.Set;
6 |
7 | import org.apache.hadoop.conf.Configuration;
8 | import org.apache.hadoop.fs.Path;
9 | import org.apache.hadoop.mapreduce.Job;
10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
12 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
13 |
14 | import edu.jhu.thrax.hadoop.datatypes.AlignedRuleWritable;
15 | import edu.jhu.thrax.hadoop.datatypes.Annotation;
16 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
17 | import edu.jhu.thrax.hadoop.extraction.ExtractionCombiner;
18 | import edu.jhu.thrax.hadoop.extraction.ExtractionMapper;
19 | import edu.jhu.thrax.hadoop.extraction.ExtractionReducer;
20 |
21 | public class ExtractionJob implements ThraxJob {
22 |
23 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
24 | Set<Class<? extends ThraxJob>> result = new HashSet<Class<? extends ThraxJob>>();
25 | result.add(VocabularyJob.class);
26 | return result;
27 | }
28 |
29 | public Job getJob(Configuration conf) throws IOException {
30 | Job job = new Job(conf, "extraction");
31 | job.setJarByClass(ExtractionMapper.class);
32 |
33 | job.setMapperClass(ExtractionMapper.class);
34 | job.setCombinerClass(ExtractionCombiner.class);
35 | job.setReducerClass(ExtractionReducer.class);
36 |
37 | job.setSortComparatorClass(AlignedRuleWritable.RuleYieldComparator.class);
38 | job.setPartitionerClass(AlignedRuleWritable.RuleYieldPartitioner.class);
39 |
40 | job.setMapOutputKeyClass(AlignedRuleWritable.class);
41 | job.setMapOutputValueClass(Annotation.class);
42 | job.setOutputKeyClass(RuleWritable.class);
43 | job.setOutputValueClass(Annotation.class);
44 |
45 | job.setOutputFormatClass(SequenceFileOutputFormat.class);
46 |
47 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS));
48 | job.setNumReduceTasks(numReducers);
49 |
50 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.input-file")));
51 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0);
52 | if (maxSplitSize != 0) FileInputFormat.setMaxInputSplitSize(job, maxSplitSize);
53 |
54 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "rules"));
55 |
56 | return job;
57 | }
58 |
59 | // TODO: unify names of jobs and their output directories
60 |
61 | public String getName() {
62 | return "extraction";
63 | }
64 |
65 | public String getOutputSuffix() {
66 | return "rules";
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/jobs/FeatureCollectionJob.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.jobs;
2 |
3 | import java.io.IOException;
4 | import java.util.HashSet;
5 | import java.util.Set;
6 |
7 | import org.apache.hadoop.conf.Configuration;
8 | import org.apache.hadoop.fs.Path;
9 | import org.apache.hadoop.mapreduce.Job;
10 | import org.apache.hadoop.mapreduce.Mapper;
11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
12 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
14 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
15 |
16 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
17 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair;
18 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
19 | import edu.jhu.thrax.hadoop.paraphrasing.FeatureCollectionReducer;
20 |
21 | public class FeatureCollectionJob implements ThraxJob {
22 |
23 | private static HashSet<Class<? extends ThraxJob>> prereqs =
24 | new HashSet<Class<? extends ThraxJob>>();
25 |
26 | private static HashSet<String> prereq_names = new HashSet<String>();
27 |
28 | public static void addPrerequisite(Class<? extends ThraxJob> c) {
29 | prereqs.add(c);
30 | try {
31 | ThraxJob prereq;
32 | prereq = c.newInstance();
33 | prereq_names.add(prereq.getOutputSuffix());
34 | } catch (Exception e) {
35 | e.printStackTrace();
36 | }
37 | }
38 |
39 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
40 | prereqs.add(ExtractionJob.class);
41 | return prereqs;
42 | }
43 |
44 | public Job getJob(Configuration conf) throws IOException {
45 | Job job = new Job(conf, "collect");
46 |
47 | String workDir = conf.get("thrax.work-dir");
48 |
49 | job.setJarByClass(FeatureCollectionReducer.class);
50 |
51 | job.setMapperClass(Mapper.class);
52 | job.setReducerClass(FeatureCollectionReducer.class);
53 |
54 | job.setInputFormatClass(SequenceFileInputFormat.class);
55 | job.setMapOutputKeyClass(RuleWritable.class);
56 | job.setMapOutputValueClass(FeaturePair.class);
57 | job.setOutputKeyClass(RuleWritable.class);
58 | job.setOutputValueClass(FeatureMap.class);
59 | job.setOutputFormatClass(SequenceFileOutputFormat.class);
60 |
61 | job.setPartitionerClass(RuleWritable.YieldPartitioner.class);
62 |
63 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS));
64 | job.setNumReduceTasks(numReducers);
65 |
66 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0);
67 | if (maxSplitSize != 0) FileInputFormat.setMaxInputSplitSize(job, maxSplitSize * 20);
68 |
69 | for (String prereq_name : prereq_names)
70 | FileInputFormat.addInputPath(job, new Path(workDir + prereq_name));
71 |
72 | // TODO: double-check this.
73 | if (FileInputFormat.getInputPaths(job).length == 0)
74 | FileInputFormat.addInputPath(job, new Path(workDir + "rules"));
75 |
76 | String outputPath = workDir + "collected";
77 | FileOutputFormat.setOutputPath(job, new Path(outputPath));
78 |
79 | return job;
80 | }
81 |
82 | public String getName() {
83 | return "collect";
84 | }
85 |
86 | public String getOutputSuffix() {
87 | return "collected";
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/jobs/JobState.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.jobs;
2 |
3 | public enum JobState
4 | {
5 | PREREQ_FAILED,
6 | FAILED,
7 | READY,
8 | RUNNING,
9 | SUCCESS,
10 | WAITING,
11 | PLANNED
12 | }
13 |
14 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/jobs/ParaphraseAggregationJob.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.jobs;
2 |
3 | import java.io.IOException;
4 | import java.util.HashSet;
5 | import java.util.Set;
6 |
7 | import org.apache.hadoop.conf.Configuration;
8 | import org.apache.hadoop.fs.Path;
9 | import org.apache.hadoop.io.NullWritable;
10 | import org.apache.hadoop.io.Text;
11 | import org.apache.hadoop.io.compress.GzipCodec;
12 | import org.apache.hadoop.mapreduce.Job;
13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
14 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
16 |
17 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
18 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
19 | import edu.jhu.thrax.hadoop.paraphrasing.AggregationCombiner;
20 | import edu.jhu.thrax.hadoop.paraphrasing.AggregationMapper;
21 | import edu.jhu.thrax.hadoop.paraphrasing.AggregationReducer;
22 |
23 | public class ParaphraseAggregationJob implements ThraxJob {
24 |
25 | private static HashSet<Class<? extends ThraxJob>> prereqs =
26 | new HashSet<Class<? extends ThraxJob>>();
27 |
28 | public Job getJob(Configuration conf) throws IOException {
29 | Job job = new Job(conf, "aggregate");
30 |
31 | job.setJarByClass(AggregationReducer.class);
32 |
33 | job.setMapperClass(AggregationMapper.class);
34 | job.setCombinerClass(AggregationCombiner.class);
35 | job.setReducerClass(AggregationReducer.class);
36 |
37 | job.setInputFormatClass(SequenceFileInputFormat.class);
38 | job.setMapOutputKeyClass(RuleWritable.class);
39 | job.setMapOutputValueClass(FeatureMap.class);
40 | job.setOutputKeyClass(Text.class);
41 | job.setOutputValueClass(NullWritable.class);
42 |
43 | job.setSortComparatorClass(RuleWritable.YieldComparator.class);
44 | job.setPartitionerClass(RuleWritable.FirstWordPartitioner.class);
45 |
46 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "pivoted"));
47 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0);
48 | if (maxSplitSize != 0) FileInputFormat.setMaxInputSplitSize(job, maxSplitSize * 20);
49 |
50 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS));
51 | job.setNumReduceTasks(numReducers);
52 |
53 | String outputPath = conf.get("thrax.outputPath", "");
54 | FileOutputFormat.setOutputPath(job, new Path(outputPath));
55 |
56 | FileOutputFormat.setCompressOutput(job, true);
57 | FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
58 |
59 | return job;
60 | }
61 |
62 | public String getName() {
63 | return "aggregate";
64 | }
65 |
66 | public static void addPrerequisite(Class<? extends ThraxJob> c) {
67 | prereqs.add(c);
68 | }
69 |
70 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
71 | prereqs.add(ParaphrasePivotingJob.class);
72 | return prereqs;
73 | }
74 |
75 | public String getOutputSuffix() {
76 | return null;
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/jobs/ParaphrasePivotingJob.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.jobs;
2 |
3 | import java.io.IOException;
4 | import java.util.HashSet;
5 | import java.util.Set;
6 |
7 | import org.apache.hadoop.conf.Configuration;
8 | import org.apache.hadoop.fs.Path;
9 | import org.apache.hadoop.mapreduce.Job;
10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
11 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
13 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
14 |
15 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
16 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
17 | import edu.jhu.thrax.hadoop.paraphrasing.PivotingMapper;
18 | import edu.jhu.thrax.hadoop.paraphrasing.PivotingReducer;
19 |
20 | public class ParaphrasePivotingJob implements ThraxJob {
21 |
22 | private static HashSet<Class<? extends ThraxJob>> prereqs =
23 | new HashSet<Class<? extends ThraxJob>>();
24 |
25 | public static void addPrerequisite(Class<? extends ThraxJob> c) {
26 | prereqs.add(c);
27 | }
28 |
29 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
30 | prereqs.add(FeatureCollectionJob.class);
31 | return prereqs;
32 | }
33 |
34 | public Job getJob(Configuration conf) throws IOException {
35 | Job job = new Job(conf, "pivoting");
36 |
37 | job.setJarByClass(PivotingReducer.class);
38 |
39 | job.setMapperClass(PivotingMapper.class);
40 | job.setReducerClass(PivotingReducer.class);
41 |
42 | job.setInputFormatClass(SequenceFileInputFormat.class);
43 | job.setMapOutputKeyClass(RuleWritable.class);
44 | job.setMapOutputValueClass(FeatureMap.class);
45 | job.setOutputKeyClass(RuleWritable.class);
46 | job.setOutputValueClass(FeatureMap.class);
47 | job.setOutputFormatClass(SequenceFileOutputFormat.class);
48 |
49 | job.setPartitionerClass(RuleWritable.SourcePartitioner.class);
50 |
51 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "collected"));
52 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0);
53 | if (maxSplitSize != 0) FileInputFormat.setMaxInputSplitSize(job, maxSplitSize * 20);
54 |
55 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS));
56 | job.setNumReduceTasks(numReducers);
57 |
58 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "pivoted"));
59 |
60 | return job;
61 | }
62 |
63 | public String getName() {
64 | return "pivoting";
65 | }
66 |
67 | public String getOutputSuffix() {
68 | return "pivoted";
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/jobs/SchedulerException.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.jobs;
2 |
3 | public class SchedulerException extends Exception
4 | {
5 | private static final long serialVersionUID = 9090L;
6 |
7 | public SchedulerException(String s)
8 | {
9 | super(s);
10 | }
11 | }
12 |
13 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/jobs/SourceWordGivenTargetWordProbabilityJob.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.jobs;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.fs.Path;
7 | import org.apache.hadoop.mapreduce.Job;
8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
9 |
10 | public class SourceWordGivenTargetWordProbabilityJob extends WordLexprobJob {
11 |
12 | public SourceWordGivenTargetWordProbabilityJob() {
13 | super(true);
14 | }
15 |
16 | public Job getJob(Configuration conf) throws IOException {
17 | Job job = super.getJob(conf);
18 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "lexprobs_sgt"));
19 | return job;
20 | }
21 |
22 | public String getName() {
23 | return "source-word-lexprob";
24 | }
25 |
26 | public String getOutputSuffix() {
27 | return "lexprobs_sgt";
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/jobs/TargetWordGivenSourceWordProbabilityJob.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.jobs;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.fs.Path;
7 | import org.apache.hadoop.mapreduce.Job;
8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
9 |
10 | public class TargetWordGivenSourceWordProbabilityJob extends WordLexprobJob {
11 |
12 | public TargetWordGivenSourceWordProbabilityJob() {
13 | super(false);
14 | }
15 |
16 | public Job getJob(Configuration conf) throws IOException {
17 | Job job = super.getJob(conf);
18 | FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "lexprobs_tgs"));
19 | return job;
20 | }
21 |
22 | @Override
23 | public String getName() {
24 | return "target-word-lexprob";
25 | }
26 |
27 | @Override
28 | public String getOutputSuffix() {
29 | return "lexprobs_tgs";
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/jobs/ThraxJob.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.jobs;
2 |
3 | import java.io.IOException;
4 | import java.util.Set;
5 |
6 | import org.apache.hadoop.conf.Configuration;
7 | import org.apache.hadoop.mapreduce.Job;
8 |
9 | public interface ThraxJob {
10 |
11 | public Job getJob(Configuration conf) throws IOException;
12 |
13 | public Set<Class<? extends ThraxJob>> getPrerequisites();
14 |
15 | public String getName();
16 |
17 | public String getOutputSuffix();
18 | }
19 |
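Every pipeline step implements this interface: getJob builds the configured Hadoop Job, getPrerequisites lists the ThraxJob classes that must complete first (presumably consumed by the job scheduler), getName labels the job, and getOutputSuffix names its output directory under thrax.work-dir (null when the job writes elsewhere). A minimal hedged implementation sketch, not an actual job from the repository (the "example" name and output suffix are invented):

package edu.jhu.thrax.hadoop.jobs;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical example job, following the pattern of the jobs in this package.
public class ExampleJob implements ThraxJob {

  public Job getJob(Configuration conf) throws IOException {
    Job job = new Job(conf, getName());
    job.setJarByClass(ExampleJob.class);
    FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.input-file")));
    FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + getOutputSuffix()));
    return job;
  }

  public Set<Class<? extends ThraxJob>> getPrerequisites() {
    Set<Class<? extends ThraxJob>> prereqs = new HashSet<Class<? extends ThraxJob>>();
    prereqs.add(VocabularyJob.class); // run only after the vocabulary has been built
    return prereqs;
  }

  public String getName() {
    return "example";
  }

  public String getOutputSuffix() {
    return "example";
  }
}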
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/jobs/WordLexprobJob.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.jobs;
2 |
3 | import java.io.IOException;
4 | import java.util.HashSet;
5 | import java.util.Set;
6 |
7 | import org.apache.hadoop.conf.Configuration;
8 | import org.apache.hadoop.fs.Path;
9 | import org.apache.hadoop.io.FloatWritable;
10 | import org.apache.hadoop.io.IntWritable;
11 | import org.apache.hadoop.io.LongWritable;
12 | import org.apache.hadoop.mapreduce.Job;
13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
14 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
15 | import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
16 |
17 | import edu.jhu.thrax.hadoop.features.WordLexicalProbabilityCalculator;
18 |
19 | public abstract class WordLexprobJob implements ThraxJob {
20 | public static final String SOURCE_GIVEN_TARGET = "thrax.__wordlexprob_sgt";
21 | private boolean isSourceGivenTarget;
22 |
23 | public WordLexprobJob(boolean isSrcGivenTgt) {
24 | isSourceGivenTarget = isSrcGivenTgt;
25 | }
26 |
27 | public Set<Class<? extends ThraxJob>> getPrerequisites() {
28 | Set<Class<? extends ThraxJob>> result = new HashSet<Class<? extends ThraxJob>>();
29 | result.add(VocabularyJob.class);
30 | return result;
31 | }
32 |
33 | public Job getJob(Configuration conf) throws IOException {
34 | Configuration theConf = new Configuration(conf);
35 | theConf.setBoolean(SOURCE_GIVEN_TARGET, isSourceGivenTarget);
36 | Job job = new Job(theConf, getName());
37 | job.setJarByClass(WordLexicalProbabilityCalculator.class);
38 | job.setMapperClass(WordLexicalProbabilityCalculator.Map.class);
39 | job.setCombinerClass(IntSumReducer.class);
40 |
41 | job.setPartitionerClass(WordLexicalProbabilityCalculator.Partition.class);
42 | job.setReducerClass(WordLexicalProbabilityCalculator.Reduce.class);
43 |
44 | job.setMapOutputKeyClass(LongWritable.class);
45 | job.setMapOutputValueClass(IntWritable.class);
46 |
47 | int numReducers = conf.getInt("thrax.reducers", conf.getInt("mapreduce.job.reduces", DefaultValues.DEFAULT_NUM_REDUCERS));
48 | job.setNumReduceTasks(numReducers);
49 |
50 | job.setOutputKeyClass(LongWritable.class);
51 | job.setOutputValueClass(FloatWritable.class);
52 |
53 | job.setOutputFormatClass(SequenceFileOutputFormat.class);
54 |
55 | FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.input-file")));
56 | int maxSplitSize = conf.getInt("thrax.max-split-size", 0);
57 | if (maxSplitSize != 0) {
58 | FileInputFormat.setMaxInputSplitSize(job, maxSplitSize);
59 | }
60 | return job;
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/output/OutputReducer.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.output;
2 |
3 | import java.io.IOException;
4 | import java.util.List;
5 | import java.util.Map;
6 | import java.util.TreeMap;
7 |
8 | import org.apache.hadoop.conf.Configuration;
9 | import org.apache.hadoop.io.NullWritable;
10 | import org.apache.hadoop.io.Text;
11 | import org.apache.hadoop.io.Writable;
12 | import org.apache.hadoop.mapreduce.Reducer;
13 |
14 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair;
15 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
16 | import edu.jhu.thrax.hadoop.features.SimpleFeature;
17 | import edu.jhu.thrax.hadoop.features.SimpleFeatureFactory;
18 | import edu.jhu.thrax.util.BackwardsCompatibility;
19 | import edu.jhu.thrax.util.FormatUtils;
20 | import edu.jhu.thrax.util.Vocabulary;
21 |
22 | public class OutputReducer extends Reducer<RuleWritable, FeaturePair, Text, NullWritable> {
23 |
24 | private boolean label;
25 | private boolean sparse;
26 |
27 | private List<SimpleFeature> simpleFeatures;
28 |
29 | protected void setup(Context context) throws IOException, InterruptedException {
30 | Configuration conf = context.getConfiguration();
31 | String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*";
32 | Vocabulary.initialize(conf, vocabulary_path);
33 |
34 | label = conf.getBoolean("thrax.label-feature-scores", true);
35 | sparse = conf.getBoolean("thrax.sparse-feature-vectors", false);
36 |
37 | String features = BackwardsCompatibility.equivalent(conf.get("thrax.features", ""));
38 | simpleFeatures = SimpleFeatureFactory.getAll(features);
39 | }
40 |
41 | protected void reduce(RuleWritable key, Iterable<FeaturePair> values, Context context)
42 | throws IOException, InterruptedException {
43 | Map<String, Float> features = new TreeMap<String, Float>();
44 | for (FeaturePair fp : values)
45 | features.put(Vocabulary.word(fp.key), fp.val.get());
46 | for (SimpleFeature feature : simpleFeatures)
47 | features.put(feature.getName(), feature.score(key));
48 | context.write(FormatUtils.ruleToText(key, features, label, sparse), NullWritable.get());
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/paraphrasing/AggregationCombiner.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.paraphrasing;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.List;
6 |
7 | import org.apache.hadoop.conf.Configuration;
8 | import org.apache.hadoop.mapreduce.Reducer;
9 |
10 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
11 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
12 | import edu.jhu.thrax.hadoop.features.annotation.AnnotationFeature;
13 | import edu.jhu.thrax.hadoop.features.annotation.AnnotationFeatureFactory;
14 | import edu.jhu.thrax.hadoop.features.pivot.PivotedAnnotationFeature;
15 | import edu.jhu.thrax.hadoop.features.pivot.PivotedFeature;
16 | import edu.jhu.thrax.hadoop.features.pivot.PivotedFeatureFactory;
17 | import edu.jhu.thrax.util.BackwardsCompatibility;
18 | import edu.jhu.thrax.util.FormatUtils;
19 | import edu.jhu.thrax.util.Vocabulary;
20 |
21 | public class AggregationCombiner
22 | extends Reducer<RuleWritable, FeatureMap, RuleWritable, FeatureMap> {
23 |
24 | private List<PivotedFeature> pivotedFeatures;
25 |
26 | protected void setup(Context context) throws IOException, InterruptedException {
27 | Configuration conf = context.getConfiguration();
28 | String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*";
29 | Vocabulary.initialize(conf, vocabulary_path);
30 |
31 | pivotedFeatures = new ArrayList<PivotedFeature>();
32 | List<AnnotationFeature> annotationFeatures = new ArrayList<AnnotationFeature>();
33 |
34 | String features = BackwardsCompatibility.equivalent(conf.get("thrax.features", ""));
35 | for (String f_name : FormatUtils.P_COMMA_OR_SPACE.split(features)) {
36 | PivotedFeature pf = PivotedFeatureFactory.get(f_name);
37 | if (pf != null) {
38 | pivotedFeatures.add(pf);
39 | } else {
40 | AnnotationFeature af = AnnotationFeatureFactory.get(f_name);
41 | if (af != null) {
42 | annotationFeatures.add(af);
43 | }
44 | }
45 | }
46 | if (!annotationFeatures.isEmpty()) pivotedFeatures.add(new PivotedAnnotationFeature());
47 | }
48 |
49 | protected void reduce(RuleWritable key, Iterable<FeatureMap> values, Context context)
50 | throws IOException, InterruptedException {
51 | FeatureMap merged = new FeatureMap();
52 |
53 | for (PivotedFeature feature : pivotedFeatures)
54 | feature.initializeAggregation();
55 | for (FeatureMap feature_map : values) {
56 | for (PivotedFeature feature : pivotedFeatures) {
57 | try {
58 | feature.aggregate(feature_map);
59 | } catch (Exception e) {
60 | throw new RuntimeException(key.toString() + " on " + feature.getName() + ": "
61 | + e.getMessage());
62 | }
63 | }
64 | }
65 | for (PivotedFeature feature : pivotedFeatures)
66 | merged.put(feature.getName(), feature.finalizeAggregation());
67 | context.write(key, merged);
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/paraphrasing/AggregationMapper.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.paraphrasing;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.mapreduce.Mapper;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
10 | import edu.jhu.thrax.util.Vocabulary;
11 |
12 | public class AggregationMapper extends Mapper<RuleWritable, FeatureMap, RuleWritable, FeatureMap> {
13 |
14 | protected void setup(Context context) throws IOException, InterruptedException {
15 | Configuration conf = context.getConfiguration();
16 | Vocabulary.initialize(conf);
17 | }
18 |
19 | protected void map(RuleWritable key, FeatureMap value, Context context) throws IOException,
20 | InterruptedException {
21 | context.write(key, value);
22 | context.progress();
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/paraphrasing/FeatureCollectionReducer.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.paraphrasing;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.mapreduce.Reducer;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.hadoop.datatypes.FeaturePair;
10 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
11 | import edu.jhu.thrax.util.Vocabulary;
12 |
13 | public class FeatureCollectionReducer
14 | extends Reducer<RuleWritable, FeaturePair, RuleWritable, FeatureMap> {
15 |
16 | protected void setup(Context context) throws IOException, InterruptedException {
17 | Configuration conf = context.getConfiguration();
18 | String vocabulary_path = conf.getRaw("thrax.work-dir") + "vocabulary/part-*";
19 | Vocabulary.initialize(conf, vocabulary_path);
20 | }
21 |
22 | protected void reduce(RuleWritable key, Iterable<FeaturePair> values, Context context)
23 | throws IOException, InterruptedException {
24 | FeatureMap features = new FeatureMap();
25 | for (FeaturePair fp : values)
26 | features.put(fp.key, fp.val.get());
27 | context.write(key, features);
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/paraphrasing/PivotingMapper.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.paraphrasing;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.mapreduce.Mapper;
7 |
8 | import edu.jhu.thrax.hadoop.datatypes.FeatureMap;
9 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
10 | import edu.jhu.thrax.util.Vocabulary;
11 |
12 | public class PivotingMapper extends Mapper<RuleWritable, FeatureMap, RuleWritable, FeatureMap> {
13 |
14 | protected void setup(Context context) throws IOException, InterruptedException {
15 | Configuration conf = context.getConfiguration();
16 | Vocabulary.initialize(conf);
17 | }
18 |
19 | protected void map(RuleWritable key, FeatureMap value, Context context) throws IOException,
20 | InterruptedException {
21 | context.write(key, value);
22 | context.progress();
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/tools/ExtractionTool.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.tools;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.conf.Configured;
7 | import org.apache.hadoop.fs.Path;
8 | import org.apache.hadoop.io.IntWritable;
9 | import org.apache.hadoop.mapreduce.Job;
10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
12 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
13 | import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
14 | import org.apache.hadoop.util.Tool;
15 | import org.apache.hadoop.util.ToolRunner;
16 |
17 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
18 | import edu.jhu.thrax.hadoop.extraction.ExtractionMapper;
19 | import edu.jhu.thrax.util.ConfFileParser;
20 |
21 | public class ExtractionTool extends Configured implements Tool
22 | {
23 | public int run(String [] argv) throws Exception
24 | {
25 | if (argv.length < 1) {
26 | System.err.println("USAGE: ExtractionTool ");
27 | return 1;
28 | }
29 | String thraxConf = argv[0];
30 | Configuration conf = getConf();
31 |
32 | Map<String, String> options = ConfFileParser.parse(thraxConf);
33 | for (String opt : options.keySet()) {
34 | conf.set("thrax." + opt, options.get(opt));
35 | }
36 | String inputPath = conf.get("thrax.input-file");
37 | if (inputPath == null) {
38 | System.err.println("Set input-file key in conf file " + thraxConf + "!");
39 | return 1;
40 | }
41 | String workDir = conf.get("thrax.work-dir");
42 | if (workDir == null) {
43 | System.err.println("Set work-dir key in conf file " + thraxConf + "!");
44 | return 1;
45 | }
46 |
47 | Job job = new Job(conf, "thrax");
48 | job.setJarByClass(ExtractionMapper.class);
49 | job.setMapperClass(ExtractionMapper.class);
50 | job.setCombinerClass(IntSumReducer.class);
51 | job.setReducerClass(IntSumReducer.class);
52 |
53 | job.setMapOutputKeyClass(RuleWritable.class);
54 | job.setMapOutputValueClass(IntWritable.class);
55 |
56 | job.setOutputKeyClass(RuleWritable.class);
57 | job.setOutputValueClass(IntWritable.class);
58 |
59 | job.setOutputFormatClass(SequenceFileOutputFormat.class);
60 |
61 | FileInputFormat.setInputPaths(job, new Path(inputPath));
62 | if (!workDir.endsWith(Path.SEPARATOR))
63 | workDir += Path.SEPARATOR;
64 | FileOutputFormat.setOutputPath(job, new Path(workDir + "rules"));
65 |
66 | job.submit();
67 | return 0;
68 | }
69 |
70 | public static void main(String [] argv) throws Exception
71 | {
72 | int exit_code = ToolRunner.run(null, new ExtractionTool(), argv);
73 | System.exit(exit_code);
74 | }
75 | }
76 |
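ExtractionTool expects its single argument to be a Thrax configuration file; every option in that file is copied onto the Hadoop Configuration under a "thrax."-prefixed key, and the input-file and work-dir keys must both be present before the extraction job is submitted. The sketch below shows one plausible reading of that flow; the simple "key value" line format and the '#' comment handling are assumptions about ConfFileParser, whose source is not shown here.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

// Minimal sketch of conf-file handling as ExtractionTool uses it. The
// "key value" line format and '#' comments are assumptions; the real
// parser lives in edu.jhu.thrax.util.ConfFileParser.
public class ConfSketch {
  public static Map<String, String> parse(String path) throws IOException {
    Map<String, String> options = new HashMap<>();
    for (String line : Files.readAllLines(Paths.get(path))) {
      line = line.trim();
      if (line.isEmpty() || line.startsWith("#")) continue;
      String[] parts = line.split("\\s+", 2);
      if (parts.length == 2) options.put(parts[0], parts[1]);
    }
    return options;
  }

  public static void main(String[] args) throws IOException {
    Map<String, String> options = parse(args[0]);
    // ExtractionTool refuses to run without these two keys.
    System.out.println("thrax.input-file = " + options.get("input-file"));
    System.out.println("thrax.work-dir   = " + options.get("work-dir"));
  }
}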
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/tools/FeatureTool.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.tools;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.conf.Configured;
7 | import org.apache.hadoop.fs.Path;
8 | import org.apache.hadoop.io.IntWritable;
9 | import org.apache.hadoop.mapreduce.Job;
10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
11 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
13 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
14 | import org.apache.hadoop.util.Tool;
15 | import org.apache.hadoop.util.ToolRunner;
16 |
17 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
18 | import edu.jhu.thrax.hadoop.features.mapred.MapReduceFeature;
19 | import edu.jhu.thrax.hadoop.features.mapred.MapReduceFeatureFactory;
20 | import edu.jhu.thrax.util.ConfFileParser;
21 |
22 | public class FeatureTool extends Configured implements Tool
23 | {
24 | public int run(String [] argv) throws Exception
25 | {
26 | if (argv.length < 2) {
27 | System.err.println("usage: FeatureTool ");
28 | return 1;
29 | }
30 | String confFile = argv[0];
31 | String featureName = argv[1];
32 | MapReduceFeature f = MapReduceFeatureFactory.get(featureName);
33 | if (!(f instanceof MapReduceFeature)) {
34 | System.err.println("Not a MapReduceFeature: " + featureName);
35 | return 1;
36 | }
37 | Configuration conf = getConf();
38 | Map<String, String> options = ConfFileParser.parse(confFile);
39 | for (String opt : options.keySet()) {
40 | conf.set("thrax." + opt, options.get(opt));
41 | }
42 | String workDir = conf.get("thrax.work-dir");
43 | if (workDir == null) {
44 | System.err.println("set work-dir key in conf file " + confFile + "!");
45 | return 1;
46 | }
47 | if (!workDir.endsWith(Path.SEPARATOR)) {
48 | workDir += Path.SEPARATOR;
49 | conf.set("thrax.work-dir", workDir);
50 | }
51 | Job job = new Job(conf, String.format("thrax-%s", featureName));
52 |
53 | job.setJarByClass(f.getClass());
54 | job.setMapperClass(f.mapperClass());
55 | job.setCombinerClass(f.combinerClass());
56 | job.setSortComparatorClass(f.sortComparatorClass());
57 | job.setPartitionerClass(f.partitionerClass());
58 | job.setReducerClass(f.reducerClass());
59 |
60 | job.setInputFormatClass(SequenceFileInputFormat.class);
61 |
62 | job.setMapOutputKeyClass(RuleWritable.class);
63 | job.setMapOutputValueClass(IntWritable.class);
64 |
65 | job.setOutputKeyClass(RuleWritable.class);
66 | job.setOutputValueClass(IntWritable.class);
67 |
68 | job.setOutputFormatClass(SequenceFileOutputFormat.class);
69 |
70 | FileInputFormat.setInputPaths(job, new Path(workDir + "rules"));
71 | FileOutputFormat.setOutputPath(job, new Path(workDir + featureName));
72 |
73 | job.submit();
74 | return 0;
75 | }
76 |
77 | public static void main(String [] argv) throws Exception
78 | {
79 | int exit_code = ToolRunner.run(null, new FeatureTool(), argv);
80 | System.exit(exit_code);
81 | }
82 | }
83 |
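FeatureTool only calls five accessors on the MapReduceFeature it looks up, so the contract it depends on when wiring a per-feature job can be summarized as below. This is an inferred shape based solely on the calls in run() above, not the actual declaration of edu.jhu.thrax.hadoop.features.mapred.MapReduceFeature.

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;

// Inferred contract: exactly the methods FeatureTool.run() invokes when
// configuring one per-feature job. Raw types keep the sketch short.
@SuppressWarnings("rawtypes")
interface PerFeatureJobComponents {
  Class<? extends Mapper> mapperClass();
  Class<? extends Reducer> combinerClass();
  Class<? extends RawComparator> sortComparatorClass();
  Class<? extends Partitioner> partitionerClass();
  Class<? extends Reducer> reducerClass();
}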
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/tools/OutputTool.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.tools;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.conf.Configured;
7 | import org.apache.hadoop.fs.Path;
8 | import org.apache.hadoop.io.NullWritable;
9 | import org.apache.hadoop.mapreduce.Job;
10 | import org.apache.hadoop.mapreduce.Mapper;
11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
12 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
14 | import org.apache.hadoop.util.Tool;
15 | import org.apache.hadoop.util.ToolRunner;
16 |
17 | import edu.jhu.thrax.hadoop.datatypes.RuleWritable;
18 | import edu.jhu.thrax.hadoop.features.mapred.MapReduceFeature;
19 | import edu.jhu.thrax.hadoop.features.mapred.MapReduceFeatureFactory;
20 | import edu.jhu.thrax.hadoop.output.OutputReducer;
21 | import edu.jhu.thrax.util.BackwardsCompatibility;
22 | import edu.jhu.thrax.util.ConfFileParser;
23 | import edu.jhu.thrax.util.FormatUtils;
24 |
25 | public class OutputTool extends Configured implements Tool
26 | {
27 | public int run(String [] argv) throws Exception
28 | {
29 | if (argv.length < 1) {
30 | System.err.println("usage: OutputTool ");
31 | return 1;
32 | }
33 | String confFile = argv[0];
34 | Map<String, String> options = ConfFileParser.parse(confFile);
35 | Configuration conf = getConf();
36 | for (String opt : options.keySet()) {
37 | conf.set("thrax." + opt, options.get(opt));
38 | }
39 | String workDir = conf.get("thrax.work-dir");
40 | if (workDir == null) {
41 | System.err.println("Set work-dir key in conf file " + confFile + "!");
42 | return 1;
43 | }
44 | if (!workDir.endsWith(Path.SEPARATOR)) {
45 | workDir += Path.SEPARATOR;
46 | conf.set("thrax.work-dir", workDir);
47 | }
48 | Job job = new Job(conf, "thrax-collect");
49 | job.setJarByClass(OutputReducer.class);
50 |
51 | job.setMapperClass(Mapper.class);
52 | job.setReducerClass(OutputReducer.class);
53 |
54 | job.setInputFormatClass(SequenceFileInputFormat.class);
55 |
56 | job.setMapOutputKeyClass(RuleWritable.class);
57 | job.setMapOutputValueClass(NullWritable.class);
58 |
59 | job.setOutputKeyClass(RuleWritable.class);
60 | job.setOutputValueClass(NullWritable.class);
61 |
62 | String features = BackwardsCompatibility.equivalent(conf.get("thrax.features", ""));
63 | for (String feature : FormatUtils.P_SPACE.split(features)) {
64 | if (MapReduceFeatureFactory.get(feature) instanceof MapReduceFeature) {
65 | FileInputFormat.addInputPath(job, new Path(workDir + feature));
66 | }
67 | }
68 | if (FileInputFormat.getInputPaths(job).length == 0)
69 | FileInputFormat.addInputPath(job, new Path(workDir + "rules"));
70 | FileOutputFormat.setOutputPath(job, new Path(workDir + "final"));
71 |
72 | job.submit();
73 | return 0;
74 | }
75 |
76 | public static void main(String [] argv) throws Exception
77 | {
78 | int exit_code = ToolRunner.run(null, new OutputTool(), argv);
79 | System.exit(exit_code);
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/src/edu/jhu/thrax/hadoop/tools/SourceWordGivenTargetWordProbabilityTool.java:
--------------------------------------------------------------------------------
1 | package edu.jhu.thrax.hadoop.tools;
2 |
3 | import java.util.Map;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.conf.Configured;
7 | import org.apache.hadoop.fs.Path;
8 | import org.apache.hadoop.io.FloatWritable;
9 | import org.apache.hadoop.io.IntWritable;
10 | import org.apache.hadoop.mapreduce.Job;
11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
13 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
14 | import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
15 | import org.apache.hadoop.util.Tool;
16 | import org.apache.hadoop.util.ToolRunner;
17 |
18 | import edu.jhu.thrax.hadoop.datatypes.TextPair;
19 | import edu.jhu.thrax.hadoop.features.WordLexicalProbabilityCalculator;
20 | import edu.jhu.thrax.hadoop.jobs.WordLexprobJob;
21 | import edu.jhu.thrax.util.ConfFileParser;
22 |
23 | public class SourceWordGivenTargetWordProbabilityTool extends Configured implements Tool
24 | {
25 | public int run(String [] argv) throws Exception
26 | {
27 | if (argv.length < 1) {
28 | System.err.println("usage: SourceWordGivenTargetWordProbabilityTool