├── .gitignore ├── .gitmodules ├── .travis.yml ├── CHANGES.md ├── KEYS ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── bin ├── bleu ├── debug-joshua ├── extract-1best ├── joshua ├── joshua-decoder ├── meteor └── pipeline.pl ├── demo ├── README.md ├── apache_joshua_logo.png ├── apache_joshua_logo_faded.png ├── bootstrap │ ├── css │ │ ├── bootstrap-theme.css │ │ ├── bootstrap-theme.css.map │ │ ├── bootstrap-theme.min.css │ │ ├── bootstrap-theme.min.css.map │ │ ├── bootstrap.css │ │ ├── bootstrap.css.map │ │ ├── bootstrap.min.css │ │ └── bootstrap.min.css.map │ ├── fonts │ │ ├── glyphicons-halflings-regular.eot │ │ ├── glyphicons-halflings-regular.svg │ │ ├── glyphicons-halflings-regular.ttf │ │ ├── glyphicons-halflings-regular.woff │ │ └── glyphicons-halflings-regular.woff2 │ └── js │ │ ├── bootstrap.js │ │ ├── bootstrap.min.js │ │ ├── jquery-2.2.0.min.js │ │ └── npm.js ├── demo.config ├── demo.js ├── favicon.ico ├── index.html └── scripts │ ├── joshua.sh │ └── web_server.py ├── distribution ├── docker │ ├── README.md │ ├── dev │ │ └── Dockerfile │ └── kenlm │ │ ├── Dockerfile │ │ └── README.md ├── joshua-full │ ├── README.md │ ├── actions.yaml │ ├── actions │ │ ├── add-language-pack │ │ └── remove-language-pack │ ├── config.yaml │ ├── copyright │ ├── icon.svg │ ├── layer.yaml │ ├── metadata.yaml │ ├── reactive │ │ └── joshua_runtime.py │ └── tests │ │ ├── 00-setup │ │ └── 10-deploy └── joshua-runtime │ ├── README.md │ ├── actions.yaml │ ├── actions │ ├── add-language-pack │ └── remove-language-pack │ ├── config.yaml │ ├── copyright │ ├── icon.svg │ ├── layer.yaml │ ├── metadata.yaml │ ├── reactive │ └── joshua_runtime.py │ └── tests │ ├── 00-setup │ └── 10-deploy ├── doap_joshua.rdf ├── doc ├── Doxyfile ├── Eclipse.howto ├── documentation.md ├── joshua-logo.png ├── mainpage.md ├── release.md ├── sparse_features.md ├── troubleshooting.md └── zmert_release │ ├── zmert_doc_v1.40.pdf │ └── zmert_v1.40.zip ├── download-deps.sh ├── eclipse-codeformat.xml ├── examples ├── 
README.md ├── download.sh ├── old │ ├── PRO │ │ ├── joshua.config │ │ ├── libsvm_command │ │ ├── megam_command │ │ ├── params.txt │ │ ├── pro.config │ │ ├── run.sh │ │ └── sparse_feat │ └── ZMERT │ │ ├── README_ZMERT.txt │ │ ├── ZMERT.out.verbosity1 │ │ ├── ZMERT.out.verbosity2 │ │ ├── ZMERT_config_ex2.txt │ │ ├── config_ex2.txt │ │ ├── decoder_command_ex2 │ │ ├── params.txt │ │ ├── ref.0 │ │ ├── ref.1 │ │ ├── ref.2 │ │ ├── ref.3 │ │ └── src.txt ├── training │ ├── README │ ├── ghkm.sh │ ├── hiero-europarl.sh │ ├── hiero.sh │ ├── phrase.sh │ └── samt.sh └── tree_visualizer │ ├── README │ ├── tree.en │ ├── tree.en.2 │ ├── tree.fr │ ├── tree.ref │ └── tree_visualizer.cmd ├── jni ├── build_kenlm.sh └── kenlm_wrap.cc ├── pom.xml ├── scripts ├── analysis │ ├── sentence-by-sentence.pl │ └── tree_visualizer ├── copy-config.pl ├── distributedLM │ ├── config.template │ ├── create_remote_sym_tbl.pl │ ├── filter_lm.pl │ ├── get_grammar_eng_voc.pl │ ├── get_grammar_eng_voc_from_cn_voc.pl │ ├── global_symol_list │ ├── job_start_lmserver.sh │ └── lm.list.withweights ├── ems │ ├── config.ghkm │ ├── config.hiero │ ├── config.phrase │ └── experiment.meta ├── features │ └── addSparseFeatures.py ├── filter_grammar_to_sentences.sh ├── language-pack │ ├── README.template │ ├── VERSIONS │ ├── build_lp.sh │ ├── copy_model.py │ ├── prepare.sh │ └── test_lp.sh ├── lm │ └── compile_berkeley.py ├── misc │ ├── canonical_path │ └── iso639 ├── preparation │ ├── detokenize.pl │ ├── lowercase.pl │ ├── nonbreaking_prefixes │ │ ├── nonbreaking_prefix.ca │ │ ├── nonbreaking_prefix.cs │ │ ├── nonbreaking_prefix.de │ │ ├── nonbreaking_prefix.el │ │ ├── nonbreaking_prefix.en │ │ ├── nonbreaking_prefix.es │ │ ├── nonbreaking_prefix.fr │ │ ├── nonbreaking_prefix.hu │ │ ├── nonbreaking_prefix.is │ │ ├── nonbreaking_prefix.it │ │ ├── nonbreaking_prefix.lv │ │ ├── nonbreaking_prefix.nl │ │ ├── nonbreaking_prefix.pl │ │ ├── nonbreaking_prefix.pt │ │ ├── nonbreaking_prefix.ro │ │ ├── nonbreaking_prefix.ru │ │ 
├── nonbreaking_prefix.sk │ │ ├── nonbreaking_prefix.sl │ │ └── nonbreaking_prefix.sv │ ├── normalize.pl │ ├── preprocess.sh │ └── tokenize.pl ├── samt │ ├── filterGrammar.py │ ├── lexprob2samt.py │ ├── pipeline.sh │ ├── postprocessSAMT.sh │ └── selectFeatures.py ├── support │ ├── .gitignore │ ├── bbn2plf.pl │ ├── create_glue_grammar.sh │ ├── extract-1best │ ├── filter_grammar.sh │ ├── grammar-packer.pl │ ├── make-release.sh │ ├── merge_lms.py │ ├── moses2joshua.pl │ ├── moses2joshua_grammar.pl │ ├── phrase2hiero.py │ ├── prepare.sh │ ├── query_http.py │ ├── run_bundler.py │ ├── score-hypothesis.pl │ ├── split2files │ └── write-version.sh ├── thrax │ ├── run_thrax.py │ └── strip_label.py ├── toolkit │ ├── chunki.py │ ├── extract_references.py │ ├── joini.py │ └── shorti.py └── training │ ├── TODO │ ├── add-OOVs.pl │ ├── build-vocab.pl │ ├── cachepipe │ ├── CachePipe.pm │ ├── README │ └── bashrc │ ├── class-lm │ └── replaceTokensWithClasses.py │ ├── filter-empty-lines.pl │ ├── filter-rules.pl │ ├── get_grammar_features.pl │ ├── lowercase-leaves.pl │ ├── mira │ ├── feature_label_munger.pl │ └── run-mira.pl │ ├── paralign.pl │ ├── parallelize │ ├── LocalConfig.pm │ ├── Makefile │ ├── parallelize.pl │ ├── sentclient.c │ ├── sentserver.c │ └── sentserver.h │ ├── parmbr.sh │ ├── paste │ ├── pipeline.pl │ ├── run-giza.pl │ ├── run_tuner.py │ ├── scat │ ├── summarize.pl │ ├── templates │ ├── alignment │ │ ├── jacana │ │ │ └── resources │ │ │ │ ├── freedict │ │ │ │ ├── en-fr.dict.gz │ │ │ │ ├── en-fr.fr-en.dict.gz │ │ │ │ └── fr-en.dict.gz │ │ │ │ ├── model │ │ │ │ ├── EnglishChunk.bin.gz │ │ │ │ ├── EnglishSD.bin.gz │ │ │ │ ├── EnglishTok.bin.gz │ │ │ │ ├── fr-en.model │ │ │ │ ├── tag.bin.gz │ │ │ │ └── tagdict │ │ │ │ └── wiktionary │ │ │ │ └── en-fr.csv.gz │ │ └── word-align.conf │ ├── glue-grammar │ ├── glue-grammar.itg │ ├── hadoop │ │ ├── core-site.xml │ │ ├── hdfs-site.xml │ │ ├── mapred-site.xml │ │ ├── masters │ │ └── slaves │ ├── thrax-hiero.conf │ ├── 
thrax-phrasal.conf │ ├── thrax-phrase-gt.conf │ ├── thrax-phrase.conf │ ├── thrax-samt.conf │ └── tune │ │ ├── decoder_command │ │ ├── decoder_command.qsub │ │ └── joshua.config │ ├── trim_parallel_corpus.pl │ └── unmap-html.pl └── src ├── main ├── assembly │ └── src.xml ├── java │ └── org │ │ └── apache │ │ └── joshua │ │ ├── adagrad │ │ ├── AdaGrad.java │ │ ├── AdaGradCore.java │ │ └── Optimizer.java │ │ ├── corpus │ │ ├── AbstractPhrase.java │ │ ├── BasicPhrase.java │ │ ├── ContiguousPhrase.java │ │ ├── Corpus.java │ │ ├── Phrase.java │ │ ├── Span.java │ │ ├── SymbolTable.java │ │ ├── TerminalIterator.java │ │ ├── Vocabulary.java │ │ └── syntax │ │ │ ├── ArraySyntaxTree.java │ │ │ └── SyntaxTree.java │ │ ├── decoder │ │ ├── ArgsParser.java │ │ ├── BLEU.java │ │ ├── Decoder.java │ │ ├── DecoderTask.java │ │ ├── JoshuaConfiguration.java │ │ ├── JoshuaDecoder.java │ │ ├── LanguageModelStateManager.java │ │ ├── LmPool.java │ │ ├── NbestMinRiskReranker.java │ │ ├── StructuredTranslation.java │ │ ├── StructuredTranslationFactory.java │ │ ├── Support.java │ │ ├── Translation.java │ │ ├── TranslationResponseStream.java │ │ ├── chart_parser │ │ │ ├── Cell.java │ │ │ ├── Chart.java │ │ │ ├── ComputeNodeResult.java │ │ │ ├── CubePruneState.java │ │ │ ├── DotChart.java │ │ │ ├── SourcePath.java │ │ │ ├── StateConstraint.java │ │ │ ├── SuperNode.java │ │ │ └── package-info.java │ │ ├── ff │ │ │ ├── ArityPhrasePenalty.java │ │ │ ├── FeatureFunction.java │ │ │ ├── FeatureVector.java │ │ │ ├── LabelCombinationFF.java │ │ │ ├── LabelSubstitutionFF.java │ │ │ ├── LexicalFeatures.java │ │ │ ├── OOVPenalty.java │ │ │ ├── PhraseModel.java │ │ │ ├── PhrasePenalty.java │ │ │ ├── RuleCountBin.java │ │ │ ├── RuleFF.java │ │ │ ├── RuleLength.java │ │ │ ├── RulePropertiesQuerying.java │ │ │ ├── RuleShape.java │ │ │ ├── SourceDependentFF.java │ │ │ ├── SourcePathFF.java │ │ │ ├── StatefulFF.java │ │ │ ├── StatelessFF.java │ │ │ ├── TargetBigram.java │ │ │ ├── WordPenalty.java │ │ │ ├── 
fragmentlm │ │ │ │ ├── ConcatenationIterator.java │ │ │ │ ├── FragmentLMFF.java │ │ │ │ ├── PennTreebankReader.java │ │ │ │ ├── Tree.java │ │ │ │ └── Trees.java │ │ │ ├── lm │ │ │ │ ├── AbstractLM.java │ │ │ │ ├── ArpaFile.java │ │ │ │ ├── ArpaNgram.java │ │ │ │ ├── ClassMap.java │ │ │ │ ├── DefaultNGramLanguageModel.java │ │ │ │ ├── KenLM.java │ │ │ │ ├── LanguageModelFF.java │ │ │ │ ├── NGramLanguageModel.java │ │ │ │ ├── StateMinimizingLanguageModel.java │ │ │ │ ├── berkeley_lm │ │ │ │ │ ├── LICENSE │ │ │ │ │ ├── LMGrammarBerkeley.java │ │ │ │ │ ├── README │ │ │ │ │ └── SymbolTableWrapper.java │ │ │ │ ├── bloomfilter_lm │ │ │ │ │ ├── BloomFilter.java │ │ │ │ │ ├── BloomFilterLanguageModel.java │ │ │ │ │ └── package-info.java │ │ │ │ ├── buildin_lm │ │ │ │ │ ├── TrieLM.java │ │ │ │ │ └── package-info.java │ │ │ │ └── package-info.java │ │ │ ├── package-info.java │ │ │ ├── phrase │ │ │ │ └── Distortion.java │ │ │ ├── similarity │ │ │ │ └── EdgePhraseSimilarityFF.java │ │ │ ├── state_maintenance │ │ │ │ ├── DPState.java │ │ │ │ ├── KenLMState.java │ │ │ │ └── NgramDPState.java │ │ │ └── tm │ │ │ │ ├── AbstractGrammar.java │ │ │ │ ├── BasicRuleCollection.java │ │ │ │ ├── CreateGlueGrammar.java │ │ │ │ ├── Grammar.java │ │ │ │ ├── GrammarReader.java │ │ │ │ ├── OwnerId.java │ │ │ │ ├── OwnerMap.java │ │ │ │ ├── Rule.java │ │ │ │ ├── RuleCollection.java │ │ │ │ ├── Trie.java │ │ │ │ ├── UnsortedRuleCollectionException.java │ │ │ │ ├── format │ │ │ │ ├── HieroFormatReader.java │ │ │ │ └── MosesFormatReader.java │ │ │ │ ├── hash_based │ │ │ │ ├── ExtensionIterator.java │ │ │ │ ├── MemoryBasedBatchGrammar.java │ │ │ │ ├── MemoryBasedRuleBin.java │ │ │ │ ├── MemoryBasedTrie.java │ │ │ │ └── package-info.java │ │ │ │ ├── package-info.java │ │ │ │ └── packed │ │ │ │ ├── PackedGrammar.java │ │ │ │ └── SliceAggregatingTrie.java │ │ ├── hypergraph │ │ │ ├── AlignedSourceTokens.java │ │ │ ├── AllSpansWalker.java │ │ │ ├── DefaultInsideOutside.java │ │ │ ├── 
FeatureVectorExtractor.java │ │ │ ├── ForestWalker.java │ │ │ ├── GrammarBuilderWalkerFunction.java │ │ │ ├── HGNode.java │ │ │ ├── HyperEdge.java │ │ │ ├── HyperGraph.java │ │ │ ├── HyperGraphPruning.java │ │ │ ├── KBestExtractor.java │ │ │ ├── OutputStringExtractor.java │ │ │ ├── StringToTreeConverter.java │ │ │ ├── TrivialInsideOutside.java │ │ │ ├── ViterbiExtractor.java │ │ │ ├── WalkerFunction.java │ │ │ ├── WordAlignmentExtractor.java │ │ │ ├── WordAlignmentState.java │ │ │ └── package-info.java │ │ ├── io │ │ │ ├── DeNormalize.java │ │ │ ├── JSONMessage.java │ │ │ └── TranslationRequestStream.java │ │ ├── package-info.java │ │ ├── phrase │ │ │ ├── Candidate.java │ │ │ ├── Coverage.java │ │ │ ├── Future.java │ │ │ ├── Header.java │ │ │ ├── Hypothesis.java │ │ │ ├── Note.java │ │ │ ├── PhraseChart.java │ │ │ ├── PhraseNodes.java │ │ │ ├── PhraseTable.java │ │ │ ├── Stack.java │ │ │ └── Stacks.java │ │ └── segment_file │ │ │ ├── ConstraintRule.java │ │ │ ├── ConstraintSpan.java │ │ │ ├── ParseTreeInput.java │ │ │ ├── ParsedSentence.java │ │ │ ├── Sentence.java │ │ │ ├── Token.java │ │ │ └── package-info.java │ │ ├── lattice │ │ ├── Arc.java │ │ ├── Lattice.java │ │ ├── Node.java │ │ ├── NodeIdentifierComparator.java │ │ └── package-info.java │ │ ├── metrics │ │ ├── BLEU.java │ │ ├── BLEU_SBP.java │ │ ├── CHRF.java │ │ ├── EvaluationMetric.java │ │ ├── GradeLevelBLEU.java │ │ ├── METEOR.java │ │ ├── MinimumChangeBLEU.java │ │ ├── NewMetric.java.template │ │ ├── Precis.java │ │ ├── PrecisMinusSourceBLEU.java │ │ ├── SARI.java │ │ ├── SourceBLEU.java │ │ ├── TER.java │ │ ├── TERMinusBLEU.java │ │ ├── TercomRunner.java │ │ └── ZeroOneLoss.java │ │ ├── mira │ │ ├── MIRA.java │ │ ├── MIRACore.java │ │ └── Optimizer.java │ │ ├── oracle │ │ ├── OracleExtractionHG.java │ │ ├── OracleExtractor.java │ │ ├── SplitHg.java │ │ └── package-info.java │ │ ├── pro │ │ ├── ClassifierInterface.java │ │ ├── ClassifierMegaM.java │ │ ├── ClassifierPerceptron.java │ │ ├── 
ClassifierSVM.java │ │ ├── Optimizer.java │ │ ├── PRO.java │ │ └── PROCore.java │ │ ├── server │ │ ├── ServerThread.java │ │ └── TcpServer.java │ │ ├── subsample │ │ ├── AlignedSubsampler.java │ │ ├── Alignment.java │ │ ├── BiCorpus.java │ │ ├── BiCorpusFactory.java │ │ ├── PhrasePair.java │ │ ├── PhraseReader.java │ │ ├── PhraseWriter.java │ │ ├── Subsampler.java │ │ ├── SubsamplerCLI.java │ │ └── package-info.java │ │ ├── tools │ │ ├── GrammarPacker.java │ │ ├── GrammarPackerCli.java │ │ ├── LabelPhrases.java │ │ └── TestSetFilter.java │ │ ├── ui │ │ ├── Orientation.java │ │ ├── StartupWindow.java │ │ ├── package-info.java │ │ └── tree_visualizer │ │ │ ├── DerivationTree.java │ │ │ ├── DerivationTreeEdge.java │ │ │ ├── DerivationTreeTransformer.java │ │ │ ├── DerivationViewer.java │ │ │ ├── DerivationViewerApplet.java │ │ │ ├── Node.java │ │ │ ├── browser │ │ │ ├── Browser.java │ │ │ ├── DerivationTreeFrame.java │ │ │ └── TranslationInfo.java │ │ │ └── tree │ │ │ └── Tree.java │ │ ├── util │ │ ├── Algorithms.java │ │ ├── Bits.java │ │ ├── Cache.java │ │ ├── ChartSpan.java │ │ ├── Constants.java │ │ ├── Counted.java │ │ ├── Counts.java │ │ ├── ExtractTopCand.java │ │ ├── FileUtility.java │ │ ├── FormatUtils.java │ │ ├── JoshuaEval.java │ │ ├── ListUtil.java │ │ ├── NBestListUtility.java │ │ ├── Ngram.java │ │ ├── PackedGrammarServer.java │ │ ├── Pair.java │ │ ├── Platform.java │ │ ├── Regex.java │ │ ├── StreamGobbler.java │ │ ├── UnicodeCharacterName.java │ │ ├── encoding │ │ │ ├── Analyzer.java │ │ │ ├── EightBitQuantizer.java │ │ │ ├── EncoderConfiguration.java │ │ │ ├── EncoderFactory.java │ │ │ ├── FeatureTypeAnalyzer.java │ │ │ ├── FloatEncoder.java │ │ │ ├── IntEncoder.java │ │ │ ├── PrimitiveFloatEncoder.java │ │ │ ├── PrimitiveIntEncoder.java │ │ │ └── VariableQuantizer.java │ │ ├── io │ │ │ ├── BinaryIn.java │ │ │ ├── BinaryOut.java │ │ │ ├── ExistingUTF8EncodedTextFile.java │ │ │ ├── IndexedReader.java │ │ │ ├── LineReader.java │ │ │ ├── 
ProgressInputStream.java │ │ │ ├── Reader.java │ │ │ └── package-info.java │ │ ├── package-info.java │ │ └── quantization │ │ │ ├── BooleanQuantizer.java │ │ │ ├── Quantizer.java │ │ │ ├── QuantizerConfiguration.java │ │ │ ├── QuantizerFactory.java │ │ │ ├── StatelessQuantizer.java │ │ │ └── package-info.java │ │ └── zmert │ │ ├── IntermediateOptimizer.java │ │ ├── MertCore.java │ │ ├── ZMERT.java │ │ └── package-info.java └── resources │ └── log4j.properties ├── overview.html └── test ├── java └── org │ └── apache │ └── joshua │ ├── corpus │ ├── SpanTest.java │ └── VocabularyTest.java │ ├── decoder │ ├── TestTranslation.java │ ├── cky │ │ ├── BnEnDecodingTest.java │ │ ├── ConstrainedTest.java │ │ ├── DenormalizationTest.java │ │ ├── DoNotCrashTest.java │ │ ├── LeftStateTest.java │ │ ├── LowercaseTest.java │ │ ├── NAryTest.java │ │ ├── NoGrammarTest.java │ │ ├── NumTranslationOptionsTest.java │ │ ├── OOVListTest.java │ │ ├── RescoringTest.java │ │ ├── SourceAnnotationsTest.java │ │ ├── SparseFeatureTest.java │ │ ├── TargetBigram.java │ │ ├── TestUtil.java │ │ ├── TooLongTest.java │ │ ├── TreeOutputTest.java │ │ └── UniqueHypothesesTest.java │ ├── ff │ │ ├── lm │ │ │ ├── ArpaFileTest.java │ │ │ ├── LanguageModelFFTest.java │ │ │ ├── berkeley_lm │ │ │ │ ├── LMBerkeleySentenceProbablityTest.java │ │ │ │ └── LMGrammarBerkeleyTest.java │ │ │ └── class_lm │ │ │ │ ├── ClassBasedLanguageModelTest.java │ │ │ │ └── ClassMapTest.java │ │ └── tm │ │ │ └── OwnerMapTest.java │ ├── io │ │ ├── DeNormalizeTest.java │ │ └── TranslationRequestStreamTest.java │ ├── kbest_extraction │ │ └── KBestExtractionTest.java │ ├── phrase │ │ ├── CoverageTest.java │ │ ├── constrained │ │ │ └── ConstrainedPhraseDecodingTest.java │ │ └── decode │ │ │ └── PhraseDecodingTest.java │ └── segment_file │ │ ├── AlmostTooLongSentenceTest.java │ │ └── SentenceTest.java │ ├── lattice │ ├── ArcTest.java │ ├── LatticeTest.java │ └── NodeTest.java │ ├── packed │ ├── Benchmark.java │ ├── CountRules.java │ ├── 
PrintRules.java │ ├── README │ ├── packer.config │ └── small_grammar │ ├── system │ ├── AlignmentMapTest.java │ ├── KenLmTest.java │ ├── LmOovFeatureTest.java │ ├── MultithreadedTranslationTests.java │ ├── StructuredOutputTest.java │ └── StructuredTranslationTest.java │ ├── ui │ └── tree_visualizer │ │ └── tree │ │ └── TreeTest.java │ ├── util │ ├── BitsTest.java │ ├── CacheTest.java │ ├── CountsTest.java │ ├── FormatUtilsTest.java │ └── io │ │ ├── BinaryTest.java │ │ └── KenLmTestUtil.java │ └── zmert │ └── BLEUTest.java └── resources ├── berkeley_lm └── lm ├── bn-en ├── hiero │ ├── .gitignore │ ├── class.map │ ├── class_lm_9gram.gz │ ├── glue-grammar │ ├── grammar.gz │ ├── input.bn │ ├── joshua-berkeleylm.config │ ├── joshua-classlm.config │ ├── joshua.config │ ├── lm.gz │ ├── output-berkeleylm.gold │ ├── output-classlm.gold │ └── output.gold ├── packed │ ├── .gitignore │ ├── grammar.glue │ ├── grammar.gz │ ├── grammar.packed │ │ ├── encoding │ │ ├── slice_00000.features │ │ ├── slice_00000.source │ │ ├── slice_00000.target │ │ ├── slice_00000.target.lookup │ │ └── vocabulary │ ├── input.bn │ ├── joshua.config │ ├── lm.gz │ └── output.gold └── samt │ ├── grammar.glue │ ├── grammar.gz │ ├── input.bn │ ├── joshua.config │ ├── lm.gz │ └── output.gold ├── data └── tiny.en ├── decoder ├── constrained │ ├── .gitignore │ ├── glue-grammar │ ├── grammar.gz │ ├── input.bn │ ├── joshua.config │ ├── lm.gz │ └── output.gold ├── dont-crash │ └── input ├── left-state │ ├── glue-grammar │ ├── grammar.gz │ ├── input.bn │ ├── joshua.config │ ├── lm.gz │ └── output.gold ├── lowercaser │ ├── config │ ├── grammar.glue │ ├── grammar.test │ └── joshua.config ├── moses-compat │ ├── NEEDS_UPDATING │ ├── n-best.txt │ ├── output.expected │ └── test.sh ├── n-ary │ ├── glue-grammar │ ├── grammar │ ├── input.txt │ ├── joshua.config │ ├── lm.gz │ ├── output.gold │ └── weights ├── num_translation_options │ ├── README │ ├── glue-grammar │ ├── grammar.gz │ ├── grammar.packed │ │ ├── encoding │ │ 
├── slice_00000.features │ │ ├── slice_00000.source │ │ ├── slice_00000.target │ │ ├── slice_00000.target.lookup │ │ └── vocabulary │ ├── input │ ├── joshua-packed.config │ ├── joshua.config │ ├── lm.gz │ ├── output-no-dot-chart.gold │ ├── output-packed.gold │ └── output.gold ├── oov-list │ ├── glue-grammar │ ├── grammar │ ├── input.txt │ ├── joshua.config │ ├── lm.gz │ └── output.gold ├── phrase │ └── unique-hypotheses │ │ ├── joshua.config │ │ ├── lm.1.gz │ │ └── rules.1.gz ├── rescoring │ ├── glue-grammar │ ├── grammar.gz │ ├── input.txt │ ├── joshua.config │ ├── lm.gz │ └── output.gold ├── source-annotations │ ├── grammar │ ├── grammar.glue │ ├── input.txt │ ├── joshua.config │ ├── lm.kenlm │ └── output.gold ├── target-bigram │ └── vocab └── tree-output │ ├── glue-grammar │ ├── grammar.gz │ ├── input │ ├── joshua.config │ ├── lm.gz │ └── output.gold ├── grammar.glue ├── grammar └── sparse-features │ ├── grammar │ ├── grammar.glue │ ├── grammar.packed │ ├── encoding │ ├── slice_00000.features │ ├── slice_00000.source │ ├── slice_00000.target │ ├── slice_00000.target.lookup │ └── vocabulary │ ├── joshua-packed.config │ └── joshua.config ├── joshua └── README.broken ├── kbest_extraction ├── glue-grammar ├── grammar ├── joshua.config ├── lm.gz ├── output.gold └── output.scores.gold ├── kenlm └── oilers.kenlm ├── lattice-short ├── README ├── glue-grammar ├── grammar.test ├── input ├── joshua.config ├── output.expected ├── test.lm └── test.sh ├── lattice ├── .gitignore ├── README ├── glue-grammar ├── grammar.test ├── joshua.config ├── output.expected ├── test-lattice.pdf ├── test.lm ├── test.plf └── test.sh ├── lm ├── berkeley │ ├── lm │ ├── lm.berkeleylm │ ├── lm.berkeleylm.gz │ └── lm.gz └── class_lm │ ├── class.map │ └── class_lm_9gram.gz ├── lm_oov └── joshua.config ├── packed-grammar ├── .gitignore ├── README ├── grammar.gz ├── input.bn ├── joshua.config ├── lm.gz ├── output.gold ├── reference.en.0 ├── reference.en.1 ├── reference.en.2 ├── reference.en.3 ├── 
test-multiple.sh └── test.sh ├── parser ├── grammar ├── grammar.glue ├── input ├── output.gold ├── parse.config ├── test.sh └── weights ├── phrase_decoder ├── config ├── config.packed ├── constrained.config ├── constrained.output.gold ├── lm.1.gz ├── output.gold ├── rules.1.gz └── rules.packed │ ├── config │ ├── encoding │ ├── slice_00000.features │ ├── slice_00000.source │ ├── slice_00000.target │ ├── slice_00000.target.lookup │ └── vocabulary ├── pipeline ├── .gitignore ├── Makefile ├── final-bleu.gold ├── input │ ├── devtest.en.0 │ ├── devtest.en.1 │ ├── devtest.en.2 │ ├── devtest.en.3 │ ├── devtest.ur │ ├── train.en │ ├── train.ur │ ├── tune.en.0 │ ├── tune.en.1 │ ├── tune.en.2 │ ├── tune.en.3 │ └── tune.ur ├── test-ghkm.sh └── test.sh ├── prune-equivalent-translations.py ├── run-all-tests.sh ├── scripts ├── .gitignore ├── merge_lms_test.py ├── normalization │ ├── .gitignore │ ├── data │ │ ├── train.en │ │ └── train.en.norm │ └── test.sh ├── run_bundler_test.py └── support │ └── moses_grammar │ ├── input │ ├── output.expected │ └── test.sh ├── server ├── http │ ├── expected │ └── test.sh └── tcp-text │ ├── expected │ └── test.sh ├── testng.xml ├── thrax ├── .gitignore ├── extraction │ ├── input │ │ ├── thrax.conf │ │ ├── train.a │ │ ├── train.en │ │ └── train.ps │ └── test.sh └── filtering │ ├── dev.hi-en.hi.1 │ ├── exact.gold │ ├── exact.log.gold │ ├── fast.gold │ ├── fast.log.gold │ ├── grammar.de │ ├── grammar.filtered.gz │ ├── input.de │ ├── loose.log.gold │ ├── test-exact.sh │ ├── test-fast.sh │ └── test-loose.sh ├── wa_grammar └── wa_grammar.packed ├── config ├── encoding ├── slice_00000.alignments ├── slice_00000.features ├── slice_00000.source ├── slice_00000.target ├── slice_00000.target.lookup └── vocabulary /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/.gitmodules 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | sudo: true 3 | 4 | matrix: 5 | include: 6 | - os: linux 7 | jdk: oraclejdk8 8 | - os: osx 9 | osx_image: xcode8 10 | - os: linux 11 | dist: trusty 12 | jdk: oraclejdk8 13 | before_install: 14 | - sudo apt-get -y install software-properties-common 15 | - sudo add-apt-repository -y ppa:george-edison55/cmake-3.x 16 | - sudo apt-get -y update 17 | - sudo apt-get -y install build-essential libboost-all-dev zlib1g-dev libbz2-dev liblzma-dev cmake cmake-data 18 | - export JOSHUA=`pwd` 19 | - echo y | ./download-deps.sh 20 | - os: osx 21 | osx_image: xcode8 22 | before_install: 23 | - brew update 24 | - brew install ant 25 | - export JOSHUA=`pwd` 26 | - export JAVA_HOME=`/usr/libexec/java_home -v 1.8` 27 | - ./download-deps.sh 28 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Apache Joshua 2 | Copyright 2016 The Apache Software Foundation 3 | 4 | This product includes software developed at The Apache Software 5 | Foundation (http://www.apache.org/). 6 | 7 | In addition, this product includes software dependencies. See 8 | the accompanying LICENSE.txt for a listing of dependencies 9 | that are NOT Apache licensed (with pointers to their licensing) -------------------------------------------------------------------------------- /bin/bleu: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 
6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # Runs JoshuaEval with the metric arguments "BLEU 4 closest" on an output file and its references. 18 | if [[ -z $2 ]]; then 19 | echo "Usage: bleu output reference" 20 | exit 1 21 | fi 22 | # $1: output file to evaluate; $2: reference path, used as a prefix when counting references 23 | output=$1 24 | refs=$2 25 | # Output containing "|||" is passed to JoshuaEval as "nbest" format, anything else as "plain" 26 | format=plain 27 | grep "|||" "$output" > /dev/null 28 | if [[ $? -eq 0 ]]; then 29 | format=nbest 30 | fi 31 | # Number of files whose names start with the reference path 32 | numrefs=$(ls "$refs"* | wc -l) 33 | verbose=0 34 | # Pick only the newest jar: an unquoted glob matching several jars would expand into extra java arguments 35 | JOSHUA=$(dirname "$0")/.. 36 | java -cp "$(ls -t "$JOSHUA"/target/joshua-*-jar-with-dependencies.jar | head -n1)" \ 37 | -Dfile.encoding=utf8 -Djava.library.path="$JOSHUA/lib" \ 38 | -Xmx256m -Xms256m \ 39 | org.apache.joshua.util.JoshuaEval -cand "$output" -format $format -ref "$refs" -rps $numrefs -m BLEU 4 closest -v $verbose 40 | -------------------------------------------------------------------------------- /bin/extract-1best: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License.
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # Runs org.apache.joshua.util.ExtractTopCand from the most recently built Joshua jar. 18 | ## memory usage; default is 10 GB 19 | mem=10g 20 | # An optional leading "-m SIZE" overrides the JVM heap size 21 | if [[ $1 == "-m" ]]; then 22 | mem=$2 23 | shift 24 | shift 25 | fi 26 | # Enabled only after the optional-argument check above, which reads possibly-unset positionals 27 | set -u 28 | # Paths are quoted so an install directory containing whitespace still works 29 | JOSHUA=$(dirname "$0")/.. 30 | RES_OVERRIDE_PATH="$JOSHUA/src/main/resources" 31 | # Get the jar file (only the most recent, if there are many) 32 | JAR_PATH=$(ls -t "$JOSHUA"/target/joshua-*-jar-with-dependencies.jar | head -n1) 33 | exec java -Xmx${mem} \ 34 | -Dfile.encoding=utf8 \ 35 | -Djava.library.path="$JOSHUA/lib" \ 36 | -cp "$JAR_PATH" \ 37 | org.apache.joshua.util.ExtractTopCand 38 | -------------------------------------------------------------------------------- /bin/joshua: -------------------------------------------------------------------------------- 1 | joshua-decoder -------------------------------------------------------------------------------- /bin/meteor: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License.
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | if [[ -z $3 ]]; then 19 | echo "Usage: meteor output reference lang" 20 | exit 1 21 | fi 22 | 23 | output=$1 24 | reference=$2 25 | lang=$3 26 | 27 | 28 | if [[ ! -e $reference ]]; then 29 | reference=$reference.0 30 | fi 31 | 32 | java -Xmx2G -jar $METEOR/meteor-*.jar $output $reference -l $lang 33 | -------------------------------------------------------------------------------- /bin/pipeline.pl: -------------------------------------------------------------------------------- 1 | ../scripts/training/pipeline.pl -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | This provides a server demonstration for Joshua. You can load a model, 2 | feed it sentences, add custom rules, and view the translations. 3 | 4 | There are two steps: 5 | 6 | 1. Start Joshua in server mode, using the config file provided in this directory: 7 | 8 | $JOSHUA/bin/joshua -config demo.config 9 | 10 | Alternatively, you could pass it your own config file on a real 11 | pre-built model, such as Joshua's language packs. 12 | 13 | Command-line arguments override values in the config file, so if 14 | you need to change the port only, you can do: 15 | 16 | $JOSHUA/bin/joshua -config demo.config -server-port 5675 17 | 18 | 1. Load the index.html file, and make sure the values in the "Parameters" 19 | tab match your server settings above.
You can also pass these values 20 | in the query string, e.g., 21 | 22 | index.html?port=5674&server=localhost 23 | 24 | That's it! 25 | -------------------------------------------------------------------------------- /demo/apache_joshua_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/demo/apache_joshua_logo.png -------------------------------------------------------------------------------- /demo/apache_joshua_logo_faded.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/demo/apache_joshua_logo_faded.png -------------------------------------------------------------------------------- /demo/bootstrap/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/demo/bootstrap/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /demo/bootstrap/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/demo/bootstrap/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /demo/bootstrap/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/demo/bootstrap/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /demo/bootstrap/fonts/glyphicons-halflings-regular.woff2: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/demo/bootstrap/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /demo/bootstrap/js/npm.js: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one or more 2 | // contributor license agreements. See the NOTICE file distributed with 3 | // this work for additional information regarding copyright ownership. 4 | // The ASF licenses this file to You under the Apache License, Version 2.0 5 | // (the "License"); you may not use this file except in compliance with 6 | // the License. You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | // This file is autogenerated via the `commonjs` Grunt task. You can require() this file in a CommonJS environment. 
17 | require('../../js/transition.js') 18 | require('../../js/alert.js') 19 | require('../../js/button.js') 20 | require('../../js/carousel.js') 21 | require('../../js/collapse.js') 22 | require('../../js/dropdown.js') 23 | require('../../js/modal.js') 24 | require('../../js/tooltip.js') 25 | require('../../js/popover.js') 26 | require('../../js/scrollspy.js') 27 | require('../../js/tab.js') 28 | require('../../js/affix.js') -------------------------------------------------------------------------------- /demo/demo.config: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # The custom grammar, which allows adding custom phrase table entries 17 | tm = phrase -owner custom -maxspan 0 -path custom.grammar 18 | 19 | server-type = http 20 | server-port = 5674 21 | mark-oovs = true 22 | lowercase = true 23 | project-case = true 24 | output-format = %S 25 | feature-function = OOVPenalty 26 | feature-function = PhrasePenalty -owner custom 27 | 28 | # Set the feature weights. 
This helps the model disprefer OOVs 29 | OOVPenalty 1 30 | PhrasePenalty -1 31 | -------------------------------------------------------------------------------- /demo/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/demo/favicon.ico -------------------------------------------------------------------------------- /demo/scripts/joshua.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | # Easy way to start up Joshua 18 | 19 | set -u 20 | 21 | $JOSHUA/bin/joshua \ 22 | -server-type http -server-port 5674 \ 23 | -feature-function OOVPenalty \ 24 | -feature-function "PhrasePenalty -owner custom" \ 25 | -weight-overwrite "OOVPenalty 1 PhrasePenalty -1" \ 26 | -lowercase -project-case \ 27 | -mark-oovs 28 | -------------------------------------------------------------------------------- /demo/scripts/web_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | Starts a web server on port 8000. Start it in the current directory and 19 | it will serve up index.html. 20 | """ 21 | 22 | import sys 23 | import SimpleHTTPServer 24 | import SocketServer 25 | 26 | port = 8000 27 | 28 | handler = MyRequestHandler 29 | httpd = SocketServer.TCPServer(("", port), handler) 30 | httpd.serve_forever() 31 | -------------------------------------------------------------------------------- /distribution/docker/README.md: -------------------------------------------------------------------------------- 1 | This directory contains files for using Joshua with Docker. 
2 | 3 | - dev/Dockerfile 4 | 5 | This will help you compile the development version of Joshua, including the 3rd party 6 | libraries and support tools. 7 | 8 | - kenlm/Dockerfile 9 | 10 | This is used by the language packs for getting the runtime version of Joshua to work 11 | with KenLM. 12 | -------------------------------------------------------------------------------- /distribution/docker/kenlm/README.md: -------------------------------------------------------------------------------- 1 | This Docker container installs KenLM and uses it to start a language pack with KenLM 2 | language models instead of BerkeleyLM ones. It requires version 3 or above language packs. 3 | 4 | To use it, you need to do two things when running docker: 5 | 6 | - Mount the version 3 language pack to /model 7 | - Choose a local (host) port and bind it to the docker port that Joshua will run on 8 | 9 | This can be accomplished with the following command: 10 | 11 | docker run -p 127.0.0.1:5674:5674 -v /path/to/LP:/model -it joshua/kenlm 12 | 13 | This will make the language pack available on port 5674 on localhost. 14 | -------------------------------------------------------------------------------- /distribution/joshua-full/README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Joshua is an open-source statistical machine translation decoder for phrase-based 4 | (new in 6.0), hierarchical, and syntax-based machine translation, written in Java. 5 | It is developed at the Human Language Technology Center of Excellence at Johns 6 | Hopkins University. 7 | 8 | This charm provides the full development environment which allows users to both build 9 | and deploy language packs to the server and run translations against them. 
10 | 11 | To build language packs it is advised you use this charm in conjunction with one of the 12 | Hadoop bundles available in the charm store to allow you to make use of a full Hadoop 13 | cluster for the Thrax execution 14 | 15 | # Usage 16 | 17 | To deploy joshua-full: 18 | 19 | juju deploy cs:~apachesoftwarefoundation/joshua-full 20 | 21 | ## Known Limitations and Issues 22 | 23 | Currently Joshua only supports a single language pack deployed against it at once. 24 | 25 | # Configuration 26 | 27 | Port: specify the port you want the Joshua http interface to run on for remote 28 | calls to the Joshua server. 29 | 30 | Memory: amount of RAM the server should consume. 31 | 32 | # Contact Information 33 | 34 | To contact the authors swing by the dev mailing list: 35 | dev@joshua.apache.org 36 | 37 | ## Apache Joshua 38 | 39 | - http://joshua.apache.org 40 | - https://issues.apache.org/jira/browse/joshua 41 | - dev@joshua.apache.org 42 | 43 | -------------------------------------------------------------------------------- /distribution/joshua-full/actions.yaml: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | add-language-pack: 20 | description: install a language pack 21 | params: 22 | name: 23 | type: string 24 | description: name of the pack to install 25 | #translate-phrase: 26 | # description: translate a phrase 27 | # params: 28 | # phrase: 29 | # type: string 30 | # description: the phrase you would like joshua to translate 31 | -------------------------------------------------------------------------------- /distribution/joshua-full/actions/remove-language-pack: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | set -ex 20 | 21 | rm -rf /opt/language-pack 22 | charms.reactive remove_state languagepack.installed 23 | hooks/update-status 24 | 25 | -------------------------------------------------------------------------------- /distribution/joshua-full/config.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | options: 19 | port: 20 | type: string 21 | default: "5432" 22 | description: "Port server runs on" 23 | memory: 24 | type: string 25 | default: "4g" 26 | description: "RAM limit 2g,4g etc" 27 | -------------------------------------------------------------------------------- /distribution/joshua-full/copyright: -------------------------------------------------------------------------------- 1 | Format: http://dep.debian.net/deps/dep5/ 2 | 3 | Files: * 4 | Copyright: Copyright 2015, Canonical Ltd., All Rights Reserved. 5 | License: Apache License 2.0 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | . 
10 | http://www.apache.org/licenses/LICENSE-2.0 11 | . 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | -------------------------------------------------------------------------------- /distribution/joshua-full/layer.yaml: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | includes: ['layer:basic', 'interface:java', 'layer:hadoop-client'] 20 | repo: https://github.com/buggtb/joshua.git 21 | -------------------------------------------------------------------------------- /distribution/joshua-full/tests/00-setup: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. 
See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | sudo add-apt-repository ppa:juju/stable -y 20 | sudo apt-get update 21 | sudo apt-get install amulet python-requests -y 22 | -------------------------------------------------------------------------------- /distribution/joshua-runtime/README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Joshua is an open-source statistical machine translation decoder for phrase-based 4 | (new in 6.0), hierarchical, and syntax-based machine translation, written in Java. 5 | It is developed at the Human Language Technology Center of Excellence at Johns 6 | Hopkins University. 7 | 8 | This charm provides the runtime environment which allows users to deploy language 9 | packs to the server and run translations against them. 10 | 11 | There are a number of language packs available and developers are able to build 12 | their own using the joshua-full charm available in the charm store. 13 | 14 | 15 | # Usage 16 | 17 | To deploy joshua-runtime: 18 | 19 | juju deploy cs:~apachesoftwarefoundation/joshua-runtime 20 | 21 | ## Known Limitations and Issues 22 | 23 | Currently Joshua only supports a single language pack deployed against it at once. 
24 | 25 | # Configuration 26 | 27 | Port: specify the port you want the Joshua http interface to run on for remote 28 | calls to the Joshua server. 29 | 30 | Memory: amount of RAM the server should consume. 31 | 32 | # Contact Information 33 | 34 | To contact the authors swing by the dev mailing list: 35 | dev@joshua.apache.org 36 | 37 | ## Apache Joshua 38 | 39 | - http://joshua.apache.org 40 | - https://issues.apache.org/jira/browse/joshua 41 | - dev@joshua.apache.org 42 | 43 | -------------------------------------------------------------------------------- /distribution/joshua-runtime/actions.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | add-language-pack: 19 | description: install a language pack 20 | params: 21 | name: 22 | type: string 23 | description: name of the pack to install 24 | #translate-phrase: 25 | # description: translate a phrase 26 | # params: 27 | # phrase: 28 | # type: string 29 | # description: the phrase you would like joshua to translate 30 | -------------------------------------------------------------------------------- /distribution/joshua-runtime/actions/remove-language-pack: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | set -ex 20 | 21 | rm -rf /opt/language-pack 22 | charms.reactive remove_state languagepack.installed 23 | hooks/update-status 24 | 25 | -------------------------------------------------------------------------------- /distribution/joshua-runtime/config.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. 
See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | options: 19 | port: 20 | type: string 21 | default: "5432" 22 | description: "Port server runs on" 23 | memory: 24 | type: string 25 | default: "4g" 26 | description: "RAM limit 2g,4g etc" 27 | -------------------------------------------------------------------------------- /distribution/joshua-runtime/copyright: -------------------------------------------------------------------------------- 1 | Format: http://dep.debian.net/deps/dep5/ 2 | 3 | Files: * 4 | Copyright: Copyright 2015, Canonical Ltd., All Rights Reserved. 5 | License: Apache License 2.0 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | . 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | . 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | -------------------------------------------------------------------------------- /distribution/joshua-runtime/layer.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | includes: ['layer:basic', 'interface:java'] 19 | repo: https://github.com/buggtb/joshua.git 20 | -------------------------------------------------------------------------------- /distribution/joshua-runtime/tests/00-setup: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | sudo add-apt-repository ppa:juju/stable -y 20 | sudo apt-get update 21 | sudo apt-get install amulet python-requests -y 22 | -------------------------------------------------------------------------------- /doc/Eclipse.howto: -------------------------------------------------------------------------------- 1 | Install the Subclipse SVN plugin in Eclipse. 2 | 3 | 4 | File -> New -> Other 5 | 6 | SVN -> Checkout project from SVN 7 | 8 | Click Next 9 | 10 | Select "Create a new repository location" 11 | Click Next 12 | Url: https://joshua.svn.sourceforge.net/svnroot/joshua/trunk 13 | Click Next 14 | 15 | In the Select folder dialog, 16 | select the node for "https://joshua.svn.sourceforge.net/svnroot/joshua/trunk" 17 | 18 | Click Next 19 | 20 | 21 | In the dialog "Check out as a project configured using the New Project Wizard" should be selected. 22 | Click Finish. 23 | 24 | In the New Project dialog that pops up, select "Java Project". 25 | Click Next. 26 | 27 | In the New Java Project dialog, type in a Project name (ex: joshua). 28 | Click Finish. 29 | Click OK. 30 | 31 | A popup will show "SVN Checkout". 32 | You should now have the project checked out. 33 | 34 | Right-click on the project folder, and select Properties. 35 | Select Java Build Path. 36 | Select the Libraries tab. 37 | Click Add JARs. 38 | Select lib/je-3.2.23.jar 39 | Click OK. 40 | Click OK. 
41 | -------------------------------------------------------------------------------- /doc/joshua-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/doc/joshua-logo.png -------------------------------------------------------------------------------- /doc/release.md: -------------------------------------------------------------------------------- 1 | Creating a release {#mainpage} 2 | ================== 3 | 4 | Define `$JOSHUA` and `$JOSHUA_VERSION` (the release version), then 5 | call 6 | 7 | ant release 8 | 9 | This should be done in a freshly checked-out copy, since it will wipe 10 | out all non-tracked files, download the web site, and do some other 11 | things. You should also make sure that you tag the release in the 12 | source code. 13 | 14 | git tag -a $JOSHUA_VERSION 15 | git push --tags 16 | 17 | Here's an example of building a release versioned "2012-07-18". It will be placed at 18 | `release/joshua-2012-07-18.tgz`. 19 | 20 | export JOSHUA_VERSION=2012-07-18 21 | export HADOOP=/path/to/hadoop 22 | export HADOOP_CONF_DIR=/path/to/hadoop/config 23 | export HADOOP_VERSION="0.20.203.0" 24 | export AWS_SDK=/path/to/aws 25 | export AWS_VERSION="1.1.3" 26 | 27 | git clone https://github.com/joshua-decoder/joshua.git 28 | cd joshua 29 | export JOSHUA=`pwd` 30 | git submodule update --init 31 | cd thrax 32 | ant 33 | cd $JOSHUA 34 | ant release 35 | -------------------------------------------------------------------------------- /doc/troubleshooting.md: -------------------------------------------------------------------------------- 1 | Troubleshooting {#troubleshooting} 2 | ================ 3 | 4 | Make sure that the `JOSHUA` environment variable has been set to the directory 5 | created by the git clone command or from extracting the release tarball. 
6 | 7 | # Ant build errors 8 | 9 | First of all, make sure `ant init` has been run successfully at least once 10 | before running any other Ant tasks. 11 | 12 | ## Ant version 13 | 14 | download-ivy: 15 | 16 | BUILD FAILED 17 | /home/lorland1/workspace/mt/joshua/build.xml:310: Ivy requires Ant version 1.8.0 or greater. Please upgrade to the latest version. 18 | 19 | If an older version of ant is in the system, the developer can manually 20 | download ivy.jar, copy it to `$JOSHUA/lib/`, and delete the `download-ivy` ant 21 | target. Ant 1.8.0 was released in April 2010, so it's not requiring anything 22 | bleeding edge. 23 | -------------------------------------------------------------------------------- /doc/zmert_release/zmert_doc_v1.40.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/doc/zmert_release/zmert_doc_v1.40.pdf -------------------------------------------------------------------------------- /doc/zmert_release/zmert_v1.40.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/doc/zmert_release/zmert_v1.40.zip -------------------------------------------------------------------------------- /examples/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | set -e # Abort on the first failing step so a failed download is never reported as done. 17 | echo "Downloading corpus..." 18 | rm -rf fisher-callhome-corpus-1.0 19 | rm -f fisher-callhome-corpus-1.0.tgz 20 | wget -q -O fisher-callhome-corpus-1.0.tgz https://github.com/joshua-decoder/fisher-callhome-corpus/archive/v1.0.tar.gz 21 | 22 | echo "Unpacking..." 23 | tar xzf fisher-callhome-corpus-1.0.tgz 24 | 25 | echo "Linking..." 26 | ln -sf fisher-callhome-corpus-1.0 data 27 | 28 | echo "Done. See the files in data/. You can now build the examples." 29 | -------------------------------------------------------------------------------- /examples/old/PRO/libsvm_command: -------------------------------------------------------------------------------- 1 | java -cp ./bin joshua.pro.classifier.libsvm.svm_train -h 0 -s 0 -t 0 ./PRO_test/libsvm_train.data 2 | -------------------------------------------------------------------------------- /examples/old/PRO/megam_command: -------------------------------------------------------------------------------- 1 | ./joshua/pro/classifier/megam_i686.opt -fvals -nobias binary ./PRO_test/megam_train.data > ./PRO_test/megam_weights 2 | -------------------------------------------------------------------------------- /examples/old/PRO/params.txt: -------------------------------------------------------------------------------- 1 | lm ||| 1 Opt 0.1 +Inf +0.5 +1.5 2 | phrasemodel pt 0 ||| 0.5 Opt -Inf +Inf -1 +1 3 | phrasemodel pt 1 ||| 0.5 Opt -Inf +Inf -1 +1 4 | phrasemodel pt 2 ||| 0.5 Opt -Inf +Inf -1 +1 5 | wordpenalty ||| -1 Opt -Inf +Inf -5 0 6 | discriminative ./sparse_feat.example ||| 
1.0 Opt -Inf +Inf -5 5 7 | normalization = absval 1 lm 8 | -------------------------------------------------------------------------------- /examples/old/PRO/run.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | java -cp $JOSHUA/bin joshua.pro.PRO pro.config 17 | 18 | #Note: to use Meteor as metric, add the following path(where the Meteor can find its resources) to the classpath: 19 | #java -cp ./bin:./joshua/zmert/meteor/resources joshua.pro.PRO pro.config 20 | 21 | -------------------------------------------------------------------------------- /examples/old/ZMERT/decoder_command_ex2: -------------------------------------------------------------------------------- 1 | java -Xmx1200m -Xms1200m -cp bin -Djava.library.path=lib joshua.decoder.JoshuaDecoder ZMERT_example/config_ex2.txt ZMERT_example/src.txt ZMERT_example/nbest_ex2.out 2 | -------------------------------------------------------------------------------- /examples/old/ZMERT/params.txt: -------------------------------------------------------------------------------- 1 | lm ||| 1.000000 Opt 0.1 +Inf +0.5 +1.5 2 | phrasemodel pt 0 ||| 1.066893 Opt -Inf +Inf -1 +1 3 | phrasemodel pt 1 ||| 0.752247 Opt -Inf +Inf -1 +1 4 | phrasemodel pt 2 ||| 0.589793 Opt -Inf +Inf -1 +1 5 | wordpenalty ||| -2.844814 Opt -Inf +Inf -5 0 6 | normalization = absval 1 lm 7 | -------------------------------------------------------------------------------- /examples/training/README: -------------------------------------------------------------------------------- 1 | The scripts here demonstrate how to use the pipeline to build different types of 2 | models (including GHKM, Hiero, SAMT, and traditional phrase-based models). 3 | 4 | They all use the data that is downloaded with the ../download.sh script. 5 | -------------------------------------------------------------------------------- /examples/tree_visualizer/README: -------------------------------------------------------------------------------- 1 | These examples demonstrate how to use the tree visualizer. 
2 | Build the visualizer by running the following command to compile 3 | the relevant code: 4 | 5 | ant -f $JOSHUA/build.xml tree_visualizer 6 | 7 | You can run the visualizer by using the visualizer invocation 8 | script under UNIX/Linux/OS-X: 9 | 10 | $JOSHUA/scripts/analysis/tree_visualizer tree.fr tree.ref tree.en 11 | 12 | or under Windows: 13 | 14 | tree_visualizer.cmd tree.fr tree.ref tree.en 15 | 16 | To invoke the tree visualizer, you need at least three arguments: 17 | 18 | 1) a file with source sentences, one per line 19 | 2) a file with reference translations, one per line 20 | 3) an n-best file like the one produced by Joshua. 21 | 22 | Note that the n-best file needs to have source-aligned derivation trees, so 23 | you should run the Joshua decoder with use_tree_nbest = true and 24 | include_align_index = true. 25 | 26 | Additional arguments should be more n-best files to compare to the first one. 27 | 28 | Click on the sentence that displays, and you will be presented 29 | with the visualization in a second window. 
30 | -------------------------------------------------------------------------------- /examples/tree_visualizer/tree.en: -------------------------------------------------------------------------------- 1 | 0 ||| (ROOT{0-6} (S{0-6} (X{0-1} i) (X{1-6} (X{2-4} asked) (X{1-2} her) (X{4-6} a question)))) 2 | 1 ||| (ROOT{0-6} (S{0-6} (X{0-1} he) (X{1-6} (X{1-3} visited) (X{3-6} the (X{4-6} (X{5-6} white) (X{4-5} house)))))) 3 | -------------------------------------------------------------------------------- /examples/tree_visualizer/tree.en.2: -------------------------------------------------------------------------------- 1 | 0 ||| (ROOT{0-6} (S{0-6} (NP{0-1} i) (VP{1-6} (V{2-4} asked) (NP{1-2} her) (NP{4-6} a question)))) 2 | 1 ||| (ROOT{0-6} (S{0-6} (NP{0-1} he) (VP{1-6} (V{1-3} visited) (NP{3-6} the (NNP{4-6} (JJ{5-6} white) (NN{4-5} house)))))) 3 | -------------------------------------------------------------------------------- /examples/tree_visualizer/tree.fr: -------------------------------------------------------------------------------- 1 | je lui ai pose un question 2 | il a visite la maison blanche 3 | -------------------------------------------------------------------------------- /examples/tree_visualizer/tree.ref: -------------------------------------------------------------------------------- 1 | i asked her a question 2 | he visited the white house 3 | -------------------------------------------------------------------------------- /examples/tree_visualizer/tree_visualizer.cmd: -------------------------------------------------------------------------------- 1 | # EXAMPLE: tree_visualizer.cmd tree.fr tree.ref tree.en 2 | set JAVA_HOME="C:\Program Files (x86)\Java\jre6" 3 | set PATH="%JAVA_HOME%/bin";%PATH% 4 | 5 | java -Xmx1g -jar tree_visualizer.jar %* 6 | -------------------------------------------------------------------------------- /scripts/distributedLM/config.template: -------------------------------------------------------------------------------- 
1 | #lm config 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | use_srilm=true 18 | lm_ceiling_cost=100 19 | use_left_euqivalent_state=false 20 | use_right_euqivalent_state=false 21 | order=5 22 | 23 | remote_symbol_tbl=/home/zli/work/zli@gale/mt06-data/gale_p3_run/remote.symbol.tbl 24 | 25 | remote_lm_server_port=9000 26 | 27 | -------------------------------------------------------------------------------- /scripts/distributedLM/global_symol_list: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -pau- 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /scripts/distributedLM/lm.list.withweights: -------------------------------------------------------------------------------- 1 | /home/zli/work/zli@gale/mt06-data/mt08/lm/afp.part4.gz.7.gram.lm.gz;0.0338212 2 | /home/zli/work/zli@gale/mt06-data/mt08/lm/apw.part9.gz.7.gram.lm.gz;0.0105086 3 | /home/zli/work/zli@gale/mt06-data/mt08/lm/bitext.lm.data.gz.7.gram.lm.gz;0.212171 4 | /home/zli/work/zli@gale/mt06-data/mt08/lm/pd.all.gz.7.gram.lm.gz;0.19243 5 | 
/home/zli/work/zli@gale/mt06-data/mt08/lm/taiwan.all.gz.7.gram.lm.gz;0.0774303 6 | /home/zli/work/zli@gale/mt06-data/mt08/lm/xin.part1.gz.7.gram.lm.gz;0.146205 7 | /home/zli/work/zli@gale/mt06-data/mt08/lm/xin.part2.gz.7.gram.lm.gz;0.227052 8 | /home/zli/work/zli@gale/mt06-data/mt08/lm/xin.part3.gz.7.gram.lm.gz;0.100382 9 | -------------------------------------------------------------------------------- /scripts/language-pack/VERSIONS: -------------------------------------------------------------------------------- 1 | # Version 3 (March 2017) 2 | 3 | This was the first version actually versioned. It was built to work with docker building 4 | a KenLM language model. 5 | 6 | Includes KenLM language model files (recommended) in addition to BerkeleyLM. 7 | The latter is the default, with the former recommended and facilitated with a Docker 8 | container. Google API now multithreaded. Contained the new files: 9 | 10 | - joshua.config.kenlm (same config file but with KenLM instead of BerkeleyLM) 11 | - lp.conf (identifying the LP version and the git commit of the code) 12 | 13 | # Version 1-2 (prior to March 2017) 14 | 15 | These versions were not explicitly identified. They contained a "joshua" top-level script 16 | and "prepare.sh" for preparing data. Operates in server mode or from the command line. 17 | Entirely BerkeleyLM-based. Includes a Joshua 6.1 release candidate jar file. 18 | 19 | -------------------------------------------------------------------------------- /scripts/language-pack/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 
6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # Strings together the preprocessing scripts 19 | 20 | set -u 21 | 22 | # Set default language. Override with "lang=XX prepare.sh" 23 | : ${lang=} 24 | 25 | cd $(dirname $0) # relative paths now safe 26 | ./scripts/normalize.pl $lang | ./scripts/tokenize.pl -l $lang 27 | -------------------------------------------------------------------------------- /scripts/misc/canonical_path: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | import os 18 | import argparse 19 | 20 | parser = argparse.ArgumentParser(description='Return canonical path') 21 | parser.add_argument('file', help='File or directory to absolutize') 22 | args = parser.parse_args() 23 | 24 | print os.path.realpath(args.file) 25 | -------------------------------------------------------------------------------- /scripts/preparation/lowercase.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | use warnings; 18 | use strict; 19 | 20 | binmode(STDIN, ":utf8"); 21 | binmode(STDOUT, ":utf8"); 22 | 23 | while(<STDIN>) { 24 | print lc($_); 25 | } 26 | -------------------------------------------------------------------------------- /scripts/preparation/nonbreaking_prefixes/nonbreaking_prefix.ca: -------------------------------------------------------------------------------- 1 | Dr 2 | Dra 3 | pàg 4 | p 5 | c 6 | av 7 | Sr 8 | Sra 9 | adm 10 | esq 11 | Prof 12 | S.A 13 | S.L 14 | p.e 15 | ptes 16 | Sta 17 | St 18 | pl 19 | màx 20 | cast 21 | dir 22 | nre 23 | fra 24 | admdora 25 | Emm 26 | Excma 27 | espf 28 | dc 29 | admdor 30 | tel 31 | angl 32 | aprox 33 | ca 34 | dept 35 | dj 36 | dl 37 | dt 38 | ds 39 | dg 40 | dv 41 | ed 42 | entl 43 | al 44 | i.e 45 | maj 46 | smin 47 | n 48 | núm 49 | pta 50 | A 51 | B 52 | C 53 | D 54 | E 55 | F 56 | G 57 | H 58 | I 59 | J 60 | K 61 | L 62 | M 63 | N 64 | O 65 | P 66 | Q 67 | R 68 | S 69 | T 70 | U 71 | V 72 | W 73 | X 74 | Y 75 | Z 76 | -------------------------------------------------------------------------------- /scripts/preparation/nonbreaking_prefixes/nonbreaking_prefix.ro: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | F 7 | G 8 | H 9 | I 10 | J 11 | K 12 | L 13 | M 14 | N 15 | O 16 | P 17 | Q 18 | R 19 | S 20 | T 21 | U 22 | V 23 | W 24 | X 25 | Y 26 | Z 27 | dpdv 28 | etc 29 | șamd 30 | M.Ap.N 31 | dl 32 | Dl 33 | d-na 34 | D-na 35 | dvs 36 | Dvs 37 | pt 38 | Pt 39 | -------------------------------------------------------------------------------- /scripts/preparation/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- 1 | dr 2 | Dr 3 | itd 4 | itn 5 | št #NUMERIC_ONLY# 6 | Št #NUMERIC_ONLY# 7 | d 8 | jan 9 | Jan 10 | feb 11 | Feb 12 | mar 13 | Mar 14 | apr 15 | Apr 16 | jun 17 | Jun 18 | jul 19 | Jul 20 | avg 21 | Avg 22 | sept 23 | Sept 24 | sep 25 | Sep 26 | okt 27 
| Okt 28 | nov 29 | Nov 30 | dec 31 | Dec 32 | tj 33 | Tj 34 | npr 35 | Npr 36 | sl 37 | Sl 38 | op 39 | Op 40 | gl 41 | Gl 42 | oz 43 | Oz 44 | prev 45 | dipl 46 | ing 47 | prim 48 | Prim 49 | cf 50 | Cf 51 | gl 52 | Gl 53 | A 54 | B 55 | C 56 | D 57 | E 58 | F 59 | G 60 | H 61 | I 62 | J 63 | K 64 | L 65 | M 66 | N 67 | O 68 | P 69 | Q 70 | R 71 | S 72 | T 73 | U 74 | V 75 | W 76 | X 77 | Y 78 | Z 79 | -------------------------------------------------------------------------------- /scripts/preparation/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- 1 | #single upper case letter are usually initials 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | #misc abbreviations 29 | AB 30 | G 31 | VG 32 | dvs 33 | etc 34 | from 35 | iaf 36 | jfr 37 | kl 38 | kr 39 | mao 40 | mfl 41 | mm 42 | osv 43 | pga 44 | tex 45 | tom 46 | vs 47 | -------------------------------------------------------------------------------- /scripts/preparation/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # Strings together the preprocessing scripts 19 | 20 | set -u 21 | 22 | lang=$1 23 | 24 | $JOSHUA/scripts/preparation/normalize.pl $lang | $JOSHUA/scripts/preparation/tokenize.pl -l $lang | $JOSHUA/scripts/preparation/lowercase.pl 25 | -------------------------------------------------------------------------------- /scripts/samt/postprocessSAMT.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | if [ $# -ne 3 ] 17 | then 18 | echo "Usage: postprocessSAMT.sh mergedrules.gz samt.tm.gz samt.glue.gz" 19 | exit 2 20 | fi 21 | 22 | if [ ! -r $1 ] 23 | then 24 | echo "Error: file $1 does not exist or is not readable." 
25 | exit 3 26 | fi 27 | 28 | zgrep -v COUNT $1 | gzip > $2 29 | zgrep COUNT $1 | awk 'BEGIN { FS="#" } ; { print $3 "#@1#@GOAL#1 0 0 0 0 0 0 0";\ 30 | print "@GOAL " $3 "#@1 @2#@GOAL#1 0 0 0 0.434294482 0 0 0" }' | gzip > $3 31 | 32 | -------------------------------------------------------------------------------- /scripts/support/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | test-bundle-dir 3 | -------------------------------------------------------------------------------- /scripts/support/create_glue_grammar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # Creates a glue grammar from a regular grammar (packed or plain text). 19 | 20 | if [[ -z "$1" ]]; then 21 | echo "Creates a glue grammar from a main grammar, writing to STDOUT." 
22 | echo "Usage: $0 /path/to/main/grammar" 23 | exit 24 | fi 25 | 26 | java -Xmx2g -cp $JOSHUA/target/joshua-*-jar-with-dependencies.jar org.apache.joshua.decoder.ff.tm.CreateGlueGrammar -g $1 27 | -------------------------------------------------------------------------------- /scripts/support/extract-1best: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script takes Joshua's nbest output and extracts the 1-best 18 | # output. 19 | 20 | set -u 21 | 22 | java -Xmx500m -cp $JOSHUA/bin -Dfile.encoding=utf8 joshua.util.ExtractTopCand - - 23 | 24 | -------------------------------------------------------------------------------- /scripts/support/filter_grammar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 
6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # Wrapper around the grammar filter 19 | 20 | JOSHUA=$(readlink -f $(dirname $0)/../..) 21 | JAR_PATH=$JOSHUA/target/joshua-*-jar-with-dependencies.jar 22 | java -Xmx4g -Dfile.encoding=utf8 -cp $JAR_PATH org.apache.joshua.tools.TestSetFilter "$@" 23 | -------------------------------------------------------------------------------- /scripts/support/write-version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | set -u 19 | 20 | version=$(git describe --abbrev=0 --dirty) 21 | 22 | # Save the current version and commit to a file 23 | echo "release version: $(git describe --abbrev=0)" > $JOSHUA/VERSION 24 | echo "current commit: $(git describe --long --dirty)" >> $JOSHUA/VERSION 25 | 26 | -------------------------------------------------------------------------------- /scripts/thrax/strip_label.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Removes labels (if present) from features. 5 | 6 | e.g., 7 | 8 | [X] ||| le ||| the ||| e_given_f_lex=1 9 | 10 | becomes 11 | 12 | [X] ||| le ||| the ||| 1 13 | """ 14 | 15 | import re 16 | import sys 17 | import codecs 18 | 19 | reload(sys) 20 | sys.setdefaultencoding('utf-8') 21 | sys.stdin = codecs.getreader('utf-8')(sys.stdin) 22 | sys.stdout = codecs.getwriter('utf-8')(sys.stdout) 23 | sys.stdout.encoding = 'utf-8' 24 | 25 | for line in sys.stdin: 26 | tokens = line.split(' ||| ') 27 | tokens[3] = re.sub(r'\S*=', '', tokens[3]) 28 | 29 | print ' ||| '.join(tokens), 30 | -------------------------------------------------------------------------------- /scripts/toolkit/joini.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import os, sys, codecs 20 | 21 | def main(): 22 | delimiter = sys.argv[1] 23 | fileNames = sys.argv[2:] 24 | 25 | files = [] 26 | 27 | for fileName in fileNames: 28 | files.append(codecs.open(fileName, "r", "utf-8")) 29 | 30 | ongoing = True 31 | 32 | while (ongoing): 33 | lines = [] 34 | for file in files: 35 | line = file.readline() 36 | if (line == ""): 37 | ongoing = False 38 | break 39 | lines.append(line.rstrip()) 40 | if (ongoing): 41 | print delimiter.join(lines) 42 | 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /scripts/training/TODO: -------------------------------------------------------------------------------- 1 | - [ ] language model is built incorrectly when starting at MERT with 2 | a parsed corpus (maybe SAMT should expect a plain corpus and a .parsed one) 3 | - [ ] add recasing with recursive call to pipeline.pl (provide a 1-1 4 | alignment) 5 | - [ ] pipeline should output a script that can be easily - 6 | used to decode another test set 7 | - [ ] add tree output for test sets 8 | - [ ] run MERT multiple times 9 | - [X] hadoop cluster roll-out 10 | - [X] rm -r hadoop directory after retrieving grammar successfully 11 | - [ ] change qsub arg defaults when doing SAMT 12 | - [ ] don't put number in train files if maxlen == 0 13 | - [ ] should be easier to stop and start runs (locations of canonical files) 14 | - [ ] add in kenlm binarization of the language model 15 | - [ ] better tokenization (url aware, 
e.g.,) 16 | -------------------------------------------------------------------------------- /scripts/training/build-vocab.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Takes a corpus of words on STDIN and builds a vocabulary with word 18 | # counts, writing them to STDOUT in the format 19 | # 20 | # ID WORD COUNT 21 | 22 | use utf8; 23 | use warnings; 24 | use strict; 25 | 26 | binmode(STDIN, ":utf8"); 27 | binmode(STDOUT, ":utf8"); 28 | 29 | my %count; 30 | while (my $line = <>) { 31 | chomp($line); 32 | my @tokens = split(' ', $line); 33 | map { $count{$_}++ } @tokens; 34 | } 35 | 36 | my $id = 1; 37 | map { print $id++ . " $_ $count{$_}\n" } (sort { $count{$b} <=> $count{$a} } keys %count); 38 | -------------------------------------------------------------------------------- /scripts/training/filter-empty-lines.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. 
See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Matt Post 18 | 19 | # Takes a list of tab-separated strings on STDIN and prints the lines 20 | # only if none of the fields is empty 21 | 22 | use warnings; 23 | use strict; 24 | 25 | my $skipped = 0; 26 | while (my $line = <>) { 27 | if ($line =~ /^\s*\t/ or $line =~ /\t *$/ or $line =~ /\t\s*\t/) { 28 | $skipped++; 29 | } else { 30 | print $line; 31 | } 32 | } 33 | 34 | print STDERR "Skipped $skipped / $.\n"; 35 | -------------------------------------------------------------------------------- /scripts/training/lowercase-leaves.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script lowercases just the leaves of a tree, represented in 18 | # standard PTB form. 19 | 20 | use strict; 21 | 22 | binmode(STDIN, ":utf8"); 23 | binmode(STDOUT, ":utf8"); 24 | 25 | while (<>) { 26 | s/(\S+?)\)/lc $1 . ")"/ge; 27 | print; 28 | } 29 | -------------------------------------------------------------------------------- /scripts/training/parallelize/Makefile: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
# Builds the two helper binaries in this directory, sentserver and
# sentclient, each from its single C source file.
#
# The compiler and flags can be overridden on the command line, e.g.
#   make CC=clang CFLAGS="-O0 -g"
# The defaults reproduce the original hard-coded gcc invocation.

CC = gcc
CFLAGS ?= -g -O2
LDFLAGS ?= -L/usr/local/lib

# all/clean are commands, not files: without .PHONY a stray file named
# "all" or "clean" in this directory would silently disable the target.
.PHONY: all clean

all: sentserver sentclient

# Static variant, kept for reference:
# gcc -g -O2 -o sentserver -static -pthread -L/usr/local/lib -Wl,--rpath -Wl,/usr/local/lib sentserver.c
sentserver: sentserver.c
	$(CC) $(CFLAGS) -o $@ -pthread $(LDFLAGS) $<

# Static variant, kept for reference:
# gcc -g -O2 -o sentclient -static -pthread -L/usr/local/lib -Wl,--rpath -Wl,/usr/local/lib sentclient.c
sentclient: sentclient.c
	$(CC) $(CFLAGS) -o $@ -pthread $(LDFLAGS) $<

clean:
	rm -f sentserver sentclient
/* Include guard: makes repeated inclusion of this header harmless. */
#ifndef SENTSERVER_H
#define SENTSERVER_H

/*
 * Default port number.
 * NOTE(review): presumably the TCP port used by sentserver/sentclient when
 * no port is given explicitly -- confirm against sentserver.c and
 * sentclient.c, which are not visible here.
 */
#define DEFAULT_PORT 50000

#endif /* SENTSERVER_H */
# Smart cat: calls cat, zcat, or bzcat on each of a list of files, as
# appropriate.  The kind of each file is detected from its content with
# file(1), so the file extension does not matter.

# "$@" (quoted) keeps every argument intact; the unquoted form split file
# names containing whitespace and expanded glob characters in them.
for file in "$@"; do
  # -L follows symlinks.  -b ("brief") omits the leading "<name>: " from the
  # output, so a file whose *name* contains "gzip" or "bzip2" is no longer
  # mistaken for a compressed file.
  text=$(file -bL "$file")
  if [[ $text =~ "gzip" ]]; then
    gzip -cd "$file"
  elif [[ $text =~ "bzip2" ]]; then
    bzcat "$file"
  else
    cat "$file"
  fi
done
/scripts/training/templates/alignment/jacana/resources/model/EnglishSD.bin.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/scripts/training/templates/alignment/jacana/resources/model/EnglishSD.bin.gz -------------------------------------------------------------------------------- /scripts/training/templates/alignment/jacana/resources/model/EnglishTok.bin.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/scripts/training/templates/alignment/jacana/resources/model/EnglishTok.bin.gz -------------------------------------------------------------------------------- /scripts/training/templates/alignment/jacana/resources/model/fr-en.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/scripts/training/templates/alignment/jacana/resources/model/fr-en.model -------------------------------------------------------------------------------- /scripts/training/templates/alignment/jacana/resources/model/tag.bin.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/scripts/training/templates/alignment/jacana/resources/model/tag.bin.gz -------------------------------------------------------------------------------- /scripts/training/templates/alignment/jacana/resources/wiktionary/en-fr.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/scripts/training/templates/alignment/jacana/resources/wiktionary/en-fr.csv.gz 
-------------------------------------------------------------------------------- /scripts/training/templates/glue-grammar: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | -------------------------------------------------------------------------------- /scripts/training/templates/glue-grammar.itg: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [A,2] ||| [GOAL,1] [A,2] ||| -1 3 | [GOAL] ||| [GOAL,1] [B,2] ||| [GOAL,1] [B,2] ||| -1 4 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 5 | -------------------------------------------------------------------------------- /scripts/training/templates/hadoop/core-site.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | 19 | hadoop.tmp.dir 20 | 21 | 22 | 23 | fs.default.name 24 | hdfs://: 25 | 26 | 27 | -------------------------------------------------------------------------------- /scripts/training/templates/hadoop/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | 19 | dfs.replication 20 | 1 21 | 22 | 23 | -------------------------------------------------------------------------------- /scripts/training/templates/hadoop/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | 19 | mapred.job.tracker 20 | : 21 | 22 | 23 | mapred.tasktracker.map.tasks.maximum 24 | 25 | 26 | 27 | mapred.tasktracker.reduce.tasks.maximum 28 | 29 | 30 | 31 | mapred.task.timeout 32 | 0 33 | 34 | 35 | -------------------------------------------------------------------------------- /scripts/training/templates/hadoop/masters: -------------------------------------------------------------------------------- 1 | 2 | 
-------------------------------------------------------------------------------- /scripts/training/templates/hadoop/slaves: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /scripts/training/templates/tune/decoder_command: -------------------------------------------------------------------------------- 1 | cat | /bin/joshua-decoder -m -threads -c > 2> 2 | -------------------------------------------------------------------------------- /scripts/training/templates/tune/decoder_command.qsub: -------------------------------------------------------------------------------- 1 | cat | awk 'BEGIN { num = 0 } {print "" $0 ""; num++}' | /scripts/training/parallelize/parallelize.pl -j -m --qsub-args '' -- /bin/joshua-decoder -m -threads -c > 2> 2 | -------------------------------------------------------------------------------- /scripts/training/unmap-html.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Remove HTML codes from data. 
# Remove HTML codes from data.
#
# Reads lines from STDIN (or from files named on the command line), applies
# every substitution in %map to each line, and prints the result.

# NOTE(review): ":utf-8" does not look like a valid PerlIO layer name (the
# usual spellings are ":utf8" and ":encoding(UTF-8)"), so these two calls may
# fail and leave both handles in byte mode -- confirm.  Correcting the layer
# name on its own would change how the non-ASCII literals below are written
# out unless the script also declares "use utf8", so fix both together.
binmode(STDIN,  ":utf-8");
binmode(STDOUT, ":utf-8");

# Substitution table: each key is a pattern to search for, each value is the
# text that replaces it.
my %map = (
  "'" => "'",
  "," => ",",
  "&" => "&",
  ">" => ">",
  "<" => "<",
  """ => "\"",
  "ā" => "ā",
  "ā:" => "ā",
  "ā " => "ā",
  "ṇ" => "Ṇ",
  "ṇ:" => "Ṇ",
  "ṣ" => "Ṣ",
  "ṣ:" => "Ṣ",
  "&#;" => "",
);

while (my $line = <>) {
  # Each key is interpolated into the match as a regular expression, not as
  # a literal string.  "keys" returns the entries in no particular order, so
  # if one substitution can produce text that another key matches, the
  # outcome may differ from run to run.
  foreach my $key (keys %map) {
    $line =~ s/$key/$map{$key}/g;
  }

  print $line;
}
/**
 * Query interface over a syntax tree: label look-ups for a span given by two
 * integer positions, plus access to the tree's terminals.
 * <p>
 * NOTE(review): the meaning of {@code from}/{@code to} (terminal indices?
 * inclusive or exclusive upper bound?) and the element type of the returned
 * collections are not visible in this interface -- confirm against the
 * implementing classes.
 */
public interface SyntaxTree {

  /** Returns the constituent labels associated with the span {@code from}..{@code to}. */
  Collection getConstituentLabels(int from, int to);

  /** Returns the concatenated labels associated with the span {@code from}..{@code to}. */
  Collection getConcatenatedLabels(int from, int to);

  /** Returns the CCG labels associated with the span {@code from}..{@code to}. */
  Collection getCcgLabels(int from, int to);

  /** Returns the terminals of the whole tree as an {@code int} array. */
  int[] getTerminals();

  /** Returns the terminals for the span {@code from}..{@code to} as an {@code int} array. */
  int[] getTerminals(int from, int to);
}
/**
 * Contract for a feature function that can be given a source
 * {@link Sentence} and that can be copied through {@link #clone()}.
 * <p>
 * NOTE(review): presumably the decoder clones one instance per input
 * sentence and then calls {@link #setSource(Sentence)} on the copy --
 * confirm against the callers, which are not visible here.
 */
public interface SourceDependentFF extends Cloneable {

  /**
   * Supplies the source sentence to this feature function.
   *
   * @param sentence the source sentence
   */
  void setSource(Sentence sentence);

  /**
   * Returns a copy of this object, typed as {@link FeatureFunction} rather
   * than {@code Object}.
   *
   * @return the copy
   */
  FeatureFunction clone();

}
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/README: -------------------------------------------------------------------------------- 1 | To build a binary for Berkeley LM, you need to do the following: 2 | 3 | java -cp [berkeleylm jar file] -server -mx[lots of memory] edu.berkeley.nlp.lm.io.MakeLmBinaryFromArpa [ARPA file] [output file] 4 | 5 | Both input and output will be appropriately GZipped if they have a .gz extension. Note that MakeLmBinaryFromArpa has options for e.g. enabling compression. 6 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | /** 21 | * Provides an implementation of a bloom filter language model, and 22 | * an associated implementation of the language model feature function typically used in 23 | * hierarchical phrase-based decoding for statistical machine translation. 24 | */ 25 | package org.apache.joshua.decoder.ff.lm.bloomfilter_lm; 26 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
/**
 * Abstract class enforcing explicit implementation of the standard methods.
 * <p>
 * {@code toString}, {@code hashCode} and {@code equals} are re-declared as
 * abstract here, so the versions inherited from {@link Object} no longer
 * satisfy a concrete subclass: every concrete subclass must provide its own.
 *
 * @author Zhifei Li, zhifei.work@gmail.com
 * @author Juri Ganitkevitch, juri@cs.jhu.edu
 */
public abstract class DPState {

  /** Must be implemented by every concrete subclass. */
  public abstract String toString();

  /** Must be implemented by every concrete subclass; keep it consistent with {@link #equals(Object)}. */
  public abstract int hashCode();

  /** Must be implemented by every concrete subclass; keep it consistent with {@link #hashCode()}. */
  public abstract boolean equals(Object other);
}
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | /** 21 | * Provides implementations of hierarchical phrase-based translation grammars. 22 | */ 23 | package org.apache.joshua.decoder.ff.tm.hash_based; 24 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/decoder/ff/tm/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. 
See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | /** 21 | * Defines interfaces and provides infrastructure for hierarchical 22 | * phrase-based translation grammars. 23 | */ 24 | package org.apache.joshua.decoder.ff.tm; 25 | 26 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/decoder/hypergraph/TrivialInsideOutside.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package org.apache.joshua.decoder.hypergraph; 20 | 21 | /** 22 | * @author Zhifei Li, zhifei.work@gmail.com 23 | * @version $LastChangedDate$ 24 | */ 25 | 26 | public class TrivialInsideOutside extends DefaultInsideOutside { 27 | // used by inside-outside estimation 28 | protected double getHyperedgeLogProb(HyperEdge dt, HGNode parent_it) { 29 | return dt.getTransitionLogP(false);// TODO this is very bad in terms of computation 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/decoder/hypergraph/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | /** 21 | * Provides implementations of hypergraph data structures 22 | * and related algorithms used in extracting translation 23 | * results in hierarchical phrase-based translation. 
24 | */ 25 | package org.apache.joshua.decoder.hypergraph; -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/decoder/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | /** 20 | * Provides infrastructure and wrapper code relevant to 21 | * hierarchical phrase-based decoding for statistical machine 22 | * translation. This package does not include an implementation 23 | * of any actual decoding algorithm. Rather, such code is in 24 | * child packages of this package. 25 | */ 26 | package org.apache.joshua.decoder; -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/decoder/phrase/Note.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
// PORT: done

/**
 * Minimal mutable holder for a single object reference of any type.
 */
public class Note {
  /** The held object; {@code null} until one is assigned. Public, so it can also be accessed directly. */
  public Object value;

  /** Creates a note that holds {@code null}. */
  public Note() {
  }

  /**
   * Creates a note holding the given object.
   *
   * @param value the object to hold; may be {@code null}
   */
  public Note(Object value) {
    this.value = value;
  }

  /**
   * Returns the string form of the held object.
   *
   * @return {@code value.toString()}, or the string {@code "null"} when no
   *         object is held
   */
  @Override
  public String toString() {
    // String.valueOf tolerates null; calling value.toString() directly threw
    // a NullPointerException for a note created with the no-arg constructor.
    return String.valueOf(value);
  }

  /**
   * Returns the held object.
   *
   * @return the held object, possibly {@code null}
   */
  public Object get() {
    return value;
  }

  /**
   * Replaces the held object.
   *
   * @param object the new object to hold; may be {@code null}
   */
  public void set(Object object) {
    this.value = object;
  }
}
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | /** 20 | * Provides common interfaces for parsing segment files 21 | * (aka test corpora to be translated). In order to support 22 | * constraint annotations, we provide a general API for 23 | * use by JoshuaDecoder and Chart. 24 | */ 25 | package org.apache.joshua.decoder.segment_file; 26 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/lattice/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | /** 20 | * Provides implementations of lattice and related data structures. 
21 | */ 22 | package org.apache.joshua.lattice; 23 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/oracle/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | /** 20 | * Provides for extracting the target string from a hypergraph 21 | * that most closely matches a reference sentence. Much of the 22 | * code in this package is based on descriptions in Adam 23 | * Lopez's 24 | * doctoral thesis. 25 | */ 26 | package org.apache.joshua.oracle; -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/subsample/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | /** 20 | * Provides executables Subsampler and AlignedSubsampler, 21 | * for subsampling from large training corpora based on a 22 | * test corpus. 23 | */ 24 | package org.apache.joshua.subsample; 25 | 26 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/ui/Orientation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package org.apache.joshua.ui; 20 | /** Enumerates two orientations: HORIZONTAL and VERTICAL. */ 21 | public enum Orientation { 22 | HORIZONTAL, VERTICAL 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/ui/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | /** 20 | * Provides classes for visualizing parts of the translation process. 21 | */ 22 | package org.apache.joshua.ui; -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeEdge.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package org.apache.joshua.ui.tree_visualizer; 20 | /** Edge object carrying a single immutable boolean flag, pointsToSource. */ 21 | public class DerivationTreeEdge { 22 | public final boolean pointsToSource; 23 | /** Stores pts in the final field pointsToSource. */ 24 | public DerivationTreeEdge(boolean pts) { 25 | pointsToSource = pts; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/util/Platform.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package org.apache.joshua.util; 20 | /** Static helper for checking the host operating system. */ 21 | public class Platform { 22 | /** Returns true when the lower-cased "os.name" system property contains "mac". */ 23 | public static boolean isMac() { 24 | return System.getProperties().getProperty("os.name").toLowerCase().contains("mac"); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/util/encoding/FloatEncoder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package org.apache.joshua.util.encoding; 20 | 21 | import java.io.DataInputStream; 22 | import java.io.DataOutputStream; 23 | import java.io.IOException; 24 | import java.nio.ByteBuffer; 25 | /** Declares float read/write operations against a ByteBuffer, a string key, a size, and state that is written to and read from data streams. */ 26 | public interface FloatEncoder { 27 | /** Presumably decodes the float stored at the given position of the buffer; confirm against implementations. */ 28 | float read(ByteBuffer stream, int position); 29 | /** Presumably encodes the value into the buffer; where it is written is not visible here. */ 30 | void write(ByteBuffer stream, float value); 31 | 32 | String getKey(); 33 | 34 | void writeState(DataOutputStream out) throws IOException; 35 | 36 | void readState(DataInputStream in) throws IOException; 37 | /* NOTE(review): the unit of size() is not stated in this file; verify against implementations. */ 38 | int size(); 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/util/encoding/IntEncoder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package org.apache.joshua.util.encoding; 20 | 21 | import java.io.DataInputStream; 22 | import java.io.DataOutputStream; 23 | import java.io.IOException; 24 | import java.nio.ByteBuffer; 25 | /** Declares int read/write operations against a ByteBuffer, a string key, a size, and state that is written to and read from data streams. */ 26 | public interface IntEncoder { 27 | /** Presumably decodes the int stored at the given position of the buffer; confirm against implementations. */ 28 | int read(ByteBuffer stream, int position); 29 | /** Presumably encodes the value into the buffer; where it is written is not visible here. */ 30 | void write(ByteBuffer stream, int value); 31 | 32 | String getKey(); 33 | 34 | void writeState(DataOutputStream out) throws IOException; 35 | /* NOTE(review): declares no IOException, unlike writeState here and FloatEncoder.readState; implementations cannot propagate read failures as checked exceptions. */ 36 | void readState(DataInputStream in); 37 | 38 | int size(); 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/util/io/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | /** 20 | * Provides common utility classes for IO. 
21 | */ 22 | package org.apache.joshua.util.io; 23 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/util/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | /** 20 | * Provides common utility classes. 21 | */ 22 | package org.apache.joshua.util; 23 | -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/util/quantization/Quantizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package org.apache.joshua.util.quantization; 20 | 21 | import java.io.DataInputStream; 22 | import java.io.DataOutputStream; 23 | import java.io.IOException; 24 | import java.nio.ByteBuffer; 25 | /** Declares float read/write operations against a ByteBuffer plus initialize/add hooks, a string key, a size, and state that is written to and read from data streams. */ 26 | public interface Quantizer { 27 | /** Presumably decodes the float stored at the given position of the buffer; confirm against implementations. */ 28 | float read(ByteBuffer stream, int position); 29 | /** Presumably encodes the value into the buffer; where it is written is not visible here. */ 30 | void write(ByteBuffer stream, float value); 31 | 32 | void initialize(); 33 | /* NOTE(review): looks like add(float) feeds sample values to the quantizer before use; verify against callers. */ 34 | void add(float key); 35 | 36 | String getKey(); 37 | 38 | void writeState(DataOutputStream out) throws IOException; 39 | /* NOTE(review): declares no IOException although writeState does. */ 40 | void readState(DataInputStream in); 41 | 42 | int size(); 43 | } -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/util/quantization/StatelessQuantizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package org.apache.joshua.util.quantization; 20 | 21 | import java.io.DataInputStream; 22 | import java.io.DataOutputStream; 23 | import java.io.IOException; 24 | /** Base class for quantizers that keep no state: initialize, add and readState do nothing, and writeState writes only the key. */ 25 | abstract class StatelessQuantizer implements Quantizer { 26 | /** No-op. */ 27 | @Override 28 | public void initialize() {} 29 | /** No-op; the key argument is ignored. */ 30 | @Override 31 | public void add(float key) {} 32 | /** Writes the result of getKey() to the stream with writeUTF. */ 33 | @Override 34 | public void writeState(DataOutputStream out) throws IOException { 35 | out.writeUTF(getKey()); 36 | } 37 | /** No-op; nothing is consumed from the stream. */ 38 | @Override 39 | public void readState(DataInputStream in) {} 40 | } -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/util/quantization/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. 
See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package org.apache.joshua.util.quantization; -------------------------------------------------------------------------------- /src/main/java/org/apache/joshua/zmert/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | /** 20 | * Provides code for performing minimum error rate training. 21 | * Much of the code in this package is based on Och (2003). 22 | * A deeper description of the algorithm is in Zaidan (2009). 23 | */ 24 | package org.apache.joshua.zmert; 25 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 
4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # log4j settings 17 | log4j.rootLogger=WARN, stdout 18 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 19 | log4j.appender.stdout.Target=System.err 20 | log4j.appender.stdout.layout=org.apache.log4j.SimpleLayout 21 | -------------------------------------------------------------------------------- /src/test/java/org/apache/joshua/corpus/SpanTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package org.apache.joshua.corpus; 20 | 21 | import org.testng.Assert; 22 | import org.testng.annotations.Test; 23 | 24 | /** 25 | * Checks iteration over a Span. 26 | * 27 | * @author Lane Schwartz 28 | */ 29 | public class SpanTest { 30 | /** Asserts that iterating new Span(1,10) yields consecutive ints starting at 1. */ 31 | @Test 32 | public void iterator() { 33 | 34 | Span span = new Span(1,10); 35 | 36 | int expected = 1; 37 | 38 | for (int actual : span) { 39 | Assert.assertEquals(actual, expected); 40 | expected++; 41 | } 42 | /* NOTE(review): nothing checks how many values were visited, so an empty iteration would also pass. */ 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/test/java/org/apache/joshua/packed/README: -------------------------------------------------------------------------------- 1 | # This code generates the packed grammar representation from the grammar file 2 | rm -rf small_packed 3 | java -cp /home/hltcoe/mpost/code/joshua/bin:. joshua.tools.GrammarPacker packer.config small_packed small_grammar 4 | 5 | # This compiles and reads the grammar file 6 | java -cp $JOSHUA/bin:. CountRules small_packed 7 | -------------------------------------------------------------------------------- /src/test/java/org/apache/joshua/packed/packer.config: -------------------------------------------------------------------------------- 1 | #chunk_size 30000 2 | chunk_size 2500000 3 | 4 | quantizer boolean Abstract,Adjacent,ContainsX,GlueRule,Lexical,Monotonic,TargetTerminalsButNoSource 5 | quantizer float LexprobSourceGivenTarget,LexprobTargetGivenSource,PhrasePenalty,RarityPenalty,SourcePhraseGivenTarget,SourceTerminalsButNoTarget,TargetPhraseGivenSource 6 | quantizer byte TargetWords 7 | -------------------------------------------------------------------------------- /src/test/resources/berkeley_lm/lm: -------------------------------------------------------------------------------- 1 | 2 | \data\ 3 | ngram 1=5 4 | ngram 2=3 5 | 6 | \1-grams: 7 | -99.000000 8 | -99.000000 -1.752754 9 | -2.034158 the -0.800943 10 | -5.318589 chat-rooms -0.151088 11 | -1.495702 12 | 13 | \2-grams: 14 | -1.773970 the 
15 | -4.878868 the chat-rooms 16 | -0.499794 chat-rooms 17 | -------------------------------------------------------------------------------- /src/test/resources/bn-en/hiero/.gitignore: -------------------------------------------------------------------------------- 1 | diff 2 | log 3 | output 4 | output.scores 5 | -------------------------------------------------------------------------------- /src/test/resources/bn-en/hiero/class_lm_9gram.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/bn-en/hiero/class_lm_9gram.gz -------------------------------------------------------------------------------- /src/test/resources/bn-en/hiero/glue-grammar: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | -------------------------------------------------------------------------------- /src/test/resources/bn-en/hiero/grammar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/bn-en/hiero/grammar.gz -------------------------------------------------------------------------------- /src/test/resources/bn-en/hiero/joshua-berkeleylm.config: -------------------------------------------------------------------------------- 1 | # feature functions 2 | feature-function = LanguageModel -lm_type berkeleylm -lm_order 5 -lm_file src/test/resources/bn-en/hiero/lm.gz 3 | feature-function = OOVPenalty 4 | feature-function = WordPenalty 5 | 6 | # tm 7 | tm = thrax -owner pt -maxspan 12 -path src/test/resources/bn-en/hiero/grammar.gz 8 | tm = thrax -owner glue -maxspan -1 -path src/test/resources/bn-en/hiero/glue-grammar 9 | 10 | mark-oovs = false 11 | 12 | # tm 
config 13 | default_non_terminal=X 14 | goalSymbol=GOAL 15 | 16 | # pruning config 17 | pop-limit = 100 18 | 19 | # nbest config 20 | use_unique_nbest = true 21 | top_n = 10 22 | 23 | # output format 24 | output-format = "%c %s" 25 | 26 | # model weights 27 | lm_0 1.2373676802179452 28 | tm_pt_0 -2.4497429277910214 29 | tm_pt_1 0.7224581556224123 30 | tm_pt_2 -0.31689069155153504 31 | tm_pt_3 0.33861043967238036 32 | tm_pt_4 0.03553113401320236 33 | tm_pt_5 0.19138972284064748 34 | tm_pt_6 0.3417994095521415 35 | tm_pt_7 -0.9936312455671283 36 | tm_pt_8 0.9070737587091975 37 | tm_pt_9 0.8202511858619419 38 | tm_pt_10 0.2593091306160006 39 | tm_pt_11 0.25597137004462134 40 | tm_pt_12 0.3538894647790496 41 | tm_pt_13 -0.36212061186692646 42 | tm_pt_14 -0.32923261148678096 43 | tm_pt_15 0.5524863522177359 44 | tm_pt_16 0.23451595442127693 45 | tm_glue_0 1 46 | WordPenalty -3.6942747832593694 47 | OOVPenalty 1.0 48 | -------------------------------------------------------------------------------- /src/test/resources/bn-en/hiero/joshua.config: -------------------------------------------------------------------------------- 1 | # feature functions 2 | feature-function = LanguageModel -lm_type kenlm -lm_order 5 -minimizing false -lm_file src/test/resources/bn-en/hiero/lm.gz 3 | feature-function = OOVPenalty 4 | feature-function = WordPenalty 5 | 6 | # tm 7 | tm = thrax -owner pt -maxspan 12 -path src/test/resources/bn-en/hiero/grammar.gz 8 | tm = thrax -owner glue -maxspan -1 -path src/test/resources/bn-en/hiero/glue-grammar 9 | 10 | mark_oovs=false 11 | 12 | # tm config 13 | default_non_terminal=X 14 | goalSymbol=GOAL 15 | 16 | # pruning config 17 | pop-limit = 10 18 | 19 | # nbest config 20 | use_unique_nbest=true 21 | top_n = 10 22 | 23 | # output format 24 | output-format = "%c %s" 25 | 26 | # model weights 27 | lm_0 1.2373676802179452 28 | lm_1 1.2373676802179452 29 | tm_pt_0 -2.4497429277910214 30 | tm_pt_1 0.7224581556224123 31 | tm_pt_2 -0.31689069155153504 32 | 
tm_pt_3 0.33861043967238036 33 | tm_pt_4 0.03553113401320236 34 | tm_pt_5 0.19138972284064748 35 | tm_pt_6 0.3417994095521415 36 | tm_pt_7 -0.9936312455671283 37 | tm_pt_8 0.9070737587091975 38 | tm_pt_9 0.8202511858619419 39 | tm_pt_10 0.2593091306160006 40 | tm_pt_11 0.25597137004462134 41 | tm_pt_12 0.3538894647790496 42 | tm_pt_13 -0.36212061186692646 43 | tm_pt_14 -0.32923261148678096 44 | tm_pt_15 0.5524863522177359 45 | tm_pt_16 0.23451595442127693 46 | tm_glue_0 1 47 | WordPenalty -3.6942747832593694 48 | OOVPenalty 1.0 49 | -------------------------------------------------------------------------------- /src/test/resources/bn-en/hiero/lm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/bn-en/hiero/lm.gz -------------------------------------------------------------------------------- /src/test/resources/bn-en/packed/.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | diff 3 | output.bleu 4 | -------------------------------------------------------------------------------- /src/test/resources/bn-en/packed/grammar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/bn-en/packed/grammar.gz -------------------------------------------------------------------------------- /src/test/resources/bn-en/packed/grammar.packed/encoding: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/bn-en/packed/grammar.packed/encoding -------------------------------------------------------------------------------- /src/test/resources/bn-en/packed/grammar.packed/slice_00000.features: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/bn-en/packed/grammar.packed/slice_00000.features -------------------------------------------------------------------------------- /src/test/resources/bn-en/packed/grammar.packed/slice_00000.source: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/bn-en/packed/grammar.packed/slice_00000.source -------------------------------------------------------------------------------- /src/test/resources/bn-en/packed/grammar.packed/slice_00000.target: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/bn-en/packed/grammar.packed/slice_00000.target -------------------------------------------------------------------------------- /src/test/resources/bn-en/packed/grammar.packed/slice_00000.target.lookup: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/bn-en/packed/grammar.packed/slice_00000.target.lookup -------------------------------------------------------------------------------- /src/test/resources/bn-en/packed/grammar.packed/vocabulary: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/bn-en/packed/grammar.packed/vocabulary -------------------------------------------------------------------------------- /src/test/resources/bn-en/packed/joshua.config: -------------------------------------------------------------------------------- 1 | lm = 
kenlm 5 false false 100 src/test/resources/bn-en/packed/lm.gz 2 | 3 | tm = thrax -owner pt -maxspan 12 -path src/test/resources/bn-en/packed/grammar.packed 4 | tm = thrax -owner glue -maxspan -1 -path src/test/resources/bn-en/packed/grammar.glue 5 | 6 | mark_oovs = false 7 | 8 | #tm config 9 | default_non_terminal = OOV 10 | goalSymbol = GOAL 11 | 12 | #pruning config 13 | num-translation-options = 0 14 | pop-limit = 10 15 | 16 | #nbest config 17 | use_unique_nbest = true 18 | top_n = 10 19 | 20 | feature-function = OOVPenalty 21 | feature-function = WordPenalty 22 | 23 | # output format 24 | output-format = "%c %s" 25 | 26 | ###### model weights 27 | #lm order weight 28 | lm_0 1.3200621467242506 29 | 30 | #phrasemodel owner column(0-indexed) weight 31 | tm_pt_0 0.4571255198114019 32 | tm_pt_1 -0.17399038425384106 33 | tm_pt_2 -0.784547842535801 34 | tm_pt_3 0.76254324621594 35 | tm_pt_4 -0.8628695028838571 36 | tm_pt_5 0.04258438925263152 37 | tm_pt_6 0.5278815893934184 38 | tm_pt_7 0.9255662450788644 39 | tm_pt_8 0.03385066779097645 40 | tm_pt_9 0.9918446849428446 41 | tm_pt_10 0.52186013168725 42 | tm_pt_11 -0.7874679555197446 43 | tm_pt_12 -0.03770136145251124 44 | tm_pt_13 0.37085201114442157 45 | tm_pt_14 0.34054825749510886 46 | tm_pt_15 0.008348471483412778 47 | tm_pt_16 0.7984119288127296 48 | tm_glue_0 1 49 | WordPenalty -3.0476045270236662 50 | OOVPenalty 1.0 51 | -------------------------------------------------------------------------------- /src/test/resources/bn-en/packed/lm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/bn-en/packed/lm.gz -------------------------------------------------------------------------------- /src/test/resources/bn-en/samt/grammar.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/bn-en/samt/grammar.gz -------------------------------------------------------------------------------- /src/test/resources/bn-en/samt/joshua.config: -------------------------------------------------------------------------------- 1 | lm = kenlm 5 false false 100 src/test/resources/bn-en/samt/lm.gz 2 | 3 | tm = thrax -owner pt -maxspan 12 -path src/test/resources/bn-en/samt/grammar.gz 4 | tm = thrax -owner glue -maxspan -1 -path src/test/resources/bn-en/samt/grammar.glue 5 | 6 | mark_oovs=false 7 | 8 | #tm config 9 | default_non_terminal = OOV 10 | goalSymbol = GOAL 11 | 12 | #pruning config 13 | num-translation-options = 0 14 | pop-limit = 10 15 | 16 | #nbest config 17 | use_unique_nbest = true 18 | top_n = 10 19 | 20 | feature-function = OOVPenalty 21 | feature-function = WordPenalty 22 | 23 | # output format 24 | output-format = "%c %s" 25 | 26 | 27 | ###### model weights 28 | #lm order weight 29 | lm_0 1.3200621467242506 30 | 31 | #phrasemodel owner column(0-indexed) weight 32 | tm_pt_0 0.4571255198114019 33 | tm_pt_1 -0.17399038425384106 34 | tm_pt_2 -0.784547842535801 35 | tm_pt_3 0.76254324621594 36 | tm_pt_4 -0.8628695028838571 37 | tm_pt_5 0.04258438925263152 38 | tm_pt_6 0.5278815893934184 39 | tm_pt_7 0.9255662450788644 40 | tm_pt_8 0.03385066779097645 41 | tm_pt_9 0.9918446849428446 42 | tm_pt_10 0.52186013168725 43 | tm_pt_11 -0.7874679555197446 44 | tm_pt_12 -0.03770136145251124 45 | tm_pt_13 0.37085201114442157 46 | tm_pt_14 0.34054825749510886 47 | tm_pt_15 0.008348471483412778 48 | tm_pt_16 0.7984119288127296 49 | tm_glue_0 1 50 | WordPenalty -3.0476045270236662 51 | OOVPenalty 1.0 52 | -------------------------------------------------------------------------------- /src/test/resources/bn-en/samt/lm.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/bn-en/samt/lm.gz -------------------------------------------------------------------------------- /src/test/resources/data/tiny.en: -------------------------------------------------------------------------------- 1 | resumption of the session 2 | i declare resumed the session of the european parliament adjourned on friday 17 december 1999 , and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period . 3 | you have requested a debate on this subject in the course of the next few days , during this part-session . 4 | please rise , then , for this minute ' s silence . 5 | ( the house rose and observed a minute ' s silence ) 6 | -------------------------------------------------------------------------------- /src/test/resources/decoder/constrained/.gitignore: -------------------------------------------------------------------------------- 1 | diff 2 | log 3 | output 4 | output.scores 5 | -------------------------------------------------------------------------------- /src/test/resources/decoder/constrained/glue-grammar: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | -------------------------------------------------------------------------------- /src/test/resources/decoder/constrained/grammar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/constrained/grammar.gz -------------------------------------------------------------------------------- /src/test/resources/decoder/constrained/input.bn: -------------------------------------------------------------------------------- 1 | গণিত তাই বিজ্ঞানের ভাষা । 2 | 
গণিত তাই বিজ্ঞানের ভাষা । ||| mathematics is science language . 3 | রবীন্দ্রনাথের জন্ম হয়েছিল কলকাতার এক পিরালী ব্রাহ্মণ পরিবারে । 4 | রবীন্দ্রনাথের জন্ম হয়েছিল কলকাতার এক পিরালী ব্রাহ্মণ পরিবারে । ||| rabindranath was born in a পিরালী ব্রাহ্মণ in the family 5 | রবীন্দ্রনাথের জন্ম হয়েছিল কলকাতার এক পিরালী ব্রাহ্মণ পরিবারে । ||| rabindranath born in kolkata is a পিরালী ব্রাহ্মণ in the family 6 | রবীন্দ্রনাথের জন্ম হয়েছিল কলকাতার এক পিরালী ব্রাহ্মণ পরিবারে । ||| rabindranath born in kolkata is one পিরালী ব্রাহ্মণ in the family 7 | রবীন্দ্রনাথের জন্ম হয়েছিল কলকাতার এক পিরালী ব্রাহ্মণ পরিবারে । ||| rabindranath was born in kolkata is a পিরালী ব্রাহ্মণ in the family 8 | রবীন্দ্রনাথের জন্ম হয়েছিল কলকাতার এক পিরালী ব্রাহ্মণ পরিবারে । ||| rabindranath born in kolkata was one পিরালী ব্রাহ্মণ in the family 9 | -------------------------------------------------------------------------------- /src/test/resources/decoder/constrained/joshua.config: -------------------------------------------------------------------------------- 1 | lm = kenlm 5 false false 100 src/test/resources/decoder/constrained/lm.gz 2 | 3 | tm = thrax pt 12 src/test/resources/decoder/constrained/grammar.gz 4 | tm = thrax glue -1 src/test/resources/decoder/constrained/glue-grammar 5 | 6 | mark_oovs = false 7 | 8 | default-non-terminal = X 9 | goalSymbol = GOAL 10 | 11 | #pruning config 12 | pop-limit = 100 13 | 14 | output-format = %c %s 15 | 16 | #nbest config 17 | use_unique_nbest = true 18 | top_n = 10 19 | 20 | feature-function = WordPenalty 21 | feature-function = OOVPenalty 22 | 23 | 24 | lm_0 1.2373676802179452 25 | 26 | tm_pt_0 -2.4497429277910214 27 | tm_pt_1 0.7224581556224123 28 | tm_pt_2 -0.31689069155153504 29 | tm_pt_3 0.33861043967238036 30 | tm_pt_4 0.03553113401320236 31 | tm_pt_5 0.19138972284064748 32 | tm_pt_6 0.3417994095521415 33 | tm_pt_7 -0.9936312455671283 34 | tm_pt_8 0.9070737587091975 35 | tm_pt_9 0.8202511858619419 36 | tm_pt_10 0.2593091306160006 37 | tm_pt_11 
0.25597137004462134 38 | tm_pt_12 0.3538894647790496 39 | tm_pt_13 -0.36212061186692646 40 | tm_pt_14 -0.32923261148678096 41 | tm_pt_15 0.5524863522177359 42 | tm_pt_16 0.23451595442127693 43 | tm_glue_0 1 44 | WordPenalty -3.6942747832593694 45 | OOVPenalty 1.0 46 | -------------------------------------------------------------------------------- /src/test/resources/decoder/constrained/lm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/constrained/lm.gz -------------------------------------------------------------------------------- /src/test/resources/decoder/dont-crash/input: -------------------------------------------------------------------------------- 1 | [] 2 | [X] 3 | ||| 4 | | 5 | ((( 6 | || | | 7 | || | 8 | | asdf| 9 | || 10 | | ?| test 11 | -------------------------------------------------------------------------------- /src/test/resources/decoder/left-state/glue-grammar: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | -------------------------------------------------------------------------------- /src/test/resources/decoder/left-state/grammar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/left-state/grammar.gz -------------------------------------------------------------------------------- /src/test/resources/decoder/left-state/input.bn: -------------------------------------------------------------------------------- 1 | রবীন্দ্রনাথের জন্ম হয়েছিল কলকাতার এক পিরালী ব্রাহ্মণ পরিবারে । 2 | সাম্প্রতিককালে ভারতের সঙ্গে যুক্তরাষ্ট্রের সম্পর্কের উন্নতি হয়েছে । 3 | 
-------------------------------------------------------------------------------- /src/test/resources/decoder/left-state/joshua.config: -------------------------------------------------------------------------------- 1 | lm = kenlm 5 true false 100 src/test/resources/decoder/left-state/lm.gz 2 | 3 | tm = thrax pt 12 src/test/resources/decoder/left-state/grammar.gz 4 | tm = thrax glue -1 src/test/resources/decoder/left-state/glue-grammar 5 | 6 | mark_oovs=false 7 | 8 | #tm config 9 | default_non_terminal=X 10 | goalSymbol=GOAL 11 | 12 | #pruning config 13 | pop-limit=100 14 | 15 | #nbest config 16 | use_unique_nbest=true 17 | top-n = 300 18 | 19 | feature_function = WordPenalty 20 | feature_function = OOVPenalty 21 | 22 | output-format = "%c %s" 23 | 24 | 25 | # Model Weights #### 26 | 27 | lm_0 1.2373676802179452 28 | tm_pt_0 -2.4497429277910214 29 | tm_pt_1 0.7224581556224123 30 | tm_pt_2 -0.31689069155153504 31 | tm_pt_3 0.33861043967238036 32 | tm_pt_4 0.03553113401320236 33 | tm_pt_5 0.19138972284064748 34 | tm_pt_6 0.3417994095521415 35 | tm_pt_7 -0.9936312455671283 36 | tm_pt_8 0.9070737587091975 37 | tm_pt_9 0.8202511858619419 38 | tm_pt_10 0.2593091306160006 39 | tm_pt_11 0.25597137004462134 40 | tm_pt_12 0.3538894647790496 41 | tm_pt_13 -0.36212061186692646 42 | tm_pt_14 -0.32923261148678096 43 | tm_pt_15 0.5524863522177359 44 | tm_pt_16 0.23451595442127693 45 | tm_glue_0 1 46 | WordPenalty -3.6942747832593694 47 | OOVPenalty 1.0 48 | -------------------------------------------------------------------------------- /src/test/resources/decoder/left-state/lm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/left-state/lm.gz -------------------------------------------------------------------------------- /src/test/resources/decoder/lowercaser/grammar.glue: 
-------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | [GOAL] ||| [X,1] ||| [X,1] ||| 0 5 | -------------------------------------------------------------------------------- /src/test/resources/decoder/lowercaser/grammar.test: -------------------------------------------------------------------------------- 1 | [X] ||| ella ||| she ||| 1 ||| 0-0 2 | -------------------------------------------------------------------------------- /src/test/resources/decoder/moses-compat/NEEDS_UPDATING: -------------------------------------------------------------------------------- 1 | Needs to be moved to a unit test. The parameter JoshuaConfiguration.moses is handled by JoshuaDecoder. Therefore, the CLI must be made testable before a unit test can be created. 2 | -------------------------------------------------------------------------------- /src/test/resources/decoder/moses-compat/n-best.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/moses-compat/n-best.txt -------------------------------------------------------------------------------- /src/test/resources/decoder/moses-compat/output.expected: -------------------------------------------------------------------------------- 1 | 0 ||| help ||| tm_glue_0=1.0 ||| 0.0 2 | 0 ||| help ||| tm_glue_0=1.0 ||| 0.0 3 | -------------------------------------------------------------------------------- /src/test/resources/decoder/n-ary/glue-grammar: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [S,2] ||| [GOAL,1] [S,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | -------------------------------------------------------------------------------- 
/src/test/resources/decoder/n-ary/grammar: -------------------------------------------------------------------------------- 1 | [NP-S] ||| GOATS ||| Goats ||| 0 2 | [VP] ||| EAT ||| eat ||| 0 3 | [NP-O] ||| CHEESE ||| cheese ||| 0 4 | [S] ||| [NP-O,1] [NP-S,2] [VP,3] ||| [NP-S,2] [VP,3] [NP-O,1] ||| 0 5 | [A] ||| 1 ||| i ||| 0 6 | [B] ||| 2 ||| will ||| 0 7 | [C] ||| 3 ||| go ||| 0 8 | [D] ||| 4 ||| home ||| 0 9 | [S] ||| [C,1] [A,2] [D,3] [B,4] ||| [A,2] [B,4] [C,1] [D,3] ||| 0 10 | -------------------------------------------------------------------------------- /src/test/resources/decoder/n-ary/input.txt: -------------------------------------------------------------------------------- 1 | CHEESE GOATS EAT 2 | 3 1 4 2 3 | -------------------------------------------------------------------------------- /src/test/resources/decoder/n-ary/joshua.config: -------------------------------------------------------------------------------- 1 | lm = kenlm 5 false false 100 src/test/resources/decoder/n-ary/lm.gz 2 | 3 | tm = thrax phrase 20 src/test/resources/decoder/n-ary/grammar 4 | tm = thrax glue -1 src/test/resources/decoder/n-ary/glue-grammar 5 | 6 | mark_oovs = true 7 | 8 | default-non-terminal = X 9 | goalSymbol = GOAL 10 | 11 | #pruning config 12 | pop-limit = 100 13 | 14 | #nbest config 15 | use_unique_nbest = true 16 | top_n = 1 17 | 18 | output-format = %c %s 19 | 20 | weights-file = src/test/resources/decoder/n-ary/weights 21 | feature-function = WordPenalty 22 | feature-function = OOVPenalty 23 | -------------------------------------------------------------------------------- /src/test/resources/decoder/n-ary/lm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/n-ary/lm.gz -------------------------------------------------------------------------------- /src/test/resources/decoder/n-ary/output.gold: 
-------------------------------------------------------------------------------- 1 | -11.503 Goats eat cheese 2 | -4.414 i will go home 3 | -------------------------------------------------------------------------------- /src/test/resources/decoder/n-ary/weights: -------------------------------------------------------------------------------- 1 | lm_0 1.2373676802179452 2 | 3 | tm_phrase_0 1 4 | tm_glue_0 1 5 | WordPenalty -3.6942747832593694 6 | OOVPenalty -100.0 7 | -------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/README: -------------------------------------------------------------------------------- 1 | Tests that num_translation_options is enforced for hierarchical decoders 2 | -------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/glue-grammar: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | -------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/grammar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/num_translation_options/grammar.gz -------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/grammar.packed/encoding: -------------------------------------------------------------------------------- 1 | bytebyte0 -------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.features: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.features -------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.source: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.source -------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.target: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.target -------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.target.lookup: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/grammar.packed/vocabulary: -------------------------------------------------------------------------------- 1 | liketaco[X]putbelltolerateup 2 | appreciate 0 3 | with yo quiero lovei -------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/input: -------------------------------------------------------------------------------- 1 | yo quiero taco bell 2 | 
-------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/joshua-packed.config: -------------------------------------------------------------------------------- 1 | num_translation_options = 3 2 | 3 | lm = kenlm 5 false false 100 src/test/resources/decoder/num_translation_options/lm.gz 4 | 5 | tm = thrax pt 12 src/test/resources/decoder/num_translation_options/grammar.packed 6 | tm = thrax glue -1 src/test/resources/decoder/num_translation_options/glue-grammar 7 | 8 | mark_oovs = false 9 | 10 | default-non-terminal = X 11 | goalSymbol = GOAL 12 | 13 | #pruning config 14 | pop-limit = 100 15 | 16 | output-format = %c ||| %s ||| %f 17 | 18 | #nbest config 19 | use_unique_nbest = true 20 | top_n = 5 21 | 22 | feature-function = WordPenalty 23 | feature-function = OOVPenalty 24 | 25 | lm_0 1.2373676802179452 26 | 27 | tm_pt_0 1 28 | tm_glue_0 1 29 | WordPenalty 1 30 | OOVPenalty 1.0 31 | -------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/joshua.config: -------------------------------------------------------------------------------- 1 | num_translation_options = 3 2 | 3 | lm = kenlm 5 false false 100 src/test/resources/decoder/num_translation_options/lm.gz 4 | 5 | tm = thrax pt 12 src/test/resources/decoder/num_translation_options/grammar.gz 6 | tm = thrax glue -1 src/test/resources/decoder/num_translation_options/glue-grammar 7 | 8 | mark_oovs = false 9 | 10 | default-non-terminal = X 11 | goalSymbol = GOAL 12 | 13 | #pruning config 14 | pop-limit = 100 15 | 16 | output-format = %c ||| %s ||| %f 17 | 18 | #nbest config 19 | use_unique_nbest = true 20 | top_n = 5 21 | 22 | feature-function = WordPenalty 23 | feature-function = OOVPenalty 24 | 25 | lm_0 1.2373676802179452 26 | 27 | tm_pt_0 1 28 | tm_glue_0 1 29 | WordPenalty 1 30 | OOVPenalty 1.0 31 | 
-------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/lm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/num_translation_options/lm.gz -------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/output-no-dot-chart.gold: -------------------------------------------------------------------------------- 1 | -19.196 ||| i like taco bell ||| tm_pt_0=4.000 tm_glue_0=1.000 lm_0=-17.449 WordPenalty=-2.606 OOVPenalty=0.000 2 | -19.733 ||| i love taco bell ||| tm_pt_0=5.000 tm_glue_0=1.000 lm_0=-18.690 WordPenalty=-2.606 OOVPenalty=0.000 3 | -22.883 ||| i appreciate taco bell ||| tm_pt_0=3.000 tm_glue_0=1.000 lm_0=-19.620 WordPenalty=-2.606 OOVPenalty=0.000 4 | -424.954 ||| yo quiero taco bell ||| tm_pt_0=0.000 tm_glue_0=4.000 lm_0=-21.293 WordPenalty=-2.606 OOVPenalty=-400.000 5 | -------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/output-packed.gold: -------------------------------------------------------------------------------- 1 | -19.196 ||| i like taco bell ||| tm_pt_0=4.000 tm_glue_0=1.000 lm_0=-17.449 WordPenalty=-2.606 OOVPenalty=0.000 2 | -19.733 ||| i love taco bell ||| tm_pt_0=5.000 tm_glue_0=1.000 lm_0=-18.690 WordPenalty=-2.606 OOVPenalty=0.000 3 | -22.883 ||| i appreciate taco bell ||| tm_pt_0=3.000 tm_glue_0=1.000 lm_0=-19.620 WordPenalty=-2.606 OOVPenalty=0.000 4 | -424.954 ||| yo quiero taco bell ||| tm_pt_0=0.000 tm_glue_0=4.000 lm_0=-21.293 WordPenalty=-2.606 OOVPenalty=-400.000 5 | -------------------------------------------------------------------------------- /src/test/resources/decoder/num_translation_options/output.gold: 
-------------------------------------------------------------------------------- 1 | -19.196 ||| i like taco bell ||| tm_pt_0=4.000 tm_glue_0=1.000 lm_0=-17.449 WordPenalty=-2.606 OOVPenalty=0.000 2 | -19.733 ||| i love taco bell ||| tm_pt_0=5.000 tm_glue_0=1.000 lm_0=-18.690 WordPenalty=-2.606 OOVPenalty=0.000 3 | -22.883 ||| i appreciate taco bell ||| tm_pt_0=3.000 tm_glue_0=1.000 lm_0=-19.620 WordPenalty=-2.606 OOVPenalty=0.000 4 | -424.954 ||| yo quiero taco bell ||| tm_pt_0=0.000 tm_glue_0=4.000 lm_0=-21.293 WordPenalty=-2.606 OOVPenalty=-400.000 5 | -------------------------------------------------------------------------------- /src/test/resources/decoder/oov-list/glue-grammar: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [S,2] ||| [GOAL,1] [S,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | -------------------------------------------------------------------------------- /src/test/resources/decoder/oov-list/grammar: -------------------------------------------------------------------------------- 1 | [NP-S] ||| GOATS ||| Goats ||| 0 2 | [VP] ||| EAT ||| eat ||| 0 3 | [NP-O] ||| CHEESE ||| cheese ||| 0 4 | [VP] ||| [VB,1] ||| [VB,1] ||| 0 5 | [S] ||| [NP-O,1] [NP-S,2] [VP,3] ||| [NP-S,2] [VP,3] [NP-O,1] ||| 0 6 | [S] ||| [NP,1] [VP,2] [NP,3] ||| [NP,1] [VP,2] [NP,3] ||| 0 7 | [A] ||| 1 ||| i ||| 0 8 | [B] ||| 2 ||| will ||| 0 9 | [C] ||| 3 ||| go ||| 0 10 | [D] ||| 4 ||| home ||| 0 11 | [S] ||| [C,1] [A,2] [D,3] [B,4] ||| [A,2] [B,4] [C,1] [D,3] ||| 0 12 | -------------------------------------------------------------------------------- /src/test/resources/decoder/oov-list/input.txt: -------------------------------------------------------------------------------- 1 | CHEESE GOATS EAT 2 | 3 1 4 2 3 | goets eet cheez 4 | -------------------------------------------------------------------------------- /src/test/resources/decoder/oov-list/joshua.config: 
-------------------------------------------------------------------------------- 1 | lm = kenlm 5 false false 100 src/test/resources/decoder/oov-list/lm.gz 2 | 3 | tm = thrax phrase 20 src/test/resources/decoder/oov-list/grammar 4 | tm = thrax glue -1 src/test/resources/decoder/oov-list/glue-grammar 5 | 6 | mark_oovs = true 7 | 8 | default-non-terminal = X 9 | goalSymbol = GOAL 10 | 11 | #pruning config 12 | pop-limit = 100 13 | 14 | #nbest config 15 | use_unique_nbest = true 16 | use_tree_nbest = false 17 | top_n = 1 18 | 19 | oov-list = CD 0.0488752 JJ 0.186114 NN 0.291795 NNS 0.0894967 NP 0.117171 OOV 0.033015 VB 0.0313967 VBG 0.0404596 VBN 0.0317203 20 | 21 | output-format=%s ||| %f ||| %c 22 | 23 | feature-function = WordPenalty 24 | feature-function = OOVPenalty 25 | 26 | lm_0 1.2373676802179452 27 | 28 | tm_phrase_0 1 29 | tm_glue_0 1 30 | WordPenalty -3.6942747832593694 31 | OOVPenalty 1.0 32 | -------------------------------------------------------------------------------- /src/test/resources/decoder/oov-list/lm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/oov-list/lm.gz -------------------------------------------------------------------------------- /src/test/resources/decoder/oov-list/output.gold: -------------------------------------------------------------------------------- 1 | Goats eat cheese ||| tm_phrase_0=0.000 tm_glue_0=1.000 lm_0=-16.587 WordPenalty=-2.171 OOVPenalty=0.000 ||| -11.503 2 | i will go home ||| tm_phrase_0=0.000 tm_glue_0=1.000 lm_0=-12.155 WordPenalty=-2.606 OOVPenalty=0.000 ||| -4.414 3 | goets_OOV eet_OOV cheez_OOV ||| tm_phrase_0=0.000 tm_glue_0=1.000 lm_0=-17.700 WordPenalty=-2.171 OOVPenalty=-7.749 ||| -20.629 4 | -------------------------------------------------------------------------------- /src/test/resources/decoder/phrase/unique-hypotheses/joshua.config: 
-------------------------------------------------------------------------------- 1 | tm = moses pt 0 src/test/resources/decoder/phrase/unique-hypotheses/rules.1.gz 2 | default-non-terminal = X 3 | goal-symbol = GOAL 4 | lm = kenlm 5 true false 100 src/test/resources/decoder/phrase/unique-hypotheses/lm.1.gz 5 | mark-oovs = false 6 | pop-limit = 100 7 | top-n = 300 8 | use-unique-nbest = true 9 | output-format = %s 10 | include-align-index = false 11 | feature-function = OOVPenalty 12 | feature-function = WordPenalty 13 | feature_function = Distortion 14 | feature_function = PhrasePenalty 15 | lm_0 1.0 16 | tm_pt_1 1.0 17 | tm_pt_3 1.0 18 | tm_pt_0 1.0 19 | tm_pt_2 1.0 20 | WordPenalty -2.844814 21 | OOVPenalty 1.0 22 | PhrasePenalty 1.0 23 | Distortion 1.0 24 | -------------------------------------------------------------------------------- /src/test/resources/decoder/phrase/unique-hypotheses/lm.1.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/phrase/unique-hypotheses/lm.1.gz -------------------------------------------------------------------------------- /src/test/resources/decoder/phrase/unique-hypotheses/rules.1.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/phrase/unique-hypotheses/rules.1.gz -------------------------------------------------------------------------------- /src/test/resources/decoder/rescoring/glue-grammar: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | -------------------------------------------------------------------------------- /src/test/resources/decoder/rescoring/grammar.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/rescoring/grammar.gz -------------------------------------------------------------------------------- /src/test/resources/decoder/rescoring/input.txt: -------------------------------------------------------------------------------- 1 | el nino tomo la cucaracha ||| ||| the boy ate the cockroach 2 | el nino tomo la cucaracha ||| ||| the big storm swarmed the coast ||| the big storm only swarmed the coast 3 | -------------------------------------------------------------------------------- /src/test/resources/decoder/rescoring/joshua.config: -------------------------------------------------------------------------------- 1 | rescore-forest = true 2 | rescore-forest-weight = 100 3 | 4 | lm = kenlm 5 false false 100 src/test/resources/decoder/rescoring/lm.gz 5 | 6 | tm = thrax pt 12 src/test/resources/decoder/rescoring/grammar.gz 7 | tm = thrax glue -1 src/test/resources/decoder/rescoring/glue-grammar 8 | 9 | mark-oovs = true 10 | 11 | default-non-terminal = X 12 | goalSymbol = GOAL 13 | 14 | #pruning config 15 | pop-limit = 100 16 | 17 | output-format = %s ||| %f ||| %c 18 | 19 | #nbest config 20 | use_unique_nbest = true 21 | top_n = 2 22 | 23 | feature-function = WordPenalty 24 | feature-function = OOVPenalty 25 | 26 | lm_0 1.2373676802179452 27 | 28 | tm_pt_0 1 29 | tm_glue_0 1 30 | WordPenalty -1 31 | OOVPenalty 1.0 32 | -------------------------------------------------------------------------------- /src/test/resources/decoder/rescoring/lm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/rescoring/lm.gz -------------------------------------------------------------------------------- 
/src/test/resources/decoder/source-annotations/grammar: -------------------------------------------------------------------------------- 1 | [X] ||| mis ||| my ||| 1 ||| 0-0 2 | [X] ||| amigos ||| friends ||| 1 ||| 0-0 3 | [X] ||| me ||| me ||| 1 ||| 0-0 4 | [X] ||| llaman ||| call ||| 1 ||| 0-0 5 | [X] ||| me llaman ||| call me ||| 1 ||| 0-1 1-0 6 | -------------------------------------------------------------------------------- /src/test/resources/decoder/source-annotations/grammar.glue: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | -------------------------------------------------------------------------------- /src/test/resources/decoder/source-annotations/input.txt: -------------------------------------------------------------------------------- 1 | mis[tag=ADJ;num=PL;class=OOV] amigos me llaman 2 | -------------------------------------------------------------------------------- /src/test/resources/decoder/source-annotations/lm.kenlm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/source-annotations/lm.kenlm -------------------------------------------------------------------------------- /src/test/resources/decoder/source-annotations/output.gold: -------------------------------------------------------------------------------- 1 | 0 ||| my friends call me ||| tm_pt_0=-3.000 tm_glue_0=3.000 lm_0=-11.974 OOVPenalty=0.000 WordPenalty=-2.606 ||| -7.650 2 | 0 ||| my friends call me ||| tm_pt_0=-3.000 tm_glue_0=3.000 lm_0=-111.513 OOVPenalty=0.000 WordPenalty=-2.606 ||| -107.189 3 | -------------------------------------------------------------------------------- /src/test/resources/decoder/target-bigram/vocab: 
-------------------------------------------------------------------------------- 1 | 1 this 1 2 | 2 is 17 3 | 3 a 42 4 | 4 test 9 5 | -------------------------------------------------------------------------------- /src/test/resources/decoder/tree-output/glue-grammar: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [A,2] ||| [GOAL,1] [A,2] ||| -1 3 | [GOAL] ||| [GOAL,1] [D,2] ||| [GOAL,1] [D,2] ||| -1 4 | [GOAL] ||| [GOAL,1] [S,2] ||| [GOAL,1] [S,2] ||| -1 5 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 6 | [GOAL] ||| [NP\DT] ||| [NP\DT,1] ||| -1 7 | -------------------------------------------------------------------------------- /src/test/resources/decoder/tree-output/grammar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/tree-output/grammar.gz -------------------------------------------------------------------------------- /src/test/resources/decoder/tree-output/input: -------------------------------------------------------------------------------- 1 | foo bar baz 2 | an unparseable sentence 3 | baz 4 | yo soy 5 | purchase xslot 6 | -------------------------------------------------------------------------------- /src/test/resources/decoder/tree-output/joshua.config: -------------------------------------------------------------------------------- 1 | lm = kenlm 5 false false 100 src/test/resources/decoder/tree-output/lm.gz 2 | 3 | tm = thrax pt 12 src/test/resources/decoder/tree-output/grammar.gz 4 | tm = thrax glue -1 src/test/resources/decoder/tree-output/glue-grammar 5 | 6 | mark_oovs = false 7 | 8 | default-non-terminal = X 9 | goalSymbol = GOAL 10 | 11 | #pruning config 12 | pop-limit = 100 13 | 14 | output-format = %t 15 | 16 | #nbest config 17 | use_unique_nbest = true 18 | top_n = 1 19 | 20 | feature-function = WordPenalty 21 | 
feature-function = OOVPenalty 22 | 23 | 24 | lm_0 1.2373676802179452 25 | 26 | tm_pt_0 -2.4497429277910214 27 | tm_pt_1 0.7224581556224123 28 | tm_pt_2 -0.31689069155153504 29 | tm_pt_3 0.33861043967238036 30 | tm_pt_4 0.03553113401320236 31 | tm_pt_5 0.19138972284064748 32 | tm_pt_6 0.3417994095521415 33 | tm_pt_7 -0.9936312455671283 34 | tm_pt_8 0.9070737587091975 35 | tm_pt_9 0.8202511858619419 36 | tm_pt_10 0.2593091306160006 37 | tm_pt_11 0.25597137004462134 38 | tm_pt_12 0.3538894647790496 39 | tm_pt_13 -0.36212061186692646 40 | tm_pt_14 -0.32923261148678096 41 | tm_pt_15 0.5524863522177359 42 | tm_pt_16 0.23451595442127693 43 | tm_glue_0 1 44 | WordPenalty -3.6942747832593694 45 | OOVPenalty 1.0 46 | -------------------------------------------------------------------------------- /src/test/resources/decoder/tree-output/lm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/decoder/tree-output/lm.gz -------------------------------------------------------------------------------- /src/test/resources/decoder/tree-output/output.gold: -------------------------------------------------------------------------------- 1 | (GOAL{0-5} (GOAL{0-4} (GOAL{0-3} (GOAL{0-1} ) (A{1-3} (B{1-2} foo) (C{2-3} bar))) (D{3-4} baz)) ) 2 | () 3 | (GOAL{0-3} (GOAL{0-2} (GOAL{0-1} ) (D{1-2} baz)) ) 4 | (GOAL{0-4} (GOAL{0-3} (GOAL{0-1} ) (S{1-3} I AM)) ) 5 | (GOAL{0-4} (NP\DT{1-3} right (NN{2-3} xslot)) ) 6 | -------------------------------------------------------------------------------- /src/test/resources/grammar.glue: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | [GOAL] ||| [X,1] ||| [X,1] ||| 0 5 | -------------------------------------------------------------------------------- 
/src/test/resources/grammar/sparse-features/grammar: -------------------------------------------------------------------------------- 1 | [X] ||| el chico ||| the boy ||| -1 sparse_test_feature=1 svd=1 the_boy=1 2 | -------------------------------------------------------------------------------- /src/test/resources/grammar/sparse-features/grammar.glue: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | -------------------------------------------------------------------------------- /src/test/resources/grammar/sparse-features/grammar.packed/encoding: -------------------------------------------------------------------------------- 1 | bytebytebooleanbooleanboolean0sparse_test_featuresvdthe_boy -------------------------------------------------------------------------------- /src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.features: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.features -------------------------------------------------------------------------------- /src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.source: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.source -------------------------------------------------------------------------------- /src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.target: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.target -------------------------------------------------------------------------------- /src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.target.lookup: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /src/test/resources/grammar/sparse-features/grammar.packed/vocabulary: -------------------------------------------------------------------------------- 1 | [X]elsvdboysparse_test_featurethechico0 the_boy -------------------------------------------------------------------------------- /src/test/resources/grammar/sparse-features/joshua-packed.config: -------------------------------------------------------------------------------- 1 | tm = thrax pt 20 src/test/resources/grammar/sparse-features/grammar.packed 2 | tm = thrax glue -1 src/test/resources/grammar/sparse-features/grammar.glue 3 | default-non-terminal = X 4 | goal-symbol = GOAL 5 | mark-oovs = false 6 | pop-limit = 100 7 | top-n = 1 8 | use-unique-nbest = true 9 | output-format = %i ||| %s ||| %f ||| %c 10 | include-align-index = false 11 | lm_0 1.0 12 | tm_pt_0 1.0 13 | -------------------------------------------------------------------------------- /src/test/resources/grammar/sparse-features/joshua.config: -------------------------------------------------------------------------------- 1 | tm = thrax pt 20 src/test/resources/grammar/sparse-features/grammar 2 | tm = thrax glue -1 src/test/resources/grammar/sparse-features/grammar.glue 3 | default-non-terminal = X 4 | goal-symbol = GOAL 5 | mark-oovs = false 6 | pop-limit = 100 7 | top-n = 1 8 | use-unique-nbest = true 9 | output-format = %i ||| %s ||| %f ||| %c 10 | include-align-index = false 11 | lm_0 1.0 12 | tm_pt_0 1.0 13 | 
-------------------------------------------------------------------------------- /src/test/resources/joshua/README.broken: -------------------------------------------------------------------------------- 1 | The tests in and beneath this directory do not currently work (2012-06-18). 2 | -------------------------------------------------------------------------------- /src/test/resources/kbest_extraction/glue-grammar: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | -------------------------------------------------------------------------------- /src/test/resources/kbest_extraction/grammar: -------------------------------------------------------------------------------- 1 | [X] ||| a ||| A ||| 2 2 | [X] ||| a ||| B ||| 3 3 | [X] ||| a ||| C ||| 5 4 | [X] ||| a ||| D ||| 7 5 | [X] ||| a ||| E ||| 11 6 | [X] ||| b ||| A ||| 13 7 | [X] ||| b ||| B ||| 17 8 | [X] ||| b ||| C ||| 19 9 | [X] ||| b ||| D ||| 23 10 | [X] ||| b ||| E ||| 29 11 | [X] ||| c ||| A ||| 31 12 | [X] ||| c ||| B ||| 37 13 | [X] ||| c ||| C ||| 41 14 | [X] ||| c ||| D ||| 43 15 | [X] ||| c ||| E ||| 47 16 | [X] ||| d ||| A ||| 53 17 | [X] ||| d ||| B ||| 59 18 | [X] ||| d ||| C ||| 61 19 | [X] ||| d ||| D ||| 67 20 | [X] ||| d ||| E ||| 71 21 | [X] ||| e ||| A ||| 73 22 | [X] ||| e ||| B ||| 79 23 | [X] ||| e ||| C ||| 83 24 | [X] ||| e ||| D ||| 89 25 | [X] ||| e ||| E ||| 97 26 | -------------------------------------------------------------------------------- /src/test/resources/kbest_extraction/joshua.config: -------------------------------------------------------------------------------- 1 | feature-function = StateMinimizingLanguageModel -lm_type kenlm -lm_order 5 -lm_file src/test/resources/kbest_extraction/lm.gz 2 | 3 | tm = thrax -owner pt -maxspan 12 -path src/test/resources/kbest_extraction/grammar 4 | tm = thrax -owner glue -maxspan 
-1 -path src/test/resources/kbest_extraction/glue-grammar 5 | 6 | mark_oovs=false 7 | 8 | #tm config 9 | default_non_terminal=X 10 | goalSymbol=GOAL 11 | 12 | #pruning config 13 | pop-limit=100 14 | 15 | #nbest config 16 | use_unique_nbest=true 17 | top-n = 3126 18 | 19 | #feature_function = WordPenalty 20 | feature_function = OOVPenalty 21 | 22 | # Model Weights #### 23 | 24 | lm_0 1 25 | tm_pt_0 1 26 | tm_glue_0 1 27 | OOVPenalty 10000 28 | -------------------------------------------------------------------------------- /src/test/resources/kbest_extraction/lm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/kbest_extraction/lm.gz -------------------------------------------------------------------------------- /src/test/resources/kenlm/oilers.kenlm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/kenlm/oilers.kenlm -------------------------------------------------------------------------------- /src/test/resources/lattice-short/README: -------------------------------------------------------------------------------- 1 | This test ensures that the distance between nodes is computed correctly. In lattices, 2 | the span (j - i) no longer contains the distance between two words, since words connected 3 | by two hops in the lattice could have an arbitrary distance between their node IDs. 
4 | -------------------------------------------------------------------------------- /src/test/resources/lattice-short/glue-grammar: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | -------------------------------------------------------------------------------- /src/test/resources/lattice-short/grammar.test: -------------------------------------------------------------------------------- 1 | [X] ||| a ||| A ||| 1 2 | [X] ||| b ||| B ||| 2 3 | [X] ||| x ||| X ||| 3 4 | -------------------------------------------------------------------------------- /src/test/resources/lattice-short/input: -------------------------------------------------------------------------------- 1 | a 2 | a x 3 | b 4 | b x 5 | ((('a',1,1),('b',1,2),),(('x',1,2),),(('x',1,1),),) 6 | -------------------------------------------------------------------------------- /src/test/resources/lattice-short/joshua.config: -------------------------------------------------------------------------------- 1 | tm = thrax pt 1 grammar.test 2 | tm = thrax glue -1 glue-grammar 3 | 4 | #lm config 5 | lm = berkeleylm 3 false false 100 test.lm 6 | 7 | #tm config 8 | default_non_terminal=X 9 | goalSymbol=GOAL 10 | 11 | #pruning config 12 | pop-limit = 100 13 | 14 | #nbest config 15 | use_unique_nbest = true 16 | include-align-index = false 17 | top-n = 6 18 | 19 | lattice-decoding = true 20 | 21 | mark-oovs = false 22 | 23 | feature-function = OOVPenalty 24 | feature-function = WordPenalty 25 | feature-function = SourcePath 26 | 27 | ###### model weights 28 | #lm order weight 29 | lm_0 1.0 30 | 31 | #phrasemodel owner column(0-indexed) weight 32 | tm_pt_0 1.0 33 | tm_glue_0 0.0 34 | 35 | #wordpenalty weight 36 | WordPenalty -1.0 37 | SourcePath 1.0 38 | 39 | OOVPenalty 1.0 40 | -------------------------------------------------------------------------------- 
/src/test/resources/lattice-short/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | cat input | $JOSHUA/bin/joshua-decoder -m 500m -config joshua.config 2> log > output 19 | 20 | if [[ $? -ne 0 ]]; then 21 | exit 1 22 | fi 23 | 24 | diff -u output output.expected > diff 25 | 26 | if [[ $? -eq 0 ]]; then 27 | rm -f output log diff 28 | exit 0 29 | else 30 | exit 1 31 | fi 32 | -------------------------------------------------------------------------------- /src/test/resources/lattice/.gitignore: -------------------------------------------------------------------------------- 1 | log 2 | test.nbest 3 | diff 4 | -------------------------------------------------------------------------------- /src/test/resources/lattice/README: -------------------------------------------------------------------------------- 1 | Tests decoding of lattices (sentence 1) and ensures that lattices are not trimmed 2 | for length but that sentences are (sentence 2). 3 | 4 | Sentence 3 tests scientific notation on arc labels. 
5 | -------------------------------------------------------------------------------- /src/test/resources/lattice/glue-grammar: -------------------------------------------------------------------------------- 1 | [GOAL] ||| ||| ||| 0 2 | [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 3 | [GOAL] ||| [GOAL,1] ||| [GOAL,1] ||| 0 4 | -------------------------------------------------------------------------------- /src/test/resources/lattice/joshua.config: -------------------------------------------------------------------------------- 1 | tm = thrax pt 1 src/test/resources/lattice/grammar.test 2 | tm = thrax glue -1 src/test/resources/lattice/glue-grammar 3 | 4 | #lm config 5 | feature-function = LanguageModel -lm_type berkeleylm -lm_order 3 -lm_file src/test/resources/lattice/test.lm 6 | 7 | #tm config 8 | default_non_terminal=X 9 | goalSymbol=GOAL 10 | 11 | #pruning config 12 | pop-limit = 100 13 | 14 | #nbest config 15 | use_unique_nbest = true 16 | include-align-index = false 17 | top_n = 300 18 | 19 | mark-oovs = true 20 | 21 | # this shouldn't apply to the lattice 22 | maxlen = 1 23 | 24 | lattice-decoding = true 25 | 26 | output-format = %i ||| %e ||| %s ||| %f ||| %c 27 | 28 | feature-function = OOVPenalty 29 | feature-function = WordPenalty 30 | feature-function = SourcePath 31 | 32 | ###### model weights 33 | #lm order weight 34 | lm_0 1.0 35 | 36 | #phrasemodel owner column(0-indexed) weight 37 | tm_pt_0 1.0 38 | tm_pt_1 0.5 39 | tm_pt_2 0.5 40 | 41 | tm_glue_0 0.0 42 | 43 | #wordpenalty weight 44 | WordPenalty -1.0 45 | SourcePath 1.0 46 | 47 | OOVPenalty 1.0 48 | -------------------------------------------------------------------------------- /src/test/resources/lattice/test-lattice.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/lattice/test-lattice.pdf 
-------------------------------------------------------------------------------- /src/test/resources/lattice/test.plf: -------------------------------------------------------------------------------- 1 | ((('ein',-0.2,1),('dieses',-0.3,1),('haus',-0.5,2),),(('haus',-0.7,1),),) 2 | ein haus 3 | ((('ein',1.56462193e-07,1),('dieses',-0.3,1),('haus',-0.5,2),),(('haus',-0.7,1),),) 4 | ((('sí', 0, 1),),) 5 | -------------------------------------------------------------------------------- /src/test/resources/lattice/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # The number of MB to give to Java's heap 19 | # For this example 500 is minimum 20 | # For 32-bit Java 2048 (or so) is the maximum 21 | 22 | rm -f test.nbest test.1best 23 | 24 | cat test.plf | $JOSHUA/bin/joshua-decoder -m 500m -c joshua.config > output 2> log 25 | 26 | if [[ $? -ne 0 ]]; then 27 | exit 1 28 | fi 29 | 30 | diff -u output output.expected > diff 31 | 32 | if [[ $? 
-eq 0 ]]; then 33 | rm -f output log diff 34 | exit 0 35 | else 36 | exit 1 37 | fi 38 | -------------------------------------------------------------------------------- /src/test/resources/lm/berkeley/lm: -------------------------------------------------------------------------------- 1 | 2 | \data\ 3 | ngram 1=5 4 | ngram 2=3 5 | 6 | \1-grams: 7 | -99.000000 8 | -99.000000 -1.752754 9 | -2.034158 the -0.800943 10 | -5.318589 chat-rooms -0.151088 11 | -1.495702 12 | 13 | \2-grams: 14 | -1.773970 the 15 | -4.878868 the chat-rooms 16 | -0.499794 chat-rooms 17 | -------------------------------------------------------------------------------- /src/test/resources/lm/berkeley/lm.berkeleylm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/lm/berkeley/lm.berkeleylm -------------------------------------------------------------------------------- /src/test/resources/lm/berkeley/lm.berkeleylm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/lm/berkeley/lm.berkeleylm.gz -------------------------------------------------------------------------------- /src/test/resources/lm/berkeley/lm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/lm/berkeley/lm.gz -------------------------------------------------------------------------------- /src/test/resources/lm/class_lm/class_lm_9gram.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/lm/class_lm/class_lm_9gram.gz 
-------------------------------------------------------------------------------- /src/test/resources/lm_oov/joshua.config: -------------------------------------------------------------------------------- 1 | feature-function = LanguageModel -lm_type berkeleylm -lm_order 5 -lm_file src/test/resources/berkeley_lm/lm -oov_feature 2 | 3 | tm = thrax -owner pt -maxspan 12 -path src/test/resources/kbest_extraction/grammar 4 | tm = thrax -owner glue -maxspan -1 -path src/test/resources/kbest_extraction/glue-grammar 5 | 6 | top-n = 0 7 | 8 | #feature_function = WordPenalty 9 | feature_function = OOVPenalty 10 | 11 | # Model Weights #### 12 | 13 | lm_0 0 14 | lm_0_oov 1 15 | OOVPenalty 1 16 | tm_pt_0 0 17 | tm_glue 0 18 | -------------------------------------------------------------------------------- /src/test/resources/packed-grammar/.gitignore: -------------------------------------------------------------------------------- 1 | packer.log 2 | reference.en.all 3 | diff 4 | log 5 | output 6 | output.bleu 7 | grammar.glue 8 | grammar.packed 9 | -------------------------------------------------------------------------------- /src/test/resources/packed-grammar/README: -------------------------------------------------------------------------------- 1 | The test in this directory tests packing the grammar and then decoding with it. 2 | It borrows from the test in ../bn-en/hiero/. 
3 | -------------------------------------------------------------------------------- /src/test/resources/packed-grammar/grammar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/packed-grammar/grammar.gz -------------------------------------------------------------------------------- /src/test/resources/packed-grammar/joshua.config: -------------------------------------------------------------------------------- 1 | lm = kenlm 5 false false 100 lm.gz 2 | tm = thrax -owner pt -maxspan 12 -path grammar.packed 3 | tm = thrax -owner glue -maxspan -1 -path grammar.glue 4 | 5 | mark_oovs=false 6 | 7 | #tm config 8 | default_non_terminal=X 9 | goalSymbol=GOAL 10 | 11 | #pruning config 12 | pop-limit = 10 13 | 14 | #nbest config 15 | use_unique_nbest=true 16 | top-n = 1 17 | 18 | output-format = %c ||| %s 19 | feature-function = OOVPenalty 20 | feature-function = WordPenalty 21 | 22 | ###### model weights 23 | #lm order weight 24 | lm_0 1.2373676802179452 25 | 26 | #phrasemodel owner column(0-indexed) weight 27 | tm_pt_0 -2.4497429277910214 28 | tm_pt_1 0.7224581556224123 29 | tm_pt_2 -0.31689069155153504 30 | tm_pt_3 0.33861043967238036 31 | tm_pt_4 0.03553113401320236 32 | tm_pt_5 0.19138972284064748 33 | tm_pt_6 0.3417994095521415 34 | tm_pt_7 -0.9936312455671283 35 | tm_pt_8 0.9070737587091975 36 | tm_pt_9 0.8202511858619419 37 | tm_pt_10 0.2593091306160006 38 | tm_pt_11 0.25597137004462134 39 | tm_pt_12 0.3538894647790496 40 | tm_pt_13 -0.36212061186692646 41 | tm_pt_14 -0.32923261148678096 42 | tm_pt_15 0.5524863522177359 43 | tm_pt_16 0.23451595442127693 44 | tm_glue_0 1 45 | WordPenalty -3.6942747832593694 46 | OOVPenalty 1.0 47 | -------------------------------------------------------------------------------- /src/test/resources/packed-grammar/lm.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/packed-grammar/lm.gz -------------------------------------------------------------------------------- /src/test/resources/packed-grammar/test-multiple.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | set -u 19 | 20 | # pack the grammar 21 | rm -rf foo.packed bar.packed 22 | $JOSHUA/scripts/support/grammar-packer.pl -v -g 'grammar.gz grammar.gz' -o 'foo.packed bar.packed' 2> packer-multiple.log 23 | 24 | diff -q foo.packed/vocabulary bar.packed/vocabulary > diff 25 | 26 | if [ $? 
-eq 0 ]; then 27 | rm -rf foo.packed bar.packed packer-multiple.log 28 | exit 0 29 | else 30 | exit 1 31 | fi 32 | -------------------------------------------------------------------------------- /src/test/resources/packed-grammar/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | set -u 19 | 20 | # pack the grammar 21 | rm -rf grammar.packed 22 | $JOSHUA/scripts/support/grammar-packer.pl -v -g grammar.gz -o grammar.packed 2> packer.log 23 | 24 | # generate the glue grammar 25 | $JOSHUA/scripts/support/create_glue_grammar.sh grammar.packed > grammar.glue 2> glue.log 26 | 27 | # decode 28 | cat input.bn | $JOSHUA/bin/joshua-decoder -m 1g -threads 2 -c joshua.config > output 2> log 29 | 30 | diff -u output output.gold > diff 31 | 32 | if [ $? 
-eq 0 ]; then 33 | #rm -f packer.log diff log output.bleu output grammar.glue glue.log 34 | rm -rf grammar.packed 35 | exit 0 36 | else 37 | exit 1 38 | fi 39 | -------------------------------------------------------------------------------- /src/test/resources/parser/grammar: -------------------------------------------------------------------------------- 1 | [DT] ||| the ||| the ||| 0 2 | [NN] ||| feline ||| cat ||| 0 3 | [NN] ||| canine ||| dog ||| 0 4 | [NN] ||| mat ||| rug ||| 0 5 | [VBZ] ||| is ||| is ||| 0 6 | [PREP] ||| on ||| upon ||| 0 7 | [NP] ||| [DT,1] [NN,2] ||| [DT,1] [NN,2] ||| 0 8 | [PP] ||| [PREP,1] [NP,2] ||| [PREP,1] [NP,2] ||| 0 9 | [VP] ||| [VBZ,1] [PP,2] ||| [VBZ,1] [PP,2] ||| 0 10 | [S] ||| [NP,1] [VP,2] ||| [NP,1] [VP,2] ||| 0 11 | [S] ||| a ||| z ||| 0 12 | -------------------------------------------------------------------------------- /src/test/resources/parser/grammar.glue: -------------------------------------------------------------------------------- 1 | [GOAL] ||| [S,1] ||| [S,1] ||| 0 2 | -------------------------------------------------------------------------------- /src/test/resources/parser/input: -------------------------------------------------------------------------------- 1 | the feline is on the mat ||| the cat is upon the rug 2 | the canine is on the mat ||| the dog is upon the rug 3 | canine the on is ||| rug mat the the the 4 | a ||| z 5 | -------------------------------------------------------------------------------- /src/test/resources/parser/output.gold: -------------------------------------------------------------------------------- 1 | 0 ||| (ROOT ([0-GOAL-8] ([1-S-7] ([1-NP-3] ([1-DT-2] the) ([2-NN-3] cat)) ([3-VP-7] ([3-VBZ-4] is) ([4-PP-7] ([4-PREP-5] upon) ([5-NP-7] ([5-DT-6] the) ([6-NN-7] rug))))) )) 2 | 1 ||| (ROOT ([0-GOAL-8] ([1-S-7] ([1-NP-3] ([1-DT-2] the) ([2-NN-3] dog)) ([3-VP-7] ([3-VBZ-4] is) ([4-PP-7] ([4-PREP-5] upon) ([5-NP-7] ([5-DT-6] the) ([6-NN-7] rug))))) )) 3 | 2 ||| 4 | 3 ||| (ROOT 
([0-GOAL-3] ([1-S-2] z) )) 5 | -------------------------------------------------------------------------------- /src/test/resources/parser/parse.config: -------------------------------------------------------------------------------- 1 | parse = true 2 | mark_oovs = false 3 | pop-limit = 0 4 | 5 | output-format = %i ||| %t 6 | 7 | tm = thrax pt 100 grammar 8 | tm = thrax glue 100 grammar.glue 9 | 10 | #tm config 11 | default_non_terminal = X 12 | goalSymbol = GOAL 13 | 14 | #nbest config 15 | use_unique_nbest = true 16 | top_n = 1 17 | 18 | weights-file = weights 19 | -------------------------------------------------------------------------------- /src/test/resources/parser/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | set -u 19 | 20 | cat input | $JOSHUA/bin/joshua-decoder -c parse.config > output 2> log 21 | 22 | diff -u output output.gold > diff 23 | 24 | if [ $? 
-eq 0 ]; then 25 | rm -rf output diff log 26 | exit 0 27 | else 28 | exit 1 29 | fi 30 | -------------------------------------------------------------------------------- /src/test/resources/parser/weights: -------------------------------------------------------------------------------- 1 | tm_pt_0 1.0 2 | tm_glue_0 1.0 3 | WordPenalty -2.844814 4 | 5 | -------------------------------------------------------------------------------- /src/test/resources/phrase_decoder/config: -------------------------------------------------------------------------------- 1 | tm = moses -owner pt -maxspan 0 -path src/test/resources/phrase_decoder/rules.1.gz -max-source-len 5 2 | feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file src/test/resources/phrase_decoder/lm.1.gz 3 | 4 | search = stack 5 | 6 | mark-oovs = false 7 | pop-limit = 10 8 | top-n = 1 9 | 10 | output-format = %i ||| %s ||| %f ||| %c 11 | 12 | include-align-index = true 13 | reordering-limit = 6 14 | 15 | # And these are the feature functions to activate. 
16 | feature-function = OOVPenalty 17 | feature-function = WordPenalty 18 | feature-function = Distortion 19 | feature-function = PhrasePenalty -owner pt 20 | 21 | OOVPenalty 1.0 22 | Distortion 0.114849 23 | WordPenalty -0.201544 24 | PhrasePenalty -0.236965 25 | tm_pt_0 0.0370068 26 | tm_pt_1 0.0495759 27 | tm_pt_2 0.196742 28 | tm_pt_3 0.0745423 29 | lm_0 0.204412452147565 30 | -------------------------------------------------------------------------------- /src/test/resources/phrase_decoder/config.packed: -------------------------------------------------------------------------------- 1 | tm = moses -owner pt -maxspan 0 -path rules.packed -max-source-len 5 2 | feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.1.gz 3 | 4 | search = stack 5 | 6 | mark-oovs = false 7 | pop-limit = 10 8 | top-n = 1 9 | 10 | output-format = %i ||| %s ||| %f ||| %c 11 | 12 | include-align-index = false 13 | reordering-limit = 6 14 | 15 | # And these are the feature functions to activate. 16 | feature-function = OOVPenalty 17 | feature-function = WordPenalty 18 | feature-function = Distortion 19 | feature-function = PhrasePenalty -owner pt 20 | 21 | OOVPenalty 1.0 22 | Distortion 0.114849 23 | WordPenalty -0.201544 24 | PhrasePenalty -0.236965 25 | tm_pt_0 0.0370068 26 | tm_pt_1 0.0495759 27 | tm_pt_2 0.196742 28 | tm_pt_3 0.0745423 29 | lm_0 0.204412452147565 30 | -------------------------------------------------------------------------------- /src/test/resources/phrase_decoder/constrained.config: -------------------------------------------------------------------------------- 1 | tm = moses pt 0 src/test/resources/phrase_decoder/rules.1.gz 2 | 3 | lm = kenlm 5 true false 100 src/test/resources/phrase_decoder/lm.1.gz 4 | 5 | mark-oovs = false 6 | pop-limit = 10 7 | top-n = 5 8 | 9 | output-format = %i ||| %s ||| %f ||| %c 10 | 11 | include-align-index = true 12 | reordering-limit = 10 13 | 14 | # And these are the feature functions to activate. 
15 | feature-function = OOVPenalty 16 | feature-function = WordPenalty 17 | feature-function = Distortion 18 | feature-function = PhrasePenalty -owner pt 19 | 20 | OOVPenalty 1.0 21 | Distortion 0.114849 22 | WordPenalty -0.201544 23 | PhrasePenalty -0.236965 24 | tm_pt_0 0.0370068 25 | tm_pt_1 0.0495759 26 | tm_pt_2 0.196742 27 | tm_pt_3 0.0745423 28 | lm_0 0.204412452147565 29 | -------------------------------------------------------------------------------- /src/test/resources/phrase_decoder/constrained.output.gold: -------------------------------------------------------------------------------- 1 | 0 ||| President Obama |8-8| to |7-7| hinder |4-4| a strategy |0-1| for |3-3| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-15.792 tm_pt_1=-17.550 tm_pt_2=-14.599 tm_pt_3=-18.298 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-24.000 PhrasePenalty=7.000 ||| -15.163 2 | 0 ||| President Obama |8-8| to |7-7| hinder |4-4| a |0-0| strategy |1-1| for |3-3| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-16.919 tm_pt_1=-17.550 tm_pt_2=-14.917 tm_pt_3=-18.298 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-24.000 PhrasePenalty=8.000 ||| -15.505 3 | 0 ||| President Obama |8-8| to hinder |3-4| a strategy |0-1| for |7-7| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-14.986 tm_pt_1=-17.951 tm_pt_2=-14.075 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-32.000 PhrasePenalty=6.000 ||| -15.762 4 | 0 ||| President Obama |8-8| to hinder |3-4| a |0-0| strategy |1-1| for |7-7| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-16.112 tm_pt_1=-17.951 tm_pt_2=-14.393 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-32.000 PhrasePenalty=7.000 ||| -16.103 5 | 0 ||| President Obama |8-8| to |3-3| hinder |4-4| a strategy |0-1| for |7-7| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-16.329 tm_pt_1=-17.951 tm_pt_2=-15.136 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 
Distortion=-32.000 PhrasePenalty=7.000 ||| -16.257 6 | -------------------------------------------------------------------------------- /src/test/resources/phrase_decoder/lm.1.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/phrase_decoder/lm.1.gz -------------------------------------------------------------------------------- /src/test/resources/phrase_decoder/output.gold: -------------------------------------------------------------------------------- 1 | 0 ||| a strategy |0-1| republican |2-2| to hinder |3-4| reelection |5-6| Obama |7-8| ||| tm_pt_0=-9.702 tm_pt_1=-10.800 tm_pt_2=-7.543 tm_pt_3=-8.555 lm_0=-19.117 OOVPenalty=0.000 WordPenalty=-3.040 Distortion=0.000 PhrasePenalty=5.000 ||| -7.496 2 | -------------------------------------------------------------------------------- /src/test/resources/phrase_decoder/rules.1.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/phrase_decoder/rules.1.gz -------------------------------------------------------------------------------- /src/test/resources/phrase_decoder/rules.packed/config: -------------------------------------------------------------------------------- 1 | version = 4 2 | max-source-len = 3 3 | -------------------------------------------------------------------------------- /src/test/resources/phrase_decoder/rules.packed/encoding: -------------------------------------------------------------------------------- 1 | bytefloatfloatfloatfloat0123 -------------------------------------------------------------------------------- /src/test/resources/phrase_decoder/rules.packed/slice_00000.features: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/phrase_decoder/rules.packed/slice_00000.features -------------------------------------------------------------------------------- /src/test/resources/phrase_decoder/rules.packed/slice_00000.source: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/phrase_decoder/rules.packed/slice_00000.source -------------------------------------------------------------------------------- /src/test/resources/phrase_decoder/rules.packed/slice_00000.target: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/phrase_decoder/rules.packed/slice_00000.target -------------------------------------------------------------------------------- /src/test/resources/phrase_decoder/rules.packed/slice_00000.target.lookup: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/phrase_decoder/rules.packed/slice_00000.target.lookup -------------------------------------------------------------------------------- /src/test/resources/phrase_decoder/rules.packed/vocabulary: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/phrase_decoder/rules.packed/vocabulary -------------------------------------------------------------------------------- /src/test/resources/pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | 1 2 | pipeline.log 3 | 
-------------------------------------------------------------------------------- /src/test/resources/pipeline/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test 2 | 3 | all: test 4 | 5 | test: 6 | ./test.sh 7 | 8 | clean: 9 | @rm -rf 1 10 | 11 | -------------------------------------------------------------------------------- /src/test/resources/pipeline/final-bleu.gold: -------------------------------------------------------------------------------- 1 | 0.0176 / 1 = 0.0176 2 | -------------------------------------------------------------------------------- /src/test/resources/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /src/test/resources/scripts/normalization/.gitignore: -------------------------------------------------------------------------------- 1 | diff 2 | output 3 | -------------------------------------------------------------------------------- /src/test/resources/scripts/normalization/data/train.en: -------------------------------------------------------------------------------- 1 | hello : ; 2 | 3 | ‘word ‘no’ do?’ 4 | 5 | ‘word ‘no’ do?’ 6 | 7 | ´word ´no´ do?´ 8 | 9 | ain’t 10 | 11 | ain't 12 | 13 | uh… 14 | 15 | something after nbspaces 16 | 17 | secretary﷓general 18 | 19 | duración 20 | 21 | españa 22 | -------------------------------------------------------------------------------- /src/test/resources/scripts/normalization/data/train.en.norm: -------------------------------------------------------------------------------- 1 | hello:; 2 | 3 | "word "no" do?" 4 | 5 | "word "no" do?" 6 | 7 | 'word 'no' do?' 8 | 9 | ain't 10 | 11 | ain't 12 | 13 | uh... 
14 | 15 | something after nbspaces 16 | 17 | secretary-general 18 | 19 | duración 20 | 21 | españa 22 | -------------------------------------------------------------------------------- /src/test/resources/scripts/normalization/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | set -u 20 | 21 | cat data/train.en | $JOSHUA/scripts/preparation/normalize.pl en > output 22 | diff -U 1 output data/train.en.norm > diff 23 | 24 | if [[ $? -eq 0 ]]; then 25 | rm -f output diff 26 | exit 0 27 | else 28 | exit 1 29 | fi 30 | -------------------------------------------------------------------------------- /src/test/resources/scripts/support/moses_grammar/input: -------------------------------------------------------------------------------- 1 | ! ! ! " 题目 [X] ||| questions were [X] ||| 0.000629742 8.23907e-14 0.0206241 0.000385206 ||| 4-0 4-1 ||| 32.75 1 1 ||| ||| 2 | ! ! ! [X] ||| ! ! ! [X] ||| 0.0206241 0.265467 0.0206241 0.0277773 ||| 0-0 0-1 1-1 1-2 2-2 ||| 1 1 1 ||| ||| 3 | ! ! ! 
[X][X] , [X] ||| [X][X] , [X] ||| 2.59611e-09 5.90069e-10 0.0206241 0.488999 ||| 3-0 4-1 ||| 662017 0.0833333 0.0833333 ||| ||| 4 | ! ! ! [X][X] 公布 [X] ||| [X][X] published [X] ||| 2.61822e-06 7.87304e-11 0.0206241 0.0407651 ||| 3-0 4-1 ||| 875.235 0.111111 0.111111 ||| ||| 5 | ! ! ! [X][X] 立刻 [X] ||| [X][X] they immediately [X] ||| 7.08771e-05 4.22904e-11 0.0206241 0.0106107 ||| 3-0 4-1 4-2 ||| 13.2265 0.0454545 0.0454545 ||| ||| 6 | ! ! ! 了 , [X] ||| , has weakened [X] ||| 0.000927618 5.79717e-11 0.0108156 3.40604e-07 ||| 3-2 4-0 ||| 3.17619 0.272412 0.142857 ||| ||| 7 | ! ! ! 了 , [X] ||| , has weakened unity [X] ||| 0.00114296 5.79717e-11 0.00582378 7.32299e-12 ||| 3-2 4-0 ||| 1.38803 0.272412 0.0769231 ||| ||| 8 | ! ! ! 了 , [X] ||| , has weakened unity among [X] ||| 0.000853088 5.79717e-11 0.00398469 4.12504e-15 ||| 3-2 4-0 ||| 1.27241 0.272412 0.0526316 ||| ||| 9 | ! ! ! 了 [X] ||| has weakened [X] ||| 0.000759167 8.63793e-11 0.00343734 6.96533e-07 ||| 3-1 ||| 27.1667 6 1 ||| ||| 10 | ! ! ! 了 [X] ||| has weakened unity [X] ||| 0.00412481 8.63793e-11 0.00343734 1.49755e-11 ||| 3-1 ||| 5 6 1 ||| ||| 11 | -------------------------------------------------------------------------------- /src/test/resources/scripts/support/moses_grammar/output.expected: -------------------------------------------------------------------------------- 1 | [X] ||| ! ! ! " 题目 ||| questions were ||| 7.37020 30.12730 3.88129 7.86173 ||| 4-0 4-1 2 | [X] ||| ! ! ! ||| ! ! ! ||| 3.88129 1.32626 3.88129 3.58354 ||| 0-0 0-1 1-1 1-2 2-2 3 | [X] ||| ! ! ! [X,1] , ||| [X,1] , ||| 19.76925 21.25078 3.88129 0.71539 ||| 3-0 4-1 4 | [X] ||| ! ! ! [X,1] 公布 ||| [X,1] published ||| 12.85302 23.26499 3.88129 3.19993 ||| 3-0 4-1 5 | [X] ||| ! ! ! [X,1] 立刻 ||| [X,1] they immediately ||| 9.55456 23.88646 3.88129 4.54589 ||| 3-0 4-1 4-2 6 | [X] ||| ! ! ! 了 , ||| , has weakened ||| 6.98289 23.57107 4.52677 14.89255 ||| 3-2 4-0 7 | [X] ||| ! ! ! 
了 , ||| , has weakened unity ||| 6.77413 23.57107 5.14581 25.64000 ||| 3-2 4-0 8 | [X] ||| ! ! ! 了 , ||| , has weakened unity among ||| 7.06665 23.57107 5.52530 33.12170 ||| 3-2 4-0 9 | [X] ||| ! ! ! 了 ||| has weakened ||| 7.18329 23.17227 5.67306 14.17715 ||| 3-1 10 | [X] ||| ! ! ! 了 ||| has weakened unity ||| 5.49074 23.17227 5.67306 24.92461 ||| 3-1 11 | -------------------------------------------------------------------------------- /src/test/resources/scripts/support/moses_grammar/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | cat input | $JOSHUA/scripts/support/moses2joshua_grammar.pl > output 20 | 21 | diff -u output output.expected > diff 22 | 23 | if [ $? 
-eq 0 ]; then 24 | rm -f diff output 25 | exit 0 26 | else 27 | exit 1 28 | fi 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/test/resources/server/http/expected: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "translations": [ 4 | { 5 | "translatedText": "I_OOV love_OOV it_OOV when_OOV I_OOV get_OOV the_OOV house_OOV clean_OOV before_OOV the_OOV weekend_OOV", 6 | "raw_nbest": [ 7 | { 8 | "hyp": "I_OOV love_OOV it_OOV when_OOV I_OOV get_OOV the_OOV house_OOV clean_OOV before_OOV the_OOV weekend_OOV", 9 | "totalScore": 0.0 10 | } 11 | ] 12 | } 13 | ] 14 | }, 15 | "metadata": [] 16 | } 17 | -------------------------------------------------------------------------------- /src/test/resources/server/http/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | # This test case starts an HTTP server and sends it a single translation request, checking the JSON response against the expected output.
20 | 21 | $JOSHUA/bin/joshua -threads 4 -server-port 9010 -server-type http -mark-oovs true > server.log 2>&1 & 22 | serverpid=$! 23 | sleep 1 24 | 25 | curl -s http://localhost:9010/?q=I%20love%20it%20when%20I%20get%20the%20house%20clean%20before%20the%20weekend > output 26 | 27 | kill -15 $serverpid 2> /dev/null 28 | 29 | diff -u output expected > diff 30 | 31 | if [[ $? -eq 0 ]]; then 32 | rm -f server.log output log diff 33 | exit 0 34 | else 35 | exit 1 36 | fi 37 | -------------------------------------------------------------------------------- /src/test/resources/server/tcp-text/expected: -------------------------------------------------------------------------------- 1 | 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 2 | 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 3 | 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 4 | 3 ||| 3 ||| 3 ||| 3 ||| 3 ||| 3 ||| 3 ||| 3 ||| 3 ||| 3 ||| 5 | 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 6 | 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 7 | 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 8 | 7 ||| yours_OOV 7 ||| yours_OOV 7 ||| yours_OOV 7 ||| yours_OOV 7 ||| yours_OOV 7 ||| yours_OOV 7 ||| yours_OOV 7 ||| yours_OOV 7 ||| yours_OOV 7 ||| yours_OOV 9 | 8 ||| hers_OOV 8 ||| hers_OOV 8 ||| hers_OOV 8 ||| hers_OOV 8 ||| hers_OOV 8 ||| hers_OOV 8 ||| hers_OOV 8 ||| hers_OOV 8 ||| hers_OOV 8 ||| hers_OOV 
10 | -------------------------------------------------------------------------------- /src/test/resources/testng.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/test/resources/thrax/.gitignore: -------------------------------------------------------------------------------- 1 | thrax.log 2 | thrax 3 | .grammar.crc 4 | hadoop-0.20.2 5 | grammar 6 | -------------------------------------------------------------------------------- /src/test/resources/thrax/extraction/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | # Tests that Hadoop works, in standalone mode. 20 | 21 | set -u 22 | # HADOOP may be unset; ${HADOOP:-} keeps `set -u` from aborting before the skip below. 23 | if [[ -z ${HADOOP:-} ]]; then 24 | exit 0 25 | fi 26 | 27 | $JOSHUA/scripts/training/run_thrax.py -f input/thrax.conf input/train.{ps,en,a} 2> thrax.log 28 | 29 | size=$(perl -e "print +(stat('grammar.gz'))[7] . 
$/") 30 | 31 | if [[ $size -eq 106851 ]]; then 32 | rm -rf thrax.log grammar.gz 33 | exit 0 34 | else 35 | exit 1 36 | fi 37 | -------------------------------------------------------------------------------- /src/test/resources/thrax/filtering/dev.hi-en.hi.1: -------------------------------------------------------------------------------- 1 | ऐसा माना जाता है कि आज की आधुनिक दिल्ली बनने से पहले दिल्ली सात बार उजड़ी और विभिन्न स्थानों पर बसी जिनके कुछ अवशेष अब भी देखे जा सकते हैं। 2 | -------------------------------------------------------------------------------- /src/test/resources/thrax/filtering/exact.log.gold: -------------------------------------------------------------------------------- 1 | ........10........20........30........40........50........60........70........80........90.....100% 2 | Added 1 sentences. 3 | 4 | Filtering rules with the exact filter... 5 | [INFO] Total rules read: 8336 6 | [INFO] Rules kept: 993 7 | [INFO] Rules dropped: 7343 8 | [INFO] cached queries: 8022 9 | ........10........20........30........40........50........60........70........80........90.....100% 10 | Added 3 sentences. 11 | 12 | Filtering rules with the exact filter... 13 | ........10........20........30........40........50........60........70........80........90.....100% 14 | [INFO] Total rules read: 4 15 | [INFO] Rules kept: 1 16 | [INFO] Rules dropped: 3 17 | [INFO] cached queries: 0 18 | -------------------------------------------------------------------------------- /src/test/resources/thrax/filtering/fast.log.gold: -------------------------------------------------------------------------------- 1 | ........10........20........30........40........50........60........70........80........90.....100% 2 | Added 1 sentences. 3 | 4 | Filtering rules with the fast filter... 
5 | ........10........20........30........40........50........60........70........80........90.....100% 6 | [INFO] Total rules read: 8336 7 | [INFO] Rules kept: 1087 8 | [INFO] Rules dropped: 7249 9 | [INFO] cached queries: 8022 10 | ........10........20........30........40........50........60........70........80........90.....100% 11 | Added 3 sentences. 12 | 13 | Filtering rules with the fast filter... 14 | [INFO] Total rules read: 4 15 | [INFO] Rules kept: 2 16 | [INFO] Rules dropped: 2 17 | [INFO] cached queries: 0 18 | -------------------------------------------------------------------------------- /src/test/resources/thrax/filtering/grammar.de: -------------------------------------------------------------------------------- 1 | [X] ||| golf ||| golf ||| 1 2 | [X] ||| loch ||| hole ||| 1 3 | [X] ||| [X,1] olflo [X,2] ||| [X,1] garbage [X,2] ||| 1 4 | [X] ||| play auf [X,1] quiero ||| garbage ||| 1 5 | -------------------------------------------------------------------------------- /src/test/resources/thrax/filtering/grammar.filtered.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/thrax/filtering/grammar.filtered.gz -------------------------------------------------------------------------------- /src/test/resources/thrax/filtering/input.de: -------------------------------------------------------------------------------- 1 | golfloch 2 | ich will golf playen 3 | yo quiero play auf dem golfloch 4 | -------------------------------------------------------------------------------- /src/test/resources/thrax/filtering/loose.log.gold: -------------------------------------------------------------------------------- 1 | ........10........20........30........40........50........60........70........80........90.....100% 2 | Added 1 sentences. 3 | 4 | Filtering rules with the loose filter... 
5 | [INFO] Total rules read: 8336 6 | [INFO] Rules kept: 4099 7 | [INFO] Rules dropped: 4237 8 | [INFO] cached queries: 8022 9 | ........10........20........30........40........50........60........70........80........90.....100% 10 | Added 3 sentences. 11 | 12 | Filtering rules with the loose filter... 13 | [INFO] Total rules read: 4 14 | [INFO] Rules kept: 3 15 | [INFO] Rules dropped: 1 16 | [INFO] cached queries: 0 17 | -------------------------------------------------------------------------------- /src/test/resources/thrax/filtering/test-exact.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | # Tests that exact filtering of grammars to test files works.
20 | 21 | set -u 22 | 23 | # exact filtering 24 | gzip -cd grammar.filtered.gz | java -Xmx500m -Dfile.encoding=utf8 -cp $JOSHUA/class joshua.tools.TestSetFilter -v -e dev.hi-en.hi.1 > exact 2> exact.log 25 | java -Xmx500m -Dfile.encoding=utf8 -cp $JOSHUA/class joshua.tools.TestSetFilter -v -e -g grammar.de input.de >> exact 2>> exact.log 26 | 27 | diff -u exact.log exact.log.gold > diff.exact 28 | 29 | if [[ $? -eq 0 ]]; then 30 | rm -rf exact exact.log diff.exact 31 | exit 0 32 | else 33 | exit 1 34 | fi 35 | -------------------------------------------------------------------------------- /src/test/resources/thrax/filtering/test-fast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | # Tests that fast filtering of grammars to test files works.
20 | 21 | set -u 22 | 23 | # fast filtering 24 | java -Xmx500m -Dfile.encoding=utf8 -cp $JOSHUA/class joshua.tools.TestSetFilter -f -v -g grammar.filtered.gz dev.hi-en.hi.1 > fast 2> fast.log 25 | cat grammar.de | java -Xmx500m -Dfile.encoding=utf8 -cp $JOSHUA/class joshua.tools.TestSetFilter -f -v input.de >> fast 2>> fast.log 26 | 27 | diff -u fast.log fast.log.gold > diff.fast 28 | 29 | if [[ $? -eq 0 ]]; then 30 | rm -rf fast fast.log diff.fast 31 | exit 0 32 | else 33 | exit 1 34 | fi 35 | -------------------------------------------------------------------------------- /src/test/resources/thrax/filtering/test-loose.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | # Tests loose filtering. 
20 | 21 | set -u 22 | 23 | # loose filtering 24 | gzip -cd grammar.filtered.gz | java -Xmx500m -Dfile.encoding=utf8 -cp $JOSHUA/class joshua.tools.TestSetFilter -v -l dev.hi-en.hi.1 > loose 2> loose.log 25 | cat grammar.de | java -Xmx500m -Dfile.encoding=utf8 -cp $JOSHUA/class joshua.tools.TestSetFilter -v -l input.de >> loose 2>> loose.log 26 | 27 | diff -u loose.log loose.log.gold > diff.loose 28 | 29 | if [[ $? -eq 0 ]]; then 30 | rm -rf loose loose.log diff.loose 31 | exit 0 32 | else 33 | exit 1 34 | fi 35 | -------------------------------------------------------------------------------- /src/test/resources/wa_grammar: -------------------------------------------------------------------------------- 1 | [X] ||| A [X,1] B1 [X,2] B2 C ||| a b [X,2] c1 [X,1] c2 ||| 1 1 1 1 1 1 OOV=1 ||| 0-0 2-1 4-1 5-3 5-5 2 | [X] ||| U Z1 Z2 ||| n1 u z ||| 1 1 1 1 1 1 OOV=2 ||| 0-1 1-2 2-2 3 | [X] ||| K ||| k1 k2 k3 n1 n2 n3 ||| 1 1 1 1 1 1 OOV=4 ||| 0-0 0-1 0-2 -------------------------------------------------------------------------------- /src/test/resources/wa_grammar.packed/config: -------------------------------------------------------------------------------- 1 | max-source-len = 6 2 | version = 3 3 | -------------------------------------------------------------------------------- /src/test/resources/wa_grammar.packed/encoding: -------------------------------------------------------------------------------- 1 | bytebooleanbooleanbooleanbooleanbooleanbooleanbyte45OOV0123 -------------------------------------------------------------------------------- /src/test/resources/wa_grammar.packed/slice_00000.alignments: -------------------------------------------------------------------------------- 1 | " -------------------------------------------------------------------------------- /src/test/resources/wa_grammar.packed/slice_00000.features: -------------------------------------------------------------------------------- 1 | & 
-------------------------------------------------------------------------------- /src/test/resources/wa_grammar.packed/slice_00000.source: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/wa_grammar.packed/slice_00000.source -------------------------------------------------------------------------------- /src/test/resources/wa_grammar.packed/slice_00000.target: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/joshua/716cc81aca318f3cb6ed9bd81bd29f043db77eff/src/test/resources/wa_grammar.packed/slice_00000.target -------------------------------------------------------------------------------- /src/test/resources/wa_grammar.packed/slice_00000.target.lookup: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /src/test/resources/wa_grammar.packed/vocabulary: -------------------------------------------------------------------------------- 1 | [X]A[X,1]B1[X,2]B2Ca b 2 | c1 c2 0 12345OOVUZ1Z2n1uzKk1k2k3n2n3 --------------------------------------------------------------------------------