├── maligna-ui ├── src │ └── main │ │ ├── java │ │ └── net │ │ │ └── loomchild │ │ │ └── maligna │ │ │ └── ui │ │ │ └── console │ │ │ ├── command │ │ │ ├── Command.java │ │ │ ├── exception │ │ │ │ ├── ParametersParseException.java │ │ │ │ ├── MissingParameterException.java │ │ │ │ ├── UnknownParameterException.java │ │ │ │ ├── ParameterFormatException.java │ │ │ │ ├── WrongArgumentCountException.java │ │ │ │ └── CommandException.java │ │ │ ├── TestCommand.java │ │ │ ├── CommandFactory.java │ │ │ └── MacroCommand.java │ │ │ └── Maligna.java │ │ ├── scripts │ │ ├── maligna │ │ └── maligna.bat │ │ └── assembly │ │ └── bin.xml ├── examples │ ├── script │ │ ├── example3 │ │ ├── example3.bat │ │ ├── example1 │ │ ├── example1.bat │ │ ├── example2 │ │ └── example2.bat │ ├── txt │ │ ├── poznan-small-de.txt │ │ └── poznan-small-pl.txt │ └── align │ │ └── human │ │ └── poznan-oracle.al └── pom.xml ├── maligna └── src │ ├── main │ ├── java │ │ └── net │ │ │ └── loomchild │ │ │ └── maligna │ │ │ ├── matrix │ │ │ ├── MatrixFactory.java │ │ │ ├── FullMatrixFactory.java │ │ │ ├── PositionOutsideBandException.java │ │ │ ├── BandMatrixFactory.java │ │ │ ├── Matrix.java │ │ │ ├── FullMatrixIterator.java │ │ │ ├── FullMatrix.java │ │ │ ├── BandMatrixIterator.java │ │ │ └── MatrixIterator.java │ │ │ ├── calculator │ │ │ ├── length │ │ │ │ ├── counter │ │ │ │ │ ├── CharCounter.java │ │ │ │ │ ├── Counter.java │ │ │ │ │ └── SplitCounter.java │ │ │ │ └── LengthCalculator.java │ │ │ ├── CalculatorMock.java │ │ │ ├── Calculator.java │ │ │ ├── meta │ │ │ │ ├── CompositeCalculator.java │ │ │ │ └── MinimumCalculator.java │ │ │ └── content │ │ │ │ └── OracleCalculator.java │ │ │ ├── filter │ │ │ ├── modifier │ │ │ │ ├── modify │ │ │ │ │ ├── clean │ │ │ │ │ │ ├── LowercaseCleanAlgorithm.java │ │ │ │ │ │ ├── FilterNonWordsCleanAlgorithm.java │ │ │ │ │ │ ├── TrimCleanAlgorithm.java │ │ │ │ │ │ ├── CleanAlgorithm.java │ │ │ │ │ │ └── UnifyRareWordsCleanAlgorithm.java │ │ │ │ │ ├── 
NullModifyAlgorithm.java │ │ │ │ │ ├── ModifyAlgorithm.java │ │ │ │ │ ├── split │ │ │ │ │ │ ├── ParagraphSplitAlgorithm.java │ │ │ │ │ │ ├── SplitAlgorithm.java │ │ │ │ │ │ ├── WordSplitAlgorithm.java │ │ │ │ │ │ ├── FilterNonWordsSplitAlgorithmDecorator.java │ │ │ │ │ │ ├── SplitAlgorithmMock.java │ │ │ │ │ │ ├── SentenceSplitAlgorithm.java │ │ │ │ │ │ └── SrxSplitAlgorithm.java │ │ │ │ │ └── merge │ │ │ │ │ │ ├── MergeAlgorithm.java │ │ │ │ │ │ └── SeparatorMergeAlgorithm.java │ │ │ │ └── Modifier.java │ │ │ ├── macro │ │ │ │ ├── Macro.java │ │ │ │ ├── GaleAndChurchMacro.java │ │ │ │ └── PoissonMacro.java │ │ │ ├── meta │ │ │ │ ├── FilterDecorators.java │ │ │ │ ├── CompositeFilter.java │ │ │ │ └── IgnoreInfiniteProbabilityAlignmentsFilterDecorator.java │ │ │ ├── aligner │ │ │ │ ├── align │ │ │ │ │ ├── hmm │ │ │ │ │ │ ├── Util.java │ │ │ │ │ │ ├── viterbi │ │ │ │ │ │ │ ├── ViterbiAlgorithmFactory.java │ │ │ │ │ │ │ └── ViterbiData.java │ │ │ │ │ │ ├── fb │ │ │ │ │ │ │ └── ForwardBackwardAlgorithmFactory.java │ │ │ │ │ │ └── HmmAlignAlgorithmFactory.java │ │ │ │ │ ├── AlignAlgorithm.java │ │ │ │ │ └── AlignAlgorithmMock.java │ │ │ │ ├── AlignmentImpossibleException.java │ │ │ │ └── Aligner.java │ │ │ ├── Filter.java │ │ │ └── selector │ │ │ │ ├── OneToOneSelector.java │ │ │ │ ├── DifferenceSelector.java │ │ │ │ ├── IntersectionSelector.java │ │ │ │ ├── ProbabilitySelector.java │ │ │ │ └── FractionSelector.java │ │ │ ├── util │ │ │ ├── IORuntimeException.java │ │ │ ├── date │ │ │ │ └── InvalidDateException.java │ │ │ ├── ResourceNotFoundException.java │ │ │ ├── bind │ │ │ │ ├── BindException.java │ │ │ │ ├── QuietValidationEventHandler.java │ │ │ │ ├── AlMarshallerUnmarshaller.java │ │ │ │ ├── TmxMarshallerUnmarshaller.java │ │ │ │ └── MarshallerUnmarshaller.java │ │ │ ├── ImpossibleException.java │ │ │ └── Pair.java │ │ │ ├── parser │ │ │ ├── TmxParseException.java │ │ │ ├── Parser.java │ │ │ ├── AlParser.java │ │ │ └── PlaintextParser.java │ │ │ ├── model │ │ │ 
├── ModelParseException.java │ │ │ ├── length │ │ │ │ ├── LengthModelUtil.java │ │ │ │ ├── LengthModel.java │ │ │ │ └── MutableLengthModel.java │ │ │ ├── Util.java │ │ │ ├── translation │ │ │ │ ├── SourceData.java │ │ │ │ ├── TargetDataProbabilityComparator.java │ │ │ │ ├── EmptySourceData.java │ │ │ │ ├── TargetData.java │ │ │ │ ├── InitialSourceData.java │ │ │ │ ├── TranslationModel.java │ │ │ │ └── InitialTranslationModel.java │ │ │ └── language │ │ │ │ ├── LanguageModel.java │ │ │ │ └── LanguageModelUtil.java │ │ │ ├── formatter │ │ │ ├── Formatter.java │ │ │ ├── AlFormatter.java │ │ │ ├── HtmlFormatter.java │ │ │ └── PlaintextFormatter.java │ │ │ ├── progress │ │ │ ├── ProgressObserver.java │ │ │ └── WriterProgressObserver.java │ │ │ ├── coretypes │ │ │ ├── Category.java │ │ │ └── CategoryDefaults.java │ │ │ └── comparator │ │ │ └── Diff.java │ └── resources │ │ └── net │ │ └── loomchild │ │ └── maligna │ │ └── res │ │ ├── test │ │ ├── simpletext.al │ │ └── simpletext.tmx │ │ └── xml │ │ └── al.xsd │ └── test │ └── java │ └── net │ └── loomchild │ └── maligna │ ├── model │ ├── length │ │ ├── MutableLengthModelTest.java │ │ └── LengthModelUtilTest.java │ ├── language │ │ ├── MutableLanguageModelTest.java │ │ └── LanguageModelUtilTest.java │ ├── translation │ │ ├── TargetDataProbabilityComparatorTest.java │ │ ├── MutableTranslationModelTest.java │ │ └── MutableSourceDataTest.java │ └── vocabulary │ │ └── VocabularyTest.java │ ├── calculator │ ├── meta │ │ └── CompositeCalculatorTest.java │ └── length │ │ └── PoissonDistributionCalculatorTest.java │ ├── filter │ ├── modifier │ │ └── modify │ │ │ ├── split │ │ │ ├── SplitAlgorithmMockTest.java │ │ │ ├── SentenceSplitAlgorithmTest.java │ │ │ └── WordSplitAlgorithmTest.java │ │ │ └── merge │ │ │ └── SeparatorMergeAlgorithmTest.java │ ├── aligner │ │ ├── align │ │ │ ├── hmm │ │ │ │ ├── viterbi │ │ │ │ │ └── ViterbiAlgorithmTest.java │ │ │ │ └── fb │ │ │ │ │ └── ForwardBackwardAlgorithmTest.java │ │ │ └── 
AlignAlgorithmMockTest.java │ │ ├── AlignerTest.java │ │ └── UnifyAlignerTest.java │ ├── selector │ │ ├── DifferenceSelectorTest.java │ │ ├── IntersectionSelectorTest.java │ │ └── OneToOneSelectorTest.java │ ├── macro │ │ └── MooreMacroTest.java │ └── meta │ │ ├── IgnoreInfiniteProbabilityAlignmentsFilterDecoratorTest.java │ │ └── CompositeFilterTest.java │ ├── parser │ ├── PlaintextParserTest.java │ ├── AlParserTest.java │ └── TmxParserTest.java │ ├── formatter │ ├── AlFormatterTest.java │ ├── TmxFormatterTest.java │ └── PresentationFormatterTest.java │ └── coretypes │ └── AlignmentTest.java ├── LICENSE.txt ├── .gitignore └── CHANGELOG.md /maligna-ui/src/main/java/net/loomchild/maligna/ui/console/command/Command.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.ui.console.command; 2 | 3 | 4 | public interface Command { 5 | 6 | public String getName(); 7 | 8 | public void run(String[] args); 9 | 10 | } 11 | -------------------------------------------------------------------------------- /maligna-ui/src/main/scripts/maligna: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR=`dirname $0` 4 | PROJECT_HOME=$SCRIPT_DIR/.. 5 | JARS=$PROJECT_HOME/lib/* 6 | CLASS=net.loomchild.maligna.ui.console.Maligna 7 | 8 | exec java -cp "$CLASSPATH:$PROJECT_HOME/build/classes:$JARS" $CLASS $* 9 | -------------------------------------------------------------------------------- /maligna-ui/src/main/scripts/maligna.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | set SCRIPT_DIR=%~d0%~p0 4 | set PROJECT_HOME=%SCRIPT_DIR%.. 
5 | set JARS=%PROJECT_HOME%\lib\* 6 | set CLASS=net.loomchild.maligna.ui.console.Maligna 7 | 8 | java -cp "%CLASSPATH%";"%PROJECT_HOME%\build\classes";"%JARS%" %CLASS% %* 9 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/matrix/MatrixFactory.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.matrix; 2 | 3 | /** 4 | * Represents matrix factory. 5 | * Enables to create a matrix of given size without knowing the actual 6 | * matrix type. 7 | * 8 | * @author loomchild 9 | */ 10 | public interface MatrixFactory { 11 | 12 | public Matrix createMatrix(int width, int height); 13 | 14 | } 15 | -------------------------------------------------------------------------------- /maligna-ui/src/main/java/net/loomchild/maligna/ui/console/command/exception/ParametersParseException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.ui.console.command.exception; 2 | 3 | public class ParametersParseException extends CommandException { 4 | 5 | private static final long serialVersionUID = 4883570247314804577L; 6 | 7 | public ParametersParseException(Throwable cause) { 8 | super("Error parsing parameters.", cause); 9 | } 10 | 11 | } 12 | -------------------------------------------------------------------------------- /maligna-ui/src/main/java/net/loomchild/maligna/ui/console/command/exception/MissingParameterException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.ui.console.command.exception; 2 | 3 | public class MissingParameterException extends CommandException { 4 | 5 | private static final long serialVersionUID = 6250339071503029391L; 6 | 7 | public MissingParameterException(String parameter) { 8 | super("Missing " + parameter + " parameter."); 9 | } 10 | 11 | } 12 | 
-------------------------------------------------------------------------------- /maligna-ui/src/main/java/net/loomchild/maligna/ui/console/command/exception/UnknownParameterException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.ui.console.command.exception; 2 | 3 | public class UnknownParameterException extends CommandException { 4 | 5 | private static final long serialVersionUID = 917612043442645990L; 6 | 7 | public UnknownParameterException(String parameter) { 8 | super("Unknown " + parameter + " parameter."); 9 | } 10 | 11 | } 12 | -------------------------------------------------------------------------------- /maligna-ui/src/main/java/net/loomchild/maligna/ui/console/command/exception/ParameterFormatException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.ui.console.command.exception; 2 | 3 | public class ParameterFormatException extends CommandException { 4 | 5 | private static final long serialVersionUID = 917612043442645990L; 6 | 7 | public ParameterFormatException(String parameter) { 8 | super("Invlid " + parameter + " parameter format."); 9 | } 10 | 11 | } 12 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/calculator/length/counter/CharCounter.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.calculator.length.counter; 2 | 3 | /** 4 | * Responsible for calculating length of a segment in characters. 5 | * @author loomchild 6 | */ 7 | public class CharCounter implements Counter { 8 | 9 | /** 10 | * Returns segment length. 
11 | */ 12 | public int calculateLength(String segment) { 13 | return segment.length(); 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/matrix/FullMatrixFactory.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.matrix; 2 | 3 | /** 4 | * Represents {@link FullMatrix} factory. 5 | * Responsible for creating {@link FullMatrix} objects. 6 | * 7 | * @author loomchild 8 | */ 9 | public class FullMatrixFactory implements MatrixFactory { 10 | 11 | public Matrix createMatrix(int width, int height) { 12 | return new FullMatrix(width, height); 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /maligna-ui/examples/script/example3: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR=`dirname $0` 4 | PROJECT_HOME=$SCRIPT_DIR/../.. 5 | 6 | $PROJECT_HOME/bin/maligna parse -c txt $PROJECT_HOME/example/txt/poznan-pl.txt $PROJECT_HOME/example/txt/poznan-de.txt | $PROJECT_HOME/bin/maligna modify -c split-sentence | $PROJECT_HOME/bin/maligna modify -c trim | $PROJECT_HOME/bin/maligna align -c viterbi -a oracle,normal -d $PROJECT_HOME/example/align/human/poznan-oracle.al -n char -s iterative-band > poznan-oracle.al 7 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/clean/LowercaseCleanAlgorithm.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.clean; 2 | 3 | 4 | /** 5 | * Represents clean algorithm changing input segment to lower case. 
6 | * @author loomchild 7 | */ 8 | public class LowercaseCleanAlgorithm extends CleanAlgorithm { 9 | 10 | public String clean(String segment) { 11 | String newSegment = segment.toLowerCase(); 12 | return newSegment; 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/util/IORuntimeException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.util; 2 | 3 | import java.io.IOException; 4 | 5 | public class IORuntimeException extends RuntimeException { 6 | 7 | private static final long serialVersionUID = -6587044052300876023L; 8 | 9 | public IORuntimeException(IOException exception) { 10 | super(exception); 11 | } 12 | 13 | public void rethrow() throws IOException { 14 | throw (IOException) getCause(); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /maligna-ui/src/main/java/net/loomchild/maligna/ui/console/command/exception/WrongArgumentCountException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.ui.console.command.exception; 2 | 3 | public class WrongArgumentCountException extends CommandException { 4 | 5 | private static final long serialVersionUID = 4883570247314804577L; 6 | 7 | public WrongArgumentCountException(String expected, int actual) { 8 | super("Wrong argument count. Expected " + expected + ", but was " + actual + "."); 9 | } 10 | 11 | } 12 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/util/date/InvalidDateException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.util.date; 2 | 3 | /** 4 | * This exception is thrown by {@link DateParser} when it encounters an 5 | * invalid date. 
6 | * 7 | * @author loomchild 8 | */ 9 | public class InvalidDateException extends RuntimeException { 10 | 11 | private static final long serialVersionUID = 7608394842578468135L; 12 | 13 | public InvalidDateException(String message) { 14 | super(message); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/macro/Macro.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.macro; 2 | 3 | import net.loomchild.maligna.filter.Filter; 4 | 5 | /** 6 | * Represents a macro filter which consists of multiple filtering operations 7 | * (for example complete alignment using Moore's algorithm - 8 | * see {@link MooreMacro}). 9 | * Created to simplify complex operations and improve the performance. 10 | * @author loomchild 11 | */ 12 | public interface Macro extends Filter { 13 | 14 | } 15 | -------------------------------------------------------------------------------- /maligna-ui/examples/script/example3.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | set SCRIPT_DIR=%~d0%~p0 4 | set PROJECT_HOME=%SCRIPT_DIR%..\.. 5 | set BIN_DIR="%PROJECT_HOME%\bin" 6 | set EXAMPLE_DIR="%PROJECT_HOME%\example" 7 | 8 | %BIN_DIR%\maligna parse -c txt %EXAMPLE_DIR%/txt/poznan-pl.txt %EXAMPLE_DIR%/txt/poznan-de.txt | %BIN_DIR%\maligna modify -c split-sentence | %BIN_DIR%\maligna modify -c trim | %BIN_DIR%\maligna align -c viterbi -a oracle,normal -d %EXAMPLE_DIR%/align/human/poznan-oracle.al -n char -s iterative-band > poznan-oracle.al 9 | -------------------------------------------------------------------------------- /maligna-ui/examples/script/example1: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR=`dirname $0` 4 | PROJECT_HOME=$SCRIPT_DIR/../.. 
5 | 6 | $PROJECT_HOME/bin/maligna parse -c txt $PROJECT_HOME/example/txt/poznan-pl.txt $PROJECT_HOME/example/txt/poznan-de.txt | $PROJECT_HOME/bin/maligna modify -c split-sentence | $PROJECT_HOME/bin/maligna modify -c trim | $PROJECT_HOME/bin/maligna align -c viterbi -a normal -n char -s iterative-band | $PROJECT_HOME/bin/maligna select -c one-to-one | $PROJECT_HOME/bin/maligna format -c txt poznan-pl-align.txt poznan-de-align.txt 7 | 8 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/parser/TmxParseException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.parser; 2 | 3 | /** 4 | * Represents TMX document parsing exception 5 | * 6 | * @author Jarek Lipski (loomchild) 7 | */ 8 | public class TmxParseException extends RuntimeException { 9 | 10 | private static final long serialVersionUID = 5752610837896744124L; 11 | 12 | public TmxParseException(String message) { 13 | super(message); 14 | } 15 | 16 | 17 | public TmxParseException(String message, Throwable cause) { 18 | super(message, cause); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /maligna-ui/examples/script/example1.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | set SCRIPT_DIR=%~d0%~p0 4 | set PROJECT_HOME=%SCRIPT_DIR%..\.. 
5 | set BIN_DIR="%PROJECT_HOME%\bin" 6 | set EXAMPLE_DIR="%PROJECT_HOME%\example" 7 | 8 | %BIN_DIR%\maligna parse -c txt %EXAMPLE_DIR%/txt/poznan-pl.txt %EXAMPLE_DIR%/txt/poznan-de.txt | %BIN_DIR%\maligna modify -c split-sentence | %BIN_DIR%\maligna modify -c trim | %BIN_DIR%\maligna align -c viterbi -a normal -n char -s iterative-band | %BIN_DIR%\maligna select -c one-to-one | %BIN_DIR%\maligna format -c txt poznan-pl-align.txt poznan-de-align.txt 9 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/model/ModelParseException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model; 2 | 3 | /** 4 | * Represents model or vocabulary parse exception. 5 | * 6 | * @author Jarek Lipski (loomchild) 7 | */ 8 | public class ModelParseException extends RuntimeException { 9 | 10 | private static final long serialVersionUID = 6105226270677843760L; 11 | 12 | public ModelParseException(String message) { 13 | super(message); 14 | } 15 | 16 | 17 | public ModelParseException(String message, Throwable cause) { 18 | super(message, cause); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /maligna-ui/src/main/java/net/loomchild/maligna/ui/console/command/exception/CommandException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.ui.console.command.exception; 2 | 3 | public class CommandException extends RuntimeException { 4 | 5 | private static final long serialVersionUID = 1962758074740784763L; 6 | 7 | public CommandException(String message) { 8 | super(message); 9 | } 10 | 11 | public CommandException(Throwable cause) { 12 | super(cause); 13 | } 14 | 15 | public CommandException(String message, Throwable cause) { 16 | super(message, cause); 17 | } 18 | 19 | } 20 | 
-------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/clean/FilterNonWordsCleanAlgorithm.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.clean; 2 | 3 | 4 | /** 5 | * Represents clean algorithm removing all segments that do not contain any 6 | * letters. 7 | * @author loomchild 8 | */ 9 | public class FilterNonWordsCleanAlgorithm extends CleanAlgorithm { 10 | 11 | public String clean(String segment) { 12 | for (int i = 0; i < segment.length(); ++i) { 13 | if (Character.isLetter(segment.charAt(i))) { 14 | return segment; 15 | } 16 | } 17 | return null; 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/parser/Parser.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.parser; 2 | 3 | import java.util.List; 4 | 5 | import net.loomchild.maligna.coretypes.Alignment; 6 | 7 | 8 | /** 9 | * Represents input text(s) parser that creates alignment list. 10 | * Input file or files are configured in constructor of a concrete parser 11 | * implementation. 12 | * 13 | * @author loomchild 14 | */ 15 | public interface Parser { 16 | 17 | /** 18 | * Parses input document into an alignment list. 19 | * @return parsed alignment list 20 | */ 21 | public List parse(); 22 | 23 | } 24 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/util/ResourceNotFoundException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.util; 2 | 3 | /** 4 | * Represents an exception that is thrown when a resource (usually a file) 5 | * cannot be found. 
6 | * @author loomchild 7 | * 8 | */ 9 | public class ResourceNotFoundException extends RuntimeException { 10 | 11 | private static final long serialVersionUID = 318909218824445026L; 12 | 13 | public ResourceNotFoundException(String name) { 14 | super(name); 15 | } 16 | 17 | public ResourceNotFoundException(String name, Throwable cause) { 18 | super(name, cause); 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/util/bind/BindException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.util.bind; 2 | 3 | /** 4 | * Represents JAXB parse / format exception. 5 | * 6 | * @author Jarek Lipski (loomchild) 7 | */ 8 | public class BindException extends RuntimeException { 9 | 10 | private static final long serialVersionUID = 4729174015756939625L; 11 | 12 | public BindException(String message) { 13 | super(message); 14 | } 15 | 16 | public BindException(String message, Throwable cause) { 17 | super(message, cause); 18 | } 19 | 20 | public BindException(Throwable cause) { 21 | super(cause); 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/calculator/length/counter/Counter.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.calculator.length.counter; 2 | 3 | /** 4 | * Responsible for calculating length of given segment. For example 5 | * it can return number of characters (see {@link CharCounter}), 6 | * number of words (see {@link SplitCounter}) or any other measure. 7 | * @author loomchild 8 | */ 9 | public interface Counter { 10 | 11 | /** 12 | * Calculates length of a segment. 
13 | * 14 | * @param segment segment 15 | * @return length of a segment, >= 0 16 | */ 17 | public int calculateLength(String segment); 18 | 19 | } 20 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/calculator/CalculatorMock.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.calculator; 2 | 3 | import java.util.List; 4 | 5 | 6 | /** 7 | * Calculator mock returning always predefined score, no matter what segments 8 | * are passed in. Used for testing. 9 | * 10 | * @author Jarek Lipski (loomchild) 11 | */ 12 | public class CalculatorMock implements Calculator { 13 | 14 | private float score; 15 | 16 | public CalculatorMock(float score) { 17 | this.score = score; 18 | } 19 | 20 | public float calculateScore(List sourceSegmentList, List targetSegmentList) { 21 | return score; 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/clean/TrimCleanAlgorithm.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.clean; 2 | 3 | 4 | /** 5 | * Represents clean algorithm trimming all the segments (removing leading and 6 | * trailing whitespace). 7 | * It also omits segments that are empty after trimming. 
8 | * 9 | * @author loomchild 10 | */ 11 | public class TrimCleanAlgorithm extends CleanAlgorithm { 12 | 13 | public String clean(String segment) { 14 | String newSegment = segment.trim(); 15 | if (newSegment.length() == 0) { 16 | newSegment = null; 17 | } 18 | return newSegment; 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /maligna/src/main/resources/net/loomchild/maligna/res/test/simpletext.al: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | First sentence. 6 | Second sentence. 7 | 8 | 9 | Pierwsze zdanie. 10 | 11 | 12 | 13 | 14 | 15 | 16 | Drugie zdanie. 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/meta/FilterDecorators.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.meta; 2 | 3 | import net.loomchild.maligna.filter.Filter; 4 | 5 | /** 6 | * Filter decorator helper methods. 7 | * @author loomchild 8 | */ 9 | public class FilterDecorators { 10 | 11 | /** 12 | * Decorate given filter with standard decorators. Currently uses only 13 | * {@link IgnoreInfiniteProbabilityAlignmentsFilterDecorator}. 14 | * 15 | * @param filter 16 | * @return decorated filter 17 | */ 18 | public static Filter decorate(Filter filter) { 19 | filter = new IgnoreInfiniteProbabilityAlignmentsFilterDecorator(filter); 20 | return filter; 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/matrix/PositionOutsideBandException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.matrix; 2 | 3 | /** 4 | * Represents an exception when user tries to access {@link BandMatrix} 5 | * element outside matrix band. 
6 | * @see BandMatrix 7 | * @author loomchild 8 | */ 9 | public class PositionOutsideBandException extends RuntimeException { 10 | 11 | private static final long serialVersionUID = 4329499541774129117L; 12 | 13 | public PositionOutsideBandException(int x, int y, int width, int height, 14 | int bandWidth) { 15 | super("Position " + "(" + x + ", " + y + ") in matrix of size (" + 16 | width + ", " + height + ") outside band of width " + bandWidth); 17 | } 18 | 19 | 20 | } 21 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/NullModifyAlgorithm.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * Represents modify algorithm that is not changing the input in any way. 7 | * Useful when we want to perform operation just on source or target 8 | * segments and leave the other as it is. 9 | * Null design pattern. 10 | * @author loomchild 11 | */ 12 | public class NullModifyAlgorithm implements ModifyAlgorithm { 13 | 14 | /** 15 | * @param segmentList source segment list 16 | * @return unmodified source segment list 17 | */ 18 | public List modify(List segmentList) { 19 | return segmentList; 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /maligna-ui/examples/script/example2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR=`dirname $0` 4 | PROJECT_HOME=$SCRIPT_DIR/../.. 
5 | 6 | $PROJECT_HOME/bin/maligna parse -c txt $PROJECT_HOME/example/txt/poznan-pl.txt $PROJECT_HOME/example/txt/poznan-de.txt | $PROJECT_HOME/bin/maligna modify -c split-sentence | $PROJECT_HOME/bin/maligna modify -c trim > poznan-split.al 7 | 8 | cat poznan-split.al | $PROJECT_HOME/bin/maligna align -c fb -a poisson -n word -s iterative-band | $PROJECT_HOME/bin/maligna select -c one-to-one | $PROJECT_HOME/bin/maligna select -c fraction -f 0.85 > poznan-align-length.al 9 | 10 | cat poznan-split.al | $PROJECT_HOME/bin/maligna align -c fb -a poisson,translation -n word -s iterative-band -t poznan-align-length.al > poznan-align.al 11 | -------------------------------------------------------------------------------- /maligna-ui/examples/script/example2.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | set SCRIPT_DIR=%~d0%~p0 4 | set PROJECT_HOME=%SCRIPT_DIR%..\.. 5 | set BIN_DIR="%PROJECT_HOME%\bin" 6 | set EXAMPLE_DIR="%PROJECT_HOME%\example" 7 | 8 | %BIN_DIR%\maligna parse -c txt %EXAMPLE_DIR%\txt\poznan-pl.txt %EXAMPLE_DIR%\txt\poznan-de.txt | %BIN_DIR%\maligna modify -c split-sentence | %BIN_DIR%\maligna modify -c trim > poznan-split.al 9 | 10 | type poznan-split.al | %BIN_DIR%\maligna align -c fb -a poisson -n word -s iterative-band | %BIN_DIR%\maligna select -c one-to-one | %BIN_DIR%\maligna select -c fraction -f 0.85 > poznan-align-length.al 11 | 12 | type poznan-split.al | %BIN_DIR%\maligna align -c fb -a poisson,translation -n word -s iterative-band -t poznan-align-length.al > poznan-align.al 13 | -------------------------------------------------------------------------------- /maligna/src/main/resources/net/loomchild/maligna/res/test/simpletext.tmx: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 7 | Pierwsze zdanie. 8 | 9 | 10 | 0.1 11 | First sentence. 12 | 13 | 14 | 15 | 16 | Second sentence. 17 | 18 | 19 | 20 | 21 | Erste Satz. 22 | 23 | 24 | Zweite Satz. 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/formatter/Formatter.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.formatter; 2 | 3 | import java.util.List; 4 | 5 | import net.loomchild.maligna.coretypes.Alignment; 6 | 7 | 8 | /** 9 | * Represents a alignment list formatter / writer. 10 | * Responsible for writing alignment list in specific format. 11 | * The output location (file, set of files) are defined in individual subclasses 12 | * and should be configured in class constructor. 13 | * 14 | * @author Jarek Lipski (loomchild) 15 | */ 16 | public interface Formatter { 17 | 18 | /** 19 | * Formats alignment list to previously defined location depending 20 | * on concrete implementation. 21 | * @param alignmentList alignment list 22 | */ 23 | public void format(List alignmentList); 24 | 25 | } 26 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/aligner/align/hmm/Util.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.aligner.align.hmm; 2 | 3 | import net.loomchild.maligna.matrix.Matrix; 4 | 5 | /** 6 | * Represents alignment algorithm utilities. 7 | * @author loomchild 8 | */ 9 | public class Util { 10 | 11 | /** 12 | * Checks if x (y) is greater than 0 and less than matrix width (height) 13 | * and if the element stored at this position is not null. 
14 | * @param matrix 15 | * @param x 16 | * @param y 17 | * @return true if element exists at given position 18 | */ 19 | public static boolean elementExists(Matrix matrix, int x, int y) { 20 | return (x >= 0 && y >= 0 21 | && x < matrix.getWidth() && y < matrix.getHeight() 22 | && matrix.get(x, y) != null); 23 | } 24 | 25 | 26 | } 27 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/model/length/LengthModelUtil.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.length; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * Represents length model utilities. 7 | * @author loomchild 8 | */ 9 | public class LengthModelUtil { 10 | 11 | /** 12 | * Trains a length model using given segments corpus 13 | * @param segmentLengthList segment list 14 | * @return created length model reflecting segment length occurrence 15 | * probabilities 16 | */ 17 | public static LengthModel train(List segmentLengthList) { 18 | MutableLengthModel model = new MutableLengthModel(); 19 | 20 | for (int segmentLength : segmentLengthList) { 21 | model.addLengthOccurence(segmentLength); 22 | } 23 | model.normalize(); 24 | 25 | return model; 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/util/ImpossibleException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.util; 2 | 3 | /** 4 | * Represents exception that should never occur. Should be thrown when You want 5 | * to swallow other exception because you are sure that it won't be thrown. 
6 | * 7 | * @author Jarek Lipski (loomchild) 8 | */ 9 | public class ImpossibleException extends RuntimeException { 10 | 11 | private static final long serialVersionUID = -5899108883494773808L; 12 | 13 | public ImpossibleException() { 14 | } 15 | 16 | public ImpossibleException(String message) { 17 | super(message); 18 | } 19 | 20 | public ImpossibleException(String message, Throwable cause) { 21 | super(message, cause); 22 | } 23 | 24 | public ImpossibleException(Throwable cause) { 25 | super(cause); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/model/Util.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | public class Util { 7 | 8 | /** 9 | * Converts two dimensional integer (representing word ids) array into a 10 | * list of lists. Used for testing purposes. 11 | * 12 | * @param widArray two dimensional integer array 13 | * @return list of lists. 14 | */ 15 | public static List< List > createWidList(int[][] widArray) { 16 | List< List > widList = new ArrayList< List >(); 17 | for (int[] widArrayGroup : widArray) { 18 | List widListGroup = new ArrayList(); 19 | for (int wid : widArrayGroup) { 20 | widListGroup.add(wid); 21 | } 22 | widList.add(widListGroup); 23 | } 24 | return widList; 25 | } 26 | 27 | 28 | } 29 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/ModifyAlgorithm.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify; 2 | 3 | import java.util.List; 4 | 5 | import net.loomchild.maligna.filter.modifier.Modifier; 6 | 7 | /** 8 | * Represents modify algorithm used by {@link Modifier}. 
9 | * @author loomchild 10 | */ 11 | public interface ModifyAlgorithm { 12 | 13 | /** 14 | * Returns segment list containing modified input segment list. 15 | * Modification can include merging or splitting of elements (resulting 16 | * list can have different size than input list). 17 | * Does not distinguish between source and target segments (does not 18 | * know which ones are processed). 19 | * @param segmentList source segment list 20 | * @return modified segment list 21 | */ 22 | public List modify(List segmentList); 23 | 24 | } 25 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/model/translation/SourceData.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.translation; 2 | 3 | import java.util.List; 4 | 5 | 6 | /** 7 | * Represents source word translations with probabilities. 8 | * 9 | * @author Jarek Lipski (loomchild) 10 | */ 11 | public interface SourceData { 12 | 13 | /** 14 | * Returns probability of translating this source word to a target 15 | * word with given id. 16 | * @param targetWid target word id 17 | * @return translation probability 18 | */ 19 | public double getTranslationProbability(int targetWid); 20 | 21 | /** 22 | * Returns immutable list of translations with probability greater than 23 | * zero, sorted by probability descending. 
24 | * @return list of translations (essentially word, probability pairs) 25 | */ 26 | public List getTranslationList(); 27 | 28 | } 29 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/model/translation/TargetDataProbabilityComparator.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.translation; 2 | 3 | import java.io.Serializable; 4 | import java.util.Comparator; 5 | 6 | /** 7 | * Responsible for comparing {@link TargetData} objects by translation 8 | * probability and ordering them from the most probable to the least probable. 9 | * @author loomchild 10 | */ 11 | public class TargetDataProbabilityComparator 12 | implements Comparator, Serializable { 13 | 14 | private static final long serialVersionUID = -9161863179489700671L; 15 | 16 | public int compare(TargetData o1, TargetData o2) { 17 | double difference = o2.getProbability() - o1.getProbability(); 18 | if (difference > 0) { 19 | return 1; 20 | } else if (difference < 0) { 21 | return -1; 22 | } else { 23 | return 0; 24 | } 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /maligna-ui/src/main/java/net/loomchild/maligna/ui/console/command/TestCommand.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.ui.console.command; 2 | 3 | import org.junit.extensions.cpsuite.ClasspathSuite; 4 | import org.junit.runner.RunWith; 5 | 6 | import org.junit.internal.TextListener; 7 | import org.junit.runner.JUnitCore; 8 | 9 | 10 | 11 | /** 12 | * Test suite containing all tests executed using console. 
13 | * @author loomchild 14 | */ 15 | @RunWith(ClasspathSuite.class) 16 | @ClasspathSuite.IncludeJars(true) 17 | @ClasspathSuite.ClassnameFilters({"net\\.loomchild\\.maligna\\..*"}) 18 | public class TestCommand implements Command { 19 | 20 | public String getName() { 21 | return "test"; 22 | } 23 | 24 | public void run(String[] args) { 25 | JUnitCore core = new JUnitCore(); 26 | core.addListener(new TextListener(System.out)); 27 | core.run(TestCommand.class); 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/model/translation/EmptySourceData.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.translation; 2 | 3 | import java.util.Collections; 4 | import java.util.List; 5 | 6 | 7 | /** 8 | * Represents empty translation data of source word - 9 | * essentially it means that source word has no translations. 10 | * Always returns empty translation list and probability of translating to 11 | * any word is always zero. 12 | * 13 | * @author Jarek Lipski (loomchild) 14 | */ 15 | class EmptySourceData implements SourceData { 16 | 17 | /** 18 | * @param targetWid target word id 19 | * @return always zero 20 | */ 21 | public double getTranslationProbability(int targetWid) { 22 | assert targetWid >= 0; 23 | return 0; 24 | } 25 | 26 | /** 27 | * @return empty translation list 28 | */ 29 | public List getTranslationList() { 30 | return Collections.emptyList(); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/model/translation/TargetData.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.translation; 2 | 3 | 4 | /** 5 | * Represents single source word translation. 
6 | * Responsible for storing target word id and translation probability. 7 | * @author loomchild 8 | */ 9 | public class TargetData { 10 | 11 | private int wid; 12 | 13 | private double probability; 14 | 15 | /** 16 | * Creates target data. 17 | * @param wid target word id 18 | * @param probability translation probability 19 | */ 20 | public TargetData(int wid, double probability) { 21 | this.wid = wid; 22 | this.probability = probability; 23 | } 24 | 25 | /** 26 | * @return target word id 27 | */ 28 | public int getWid() { 29 | return wid; 30 | } 31 | 32 | /** 33 | * @return source to target word translation probability 34 | */ 35 | public double getProbability() { 36 | return probability; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/aligner/align/hmm/viterbi/ViterbiAlgorithmFactory.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.aligner.align.hmm.viterbi; 2 | 3 | import java.util.Map; 4 | 5 | import net.loomchild.maligna.calculator.Calculator; 6 | import net.loomchild.maligna.coretypes.Category; 7 | import net.loomchild.maligna.filter.aligner.align.AlignAlgorithm; 8 | import net.loomchild.maligna.filter.aligner.align.hmm.HmmAlignAlgorithmFactory; 9 | import net.loomchild.maligna.matrix.MatrixFactory; 10 | 11 | /** 12 | * Factory always producing objects of {@link ViterbiAlgorithm}. 
13 | * @author loomchild 14 | */ 15 | public class ViterbiAlgorithmFactory implements HmmAlignAlgorithmFactory { 16 | 17 | public AlignAlgorithm createAlignAlgorithm(Calculator calculator, 18 | Map categoryMap, MatrixFactory matrixFactory) { 19 | return new ViterbiAlgorithm(calculator, categoryMap, matrixFactory); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/aligner/align/hmm/fb/ForwardBackwardAlgorithmFactory.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.aligner.align.hmm.fb; 2 | 3 | import java.util.Map; 4 | 5 | import net.loomchild.maligna.calculator.Calculator; 6 | import net.loomchild.maligna.coretypes.Category; 7 | import net.loomchild.maligna.filter.aligner.align.hmm.HmmAlignAlgorithmFactory; 8 | import net.loomchild.maligna.filter.aligner.align.AlignAlgorithm; 9 | import net.loomchild.maligna.matrix.MatrixFactory; 10 | 11 | /** 12 | * Factory always producing objects of {@link ForwardBackwardAlgorithm}. 13 | * @author loomchild 14 | */ 15 | public class ForwardBackwardAlgorithmFactory implements HmmAlignAlgorithmFactory { 16 | 17 | public AlignAlgorithm createAlignAlgorithm(Calculator calculator, 18 | Map categoryMap, MatrixFactory matrixFactory) { 19 | return new ForwardBackwardAlgorithm(calculator, categoryMap, 20 | matrixFactory); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /maligna-ui/examples/txt/poznan-small-de.txt: -------------------------------------------------------------------------------- 1 | Ein ganzes Jahr über ist der See für Angler geöffnet, die vor allem Kleine Maränen, aber auch z. B. auf Aale, Zander, Hechte, Barsche, Weiße Amure, Karpfen oder Brassen fangen können. Angeln kann man sowohl vom Ufer, als auch vom Boot aus. 
Jedes Jahr werden hier vom Polnischen Angelverein Jungfische ausgesetzt. 2 | Am Kierskie - See gibt es zwei bewachte Badestellen: in Krzyżowniki und Kiekrz, sowie zahlreiche Erholungs - und Schulungszentren. Da am Kierskie - See viele Segelsportzentren (im Winter Eissegelsportzentren) tätig sind, sind Anlegestellen ein markantes Landschaftselement. 3 | Der Stausee Malta wurde 1952 durch den Aufstau der Cybina geschaffen. Das Stauwehr befindet sich am westlichen Ende des Sees, unweit des Kreisverkehrs Śródka. Der Malta - See ist eine bekannte Regattastrecke; zu Beginn der 90er Jahre entstand an seinen Ufern ein ganzer Komplex von Sport - und Erholungsobjekten. Am westlichen Teil des Nordufers ist das Wohnviertel Komandoria gelegen. -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/model/translation/InitialSourceData.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.translation; 2 | 3 | import java.util.Collections; 4 | import java.util.List; 5 | 6 | 7 | /** 8 | * Represents source translation data in initial model before first training 9 | * iteration. In this state all translations of this source word are equally 10 | * probable and probability is equal to 1. This object is immutable. 11 | * 12 | * @author Jarek Lipski (loomchild) 13 | */ 14 | class InitialSourceData implements SourceData { 15 | 16 | /** 17 | * Always returns one for any word id. 
18 | * @param targetWid target word id, >= 0 19 | * @return always one 20 | */ 21 | public double getTranslationProbability(int targetWid) { 22 | assert targetWid >= 0; 23 | return 1; 24 | } 25 | 26 | /** 27 | * @return empty translation list 28 | */ 29 | public List getTranslationList() { 30 | return Collections.emptyList(); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/model/length/MutableLengthModelTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.length; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import org.junit.Test; 6 | 7 | /** 8 | * Represents {@link MutableLengthModel} unit test. 9 | * @author loomchild 10 | */ 11 | public class MutableLengthModelTest { 12 | 13 | /** 14 | * Checks if length probabilities are calculated correctly. 15 | */ 16 | @Test 17 | public void length() { 18 | MutableLengthModel model = new MutableLengthModel(); 19 | assertEquals(0.0f, model.getLengthProbability(3), 0.01f); 20 | model.addLengthOccurence(3); 21 | model.addLengthOccurence(3); 22 | model.addLengthOccurence(3); 23 | model.addLengthOccurence(2); 24 | model.normalize(); 25 | assertEquals(0.75f, model.getLengthProbability(3), 0.01f); 26 | assertEquals(0.25f, model.getLengthProbability(2), 0.01f); 27 | assertEquals(2.75f, model.getMeanLength(), 0.01f); 28 | } 29 | 30 | 31 | } 32 | -------------------------------------------------------------------------------- /maligna-ui/examples/txt/poznan-small-pl.txt: -------------------------------------------------------------------------------- 1 | Przez cały rok jezioro jest dostępne dla wędkarzy, którzy mogą liczyć przede wszystkim na sielawę, ale także np. na węgorza, sandacza, szczupaka, okonia, amura białego, karpia czy leszcza. Wędkować można zarówno z brzegu, jak i łodzi. 
Materiał zarybieniowy dla wędkarzy wprowadzany jest corocznie przez Polski Związek Wędkarski. 2 | Nad Jeziorem Kierskim znajdują się dwa strzeżone kąpieliska: w Krzyżownikach i Kiekrzu, oraz liczne ośrodki wypoczynkowe i szkoleniowe. Ponieważ nad Jeziorem Kierskim działają prężne ośrodki żeglarstwa (w zimie sportu bojerowego), charakterystycznym elementem krajobrazu są przystanie żeglarskie. 3 | Jezioro Malta jest zbiornikiem sztucznym. Powstało w 1952 r. przez spiętrzenie wód rzeki Cybiny. W latach 1980-1990 niecka zbiornika i jego otoczenie zostało gruntownie zmodernizowane. Po zakończonej w 1990 r.rekultywacji, jezioro Malta osiągnęło powierzchnię 64,0 ha. Średnia głębokość akwenu wynosi 3,13 m, ale są miejsca, w których sięga ona 5,0 m. -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/model/language/MutableLanguageModelTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.language; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import org.junit.Test; 6 | 7 | /** 8 | * Represents {@link MutableLanguageModel} unit test. 9 | * @author loomchild 10 | */ 11 | public class MutableLanguageModelTest { 12 | 13 | /** 14 | * Checks if model probabilities are calculated properly. 
15 | */ 16 | @Test 17 | public void word() { 18 | MutableLanguageModel model = new MutableLanguageModel(); 19 | assertEquals(0.0f, model.getWordProbability(1), 0.01f); 20 | model.addWordOccurence(1); 21 | model.addWordOccurence(1); 22 | model.addWordOccurence(1); 23 | model.addWordOccurence(2); 24 | model.normalize(); 25 | assertEquals(0.75f, model.getWordProbability(1), 0.01f); 26 | assertEquals(0.25f, model.getWordProbability(2), 0.01f); 27 | assertEquals(0.25f, model.getSingletonWordProbability(), 0.01f); 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/model/length/LengthModel.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.length; 2 | 3 | import java.io.Writer; 4 | 5 | /** 6 | * Represents a model of segment (sentence, paragraph) lengths for a particular 7 | * language, allowing to calculate given length probability and averages. 8 | * 9 | * @author loomchild 10 | */ 11 | public interface LengthModel { 12 | 13 | /** 14 | * Calculates probability that segment of given length will occur 15 | * in modeled language. 16 | * @param length segment length 17 | * @return length probability 18 | */ 19 | public float getLengthProbability(int length); 20 | 21 | /** 22 | * @return mean segment length in modeled language 23 | */ 24 | public float getMeanLength(); 25 | 26 | /** 27 | * Formats the length model to given writer in plaintext format. 28 | * Used for logging purposes and for storing models. 
29 | * @param writer writer 30 | */ 31 | public void format(Writer writer); 32 | 33 | } 34 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/matrix/BandMatrixFactory.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.matrix; 2 | 3 | /** 4 | * Represents {@link BandMatrix} factory. 5 | * Responsible for creating {@link BandMatrix} objects. 6 | * 7 | * @author loomchild 8 | */ 9 | public class BandMatrixFactory implements MatrixFactory { 10 | 11 | public static final int DEFAULT_BAND_RADIUS = 20; 12 | 13 | private int bandRadius; 14 | 15 | /** 16 | * Creates band matrix factory producing matrices with given radius. 17 | * @param bandRadius 18 | */ 19 | public BandMatrixFactory(int bandRadius) { 20 | this.bandRadius = bandRadius; 21 | } 22 | 23 | /** 24 | * Creates band matrix factory producing matrices with 25 | * {@value #DEFAULT_BAND_RADIUS}. 26 | */ 27 | public BandMatrixFactory() { 28 | this(DEFAULT_BAND_RADIUS); 29 | } 30 | 31 | public Matrix createMatrix(int width, int height) { 32 | return new BandMatrix(width, height, bandRadius); 33 | } 34 | 35 | 36 | 37 | } 38 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/model/translation/TargetDataProbabilityComparatorTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.translation; 2 | 3 | import static org.junit.Assert.assertTrue; 4 | 5 | import org.junit.Test; 6 | 7 | /** 8 | * Represents {@link TargetDataProbabilityComparator} unit test. 9 | * @author loomchild 10 | */ 11 | public class TargetDataProbabilityComparatorTest { 12 | 13 | /** 14 | * Simple comparator test. 
15 | */ 16 | @Test 17 | public void testCompareTo() { 18 | TargetData[] data = new TargetData[] { 19 | new TargetData(0, 0.1), new TargetData(1, 0.5), 20 | new TargetData(2, 0.5) 21 | }; 22 | TargetDataProbabilityComparator comparator = 23 | new TargetDataProbabilityComparator(); 24 | assertTrue(comparator.compare(data[0], data[1]) > 0); 25 | assertTrue(comparator.compare(data[1], data[0]) < 0); 26 | assertTrue(comparator.compare(data[0], data[0]) == 0); 27 | assertTrue(comparator.compare(data[1], data[2]) == 0); 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/model/length/LengthModelUtilTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.length; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | import org.junit.Test; 9 | 10 | 11 | /** 12 | * Represents {@link LengthModelUtil} test suite. 13 | * @author loomchild 14 | * 15 | */ 16 | public class LengthModelUtilTest { 17 | 18 | /** 19 | * Checks if length probabilities are calculated correctly. 
20 | */ 21 | @Test 22 | public void train() { 23 | List lengthList = Arrays.asList(new Integer[]{3, 1, 1, 0}); 24 | LengthModel model = LengthModelUtil.train(lengthList); 25 | assertEquals(0.25f, model.getLengthProbability(0), 0.01f); 26 | assertEquals(0.5f, model.getLengthProbability(1), 0.01f); 27 | assertEquals(0.0f, model.getLengthProbability(2), 0.01f); 28 | assertEquals(0.25f, model.getLengthProbability(3), 0.01f); 29 | assertEquals(1.25f, model.getMeanLength(), 0.01f); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/calculator/Calculator.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.calculator; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | *

Represents method of calculating probability of an alignment of given 7 | * source segments to given target segments. 8 | * It's the heart of alignment algorithm.

9 | * 10 | *

The actual implementation can calculate the result using just segment 11 | * lengths (package length) or contents of the segments (package content).

12 | * 13 | * @author Jarek Lipski (loomchild) 14 | */ 15 | public interface Calculator { 16 | 17 | /** 18 | * Calculates score (equal to -ln(probability)) of alignment of given 19 | * source segment to given target segments. 20 | * 21 | * @param sourceSegmentList source segment list 22 | * @param targetSegmentList target segment list 23 | * @return result (-ln(probability)) of the alignment, >= 0 24 | */ 25 | public float calculateScore(List sourceSegmentList, 26 | List targetSegmentList); 27 | 28 | } 29 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/calculator/meta/CompositeCalculatorTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.calculator.meta; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | import net.loomchild.maligna.calculator.Calculator; 9 | import net.loomchild.maligna.calculator.CalculatorMock; 10 | 11 | import org.junit.Test; 12 | 13 | /** 14 | * Represents unit test of {@link CompositeCalculator}. 15 | * @author loomchild 16 | */ 17 | public class CompositeCalculatorTest { 18 | 19 | /** 20 | * Checks using {@link CalculatorMock} that composite really returns 21 | * the sum of scores of all contained calculators. 
22 | */ 23 | @Test 24 | public void calculate() { 25 | List calculatorList = Arrays.asList(new Calculator[] { 26 | new CalculatorMock(0.5f), new CalculatorMock(0.25f)}); 27 | Calculator calculator = new CompositeCalculator(calculatorList); 28 | assertEquals(0.75f, calculator.calculateScore(null, null), 0.01f); 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/split/ParagraphSplitAlgorithm.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.split; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | 7 | /** 8 | * Represents simple split algorithm dividing input segment into paragraphs, 9 | * ending with end-of-line character ('\n'). 10 | * TODO: merge with WordSplitAlgorithm 11 | * @author Jarek Lipski (loomchild) 12 | */ 13 | public class ParagraphSplitAlgorithm extends SplitAlgorithm { 14 | 15 | public List split(String string) { 16 | List paragraphList = new ArrayList(); 17 | int start = 0; 18 | for (int end = 0; end < string.length(); ++end) { 19 | if (string.charAt(end) == '\n') { 20 | String paragraph = string.substring(start, end); 21 | paragraphList.add(paragraph); 22 | start = end + 1; 23 | } 24 | } 25 | if (start < string.length()) { 26 | String paragraph = string.substring(start); 27 | paragraphList.add(paragraph); 28 | } 29 | return paragraphList; 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /maligna/src/main/resources/net/loomchild/maligna/res/xml/al.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 
-------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/Filter.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter; 2 | 3 | import java.util.List; 4 | 5 | import net.loomchild.maligna.coretypes.Alignment; 6 | 7 | 8 | /** 9 | *

Represents an alignment list filter (in the sense of a UNIX filter).

10 | *

Allows performing any operation on an alignment list (not only, as the name 11 | * suggests, filtering elements from it) - for example it can modify segment 12 | * contents, join or split alignments, etc.

13 | *

14 | * Filter operation receives an alignment list as a parameter and returns 15 | * a modified alignment list. Because input and output have the 16 | * same type, filters can be connected together, creating an operation pipeline. 17 | *

18 | * 19 | * @author Jarek Lipski (loomchild) 20 | */ 21 | public interface Filter { 22 | 23 | /** 24 | * Performs any transformation on alignment list. 25 | * @param alignmentList input alignment list 26 | * @return output alignment list 27 | */ 28 | public List apply(List alignmentList); 29 | 30 | } 31 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/merge/MergeAlgorithm.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.merge; 2 | 3 | import static java.util.Collections.singletonList; 4 | 5 | import java.util.List; 6 | 7 | import net.loomchild.maligna.filter.modifier.modify.ModifyAlgorithm; 8 | 9 | 10 | /** 11 | * Represents algorithm merging a few segments into one. 12 | * This operation can add extra characters between segments or modify segment 13 | * contents - the important characteristic of it is that it always takes 14 | * segment list but returns just one segment. 15 | * 16 | * @author Jarek Lipski (loomchild) 17 | */ 18 | public abstract class MergeAlgorithm implements ModifyAlgorithm { 19 | 20 | public List modify(List segmentList) { 21 | return singletonList(merge(segmentList)); 22 | } 23 | 24 | /** 25 | * Merges segments from input list into one output segment. 
26 | * @param segmentList source segment list 27 | * @return output segment 28 | */ 29 | public abstract String merge(List segmentList); 30 | 31 | } 32 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/aligner/AlignmentImpossibleException.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.aligner; 2 | 3 | import net.loomchild.maligna.filter.aligner.align.AlignAlgorithm; 4 | import net.loomchild.maligna.filter.aligner.align.onetoone.OneToOneAlgorithm; 5 | 6 | 7 | /** 8 | * Represents an exception used by {@link AlignAlgorithm} to indicate that text 9 | * alignment is impossible. 10 | * 11 | * It is thrown for example when using {@link OneToOneAlgorithm} but the text 12 | * does not have equal number of source and target alignments, text is too 13 | * short to apply given algorithm, segment counts are different 14 | * in reference and input alignments in {@link UnifyAligner}, etc. 15 | * 16 | * @author Jarek Lipski (loomchild) 17 | */ 18 | public class AlignmentImpossibleException extends RuntimeException { 19 | 20 | private static final long serialVersionUID = 101L; 21 | 22 | /** 23 | * Creates an exception. 24 | * @param message 25 | */ 26 | public AlignmentImpossibleException(String message) { 27 | super(message); 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/calculator/meta/CompositeCalculator.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.calculator.meta; 2 | 3 | import java.util.List; 4 | 5 | import net.loomchild.maligna.calculator.Calculator; 6 | 7 | /** 8 | * Represents composite calculator. 
Alignment score is a sum of 9 | * scores returned by all calculators (equivalent of the product of probabilities 10 | * returned by all calculators). 11 | * @author loomchild 12 | */ 13 | public class CompositeCalculator implements Calculator { 14 | 15 | private List calculatorList; 16 | 17 | public CompositeCalculator(List calculatorList) { 18 | this.calculatorList = calculatorList; 19 | } 20 | 21 | public float calculateScore(List sourceSegmentList, 22 | List targetSegmentList) { 23 | float score = 0.0f; 24 | for (Calculator calculator : calculatorList) { 25 | score += calculator.calculateScore(sourceSegmentList, 26 | targetSegmentList); 27 | if (score == Float.POSITIVE_INFINITY) { 28 | break; 29 | } 30 | } 31 | assert score >= 0; 32 | return score; 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/selector/OneToOneSelector.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.selector; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import net.loomchild.maligna.coretypes.Alignment; 7 | import net.loomchild.maligna.filter.Filter; 8 | 9 | /** 10 | * Represents the filter that selects only one to one alignments and removes 11 | * the rest. 12 | * 13 | * @author Jarek Lipski (loomchild) 14 | */ 15 | public class OneToOneSelector implements Filter { 16 | 17 | /** 18 | * Filters the alignment list by leaving only 1-1 alignments. 
19 | * 20 | * @param alignmentList input alignment list 21 | * @return filtered alignment list 22 | */ 23 | public List apply(List alignmentList) { 24 | List filteredAlignmentList = new ArrayList(); 25 | for (Alignment alignment : alignmentList) { 26 | if (alignment.getSourceSegmentList().size() == 1 && 27 | alignment.getTargetSegmentList().size() == 1) { 28 | filteredAlignmentList.add(alignment); 29 | } 30 | } 31 | return filteredAlignmentList; 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/modifier/modify/split/SplitAlgorithmMockTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.split; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | import net.loomchild.maligna.filter.modifier.modify.merge.MergeAlgorithm; 9 | import net.loomchild.maligna.filter.modifier.modify.merge.SeparatorMergeAlgorithm; 10 | 11 | import org.junit.Test; 12 | 13 | 14 | /** 15 | * Tests {@link SplitAlgorithmMock}. This is a little paranoid because mock is 16 | * used for testing itself. 
17 | * @author loomchild 18 | */ 19 | public class SplitAlgorithmMockTest { 20 | 21 | @Test 22 | public void split() { 23 | String[] segments = new String[] {"aa", "bb", "c"}; 24 | MergeAlgorithm merger = new SeparatorMergeAlgorithm(""); 25 | String text = merger.merge(Arrays.asList(segments)); 26 | SplitAlgorithm splitter = new SplitAlgorithmMock(2); 27 | List splitted = splitter.split(text); 28 | String[] splittedArray = splitted.toArray(new String[splitted.size()]); 29 | assertEquals(segments, splittedArray); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2006-2015 Jarek Lipski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/util/bind/QuietValidationEventHandler.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.util.bind; 2 | 3 | import javax.xml.bind.ValidationEvent; 4 | import javax.xml.bind.ValidationEventHandler; 5 | 6 | import org.apache.commons.logging.Log; 7 | import org.apache.commons.logging.LogFactory; 8 | 9 | 10 | /** 11 | * XML validation event handler. On fatal error or error fails the valiadation. 12 | * Warnings are logged on DEBUG level. 13 | * 14 | * @author loomchild 15 | */ 16 | public class QuietValidationEventHandler implements ValidationEventHandler { 17 | 18 | private static Log log = LogFactory.getLog(QuietValidationEventHandler.class); 19 | 20 | public boolean handleEvent(ValidationEvent event) { 21 | if ((event.getSeverity() == ValidationEvent.FATAL_ERROR) || 22 | (event.getSeverity() == ValidationEvent.ERROR)) { 23 | return false; 24 | } else if (event.getSeverity() == ValidationEvent.WARNING) { 25 | log.debug("Validation warning: " + event.getMessage()); 26 | return true; 27 | } else { 28 | log.fatal("Unknown validation event severity: " + event.getSeverity()); 29 | return false; 30 | } 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/progress/ProgressObserver.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.progress; 2 | 3 | /** 4 | * Represents progress observer that listens to progress events. 5 | * 6 | * @author loomchild 7 | */ 8 | public interface ProgressObserver { 9 | 10 | /** 11 | * Occurs when {@link ProgressManager#registerProgressMeter(ProgressMeter)} 12 | * method is called, at the beginning of a process. 
/**
 * Represents alignment category - for example one source segment to
 * one target segment (1-1), two source segments to zero target segments (2-0),
 * etc.
 *
 * Immutable - cannot be modified once it is created. Two categories are
 * equal when both their segment counts are equal, so instances can be used
 * safely as hash map keys or set elements.
 *
 * @author Jarek Lipski (loomchild)
 */
public class Category {

    // final enforces the immutability promised by the class documentation
    private final int sourceSegmentCount;

    private final int targetSegmentCount;

    /**
     * Creates a category.
     * @param sourceSegmentCount number of source segments
     * @param targetSegmentCount number of target segments
     */
    public Category(int sourceSegmentCount, int targetSegmentCount) {
        this.sourceSegmentCount = sourceSegmentCount;
        this.targetSegmentCount = targetSegmentCount;
    }

    /**
     * @return Returns count of source segments in this category.
     */
    public int getSourceSegmentCount() {
        return sourceSegmentCount;
    }

    /**
     * @return Returns count of target segments in this category.
     */
    public int getTargetSegmentCount() {
        return targetSegmentCount;
    }

    /**
     * @return hash code derived from both segment counts
     */
    @Override
    public int hashCode() {
        return 31 * sourceSegmentCount + targetSegmentCount;
    }

    /**
     * @param obj object to compare with
     * @return true if obj is a category of the same class with the same
     *         source and target segment counts
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        Category other = (Category) obj;
        return sourceSegmentCount == other.sourceSegmentCount
                && targetSegmentCount == other.targetSegmentCount;
    }

    /**
     * @return category formatted as "(source-target)", for example "(1-1)"
     */
    @Override
    public String toString() {
        return "(" + sourceSegmentCount + "-" + targetSegmentCount + ")";
    }

}
21 | * @param calculator calculator 22 | * @param categoryMap map of possible alignment categories 23 | * @param matrixFactory factory creating matrices to be used by algorithm 24 | * @return align algorithm 25 | */ 26 | public AlignAlgorithm createAlignAlgorithm(Calculator calculator, 27 | Map categoryMap, MatrixFactory matrixFactory); 28 | 29 | } 30 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/modifier/modify/split/SentenceSplitAlgorithmTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.split; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.List; 6 | 7 | import org.junit.Before; 8 | import org.junit.Test; 9 | 10 | 11 | 12 | /** 13 | * Represents {@link SentenceSplitAlgorithm} unit test. 14 | * 15 | * @author Jarek Lipski (loomchild) 16 | */ 17 | public class SentenceSplitAlgorithmTest { 18 | 19 | public static final String TEXT = 20 | "Ala ma kota. Prof. Kot nie wie kim jest. Ech\nNic."; 21 | 22 | public static final String[] SEGMENT_ARRAY = 23 | {"Ala ma kota.", " Prof.", " Kot nie wie kim jest.", " Ech\n", 24 | "Nic."}; 25 | 26 | private SentenceSplitAlgorithm splitter; 27 | 28 | @Before 29 | public void setUp() { 30 | this.splitter = new SentenceSplitAlgorithm(); 31 | } 32 | 33 | /** 34 | * Tests simple split. 
/**
 * Simple structure of two values of any type. Either value may be null.
 * Two pairs are equal when they are of the same class and both their
 * first and second values are equal (or both null).
 *
 * @author loomchild
 * @param <F> type of the first value
 * @param <S> type of the second value
 */
public class Pair<F, S> {

    public F first;
    public S second;

    /**
     * Creates a pair.
     * @param first first value, can be null
     * @param second second value, can be null
     */
    public Pair(F first, S second) {
        this.first = first;
        this.second = second;
    }

    /**
     * @return hash code combining hash codes of both values; null counts as 0
     */
    @Override
    public int hashCode() {
        final int PRIME = 31;
        int result = 1;
        result = PRIME * result + ((first == null) ? 0 : first.hashCode());
        result = PRIME * result + ((second == null) ? 0 : second.hashCode());
        return result;
    }

    /**
     * @param obj object to compare with
     * @return true if obj is a pair of exactly the same class holding
     *         equal first and second values
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        // Wildcard cast: the type arguments are erased at runtime, only
        // the element values can be compared.
        final Pair<?, ?> other = (Pair<?, ?>) obj;
        return nullSafeEquals(first, other.first)
                && nullSafeEquals(second, other.second);
    }

    /**
     * Compares two possibly-null references using equals of the left one.
     */
    private static boolean nullSafeEquals(Object left, Object right) {
        return (left == null) ? (right == null) : left.equals(right);
    }

}
/**
 * Represents {@link Vocabulary} unit test.
 * @author loomchild
 */
public class VocabularyTest {

    /**
     * Performs various tests on vocabulary including adding words, getting
     * the word ids, etc. The assertions are order dependent: the first half
     * inspects a freshly created vocabulary, the second half inspects it
     * after a single word has been added.
     */
    @Test
    public void testVocabulary() {
        Vocabulary vocabulary = new Vocabulary();
        // Fresh vocabulary: the word is unknown and has no id yet.
        assertFalse(vocabulary.containsWord("a b"));
        assertNull(vocabulary.getWid("a b"));
        // The special NULL_WID is expected to be present from the start,
        // while an arbitrary id is not, and it does not count as a word.
        assertTrue(vocabulary.containsWid(Vocabulary.NULL_WID));
        assertFalse(vocabulary.containsWid(10));
        assertEquals(0, vocabulary.getWordCount());
        vocabulary.putWord("a b");
        assertEquals(1, vocabulary.getWordCount());
        assertTrue(vocabulary.containsWord("a b"));
        // The first added word is expected to receive id 1.
        int wid = 1;
        assertEquals(wid, (int)vocabulary.getWid("a b"));
        assertTrue(vocabulary.containsWid(wid));
        // Reverse lookup by id returns the original word.
        assertEquals("a b", vocabulary.getWord(wid));
    }

}
/**
 * Represents translation model which is basically probabilistic dictionary
 * - it stores source word translations to target words along with each
 * translation probability. Words are referred to by integer word ids.
 *
 * @see "Statistical Machine Translation"
 * @author Jarek Lipski (loomchild)
 */
public interface TranslationModel {

    /**
     * Retrieves translation data, including translation probabilities to
     * other words, for a word with given id.
     * @param sourceWid source word id, >= 0.
     * @return translation data
     */
    public SourceData get(int sourceWid);

    /**
     * Formats the translation model to given writer in plaintext format.
     * Used for logging purposes and for storing models.
     * @param writer writer
     * @param sourceVocabulary source language vocabulary
     * @param targetVocabulary target language vocabulary
     */
    public void format(Writer writer, Vocabulary sourceVocabulary,
            Vocabulary targetVocabulary);

}
6 | * Useful because dimensions can be big but the data is usually sparse, 7 | * so more sophisticated implementations than normal two-dimensional array 8 | * are desired. 9 | * 10 | * @author Jarek Lipski (loomchild) 11 | */ 12 | public interface Matrix { 13 | 14 | /** 15 | * @return matrix width (number of columns) 16 | */ 17 | public int getWidth(); 18 | 19 | /** 20 | * @return matrix height (number of rows) 21 | */ 22 | public int getHeight(); 23 | 24 | /** 25 | * @return real matrix size (number of stored elements, <= width * height 26 | */ 27 | public int getSize(); 28 | 29 | /** 30 | * Returns matrix element at given position. 31 | * @param x column 32 | * @param y row 33 | * @return element 34 | */ 35 | public T get(int x, int y); 36 | 37 | /** 38 | * Sets the matrix element at given position. 39 | * @param x column 40 | * @param y row 41 | * @param data element 42 | */ 43 | public void set(int x, int y, T data); 44 | 45 | /** 46 | * @see MatrixIterator 47 | * @return matrix iterator that will iterate over whole matrix 48 | * from top left to bottom right corner. 49 | */ 50 | public MatrixIterator getIterator(); 51 | 52 | } 53 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/split/WordSplitAlgorithm.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.split; 2 | 3 | import static java.lang.Character.isLetterOrDigit; 4 | import static java.lang.Character.isWhitespace; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * Represents simple split algorithm separating input segment into words. 11 | * Word boundaries are assumed to be everything that is not a character or 12 | * digit. Whitespace characters are removed from the output. 
13 | * 14 | * @author loomchild 15 | */ 16 | public class WordSplitAlgorithm extends SplitAlgorithm { 17 | 18 | public List split(String string) { 19 | List wordList = new ArrayList(); 20 | int start = 0; 21 | for (int end = 0; end < string.length(); ++end) { 22 | char ch = string.charAt(end); 23 | if (!isLetterOrDigit(ch)) { 24 | if ((end - start) > 0) { 25 | String word = string.substring(start, end); 26 | wordList.add(word); 27 | } 28 | if (!isWhitespace(ch)) { 29 | String word = string.substring(end, end + 1); 30 | wordList.add(word); 31 | } 32 | start = end + 1; 33 | } 34 | } 35 | if (start < string.length()) { 36 | String word = string.substring(start); 37 | wordList.add(word); 38 | } 39 | return wordList; 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/model/translation/MutableTranslationModelTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.translation; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import org.junit.Test; 6 | 7 | /** 8 | * Represents {@link MutableTranslationModel} unit test. 9 | * @author loomchild 10 | */ 11 | public class MutableTranslationModelTest { 12 | 13 | /** 14 | * Checks whether {@link MutableTranslationModel#normalize()} and 15 | * {@link MutableTranslationModel#sort()} work as expected. 
16 | */ 17 | @Test 18 | public void putNormalizeSort() { 19 | MutableTranslationModel model = new MutableTranslationModel(); 20 | 21 | assertEquals(0, model.get(0).getTranslationList().size()); 22 | 23 | model.getMutable(1).setTranslationProbability(0, 0.5); 24 | model.getMutable(1).setTranslationProbability(1, 1.5); 25 | 26 | assertEquals(0.5, model.get(1).getTranslationProbability(0), 0.00001); 27 | assertEquals(1.5, model.get(1).getTranslationProbability(1), 0.00001); 28 | model.normalize(); 29 | assertEquals(0.25, model.get(1).getTranslationProbability(0), 0.00001); 30 | assertEquals(0.75, model.get(1).getTranslationProbability(1), 0.00001); 31 | 32 | assertEquals(0, model.get(1).getTranslationList().get(0).getWid()); 33 | model.sort(); 34 | assertEquals(1, model.get(1).getTranslationList().get(0).getWid()); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/meta/CompositeFilter.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.meta; 2 | 3 | import java.util.List; 4 | 5 | import net.loomchild.maligna.coretypes.Alignment; 6 | import net.loomchild.maligna.filter.Filter; 7 | 8 | /** 9 | * Represents a pipeline consisting of many filters but behaving as single 10 | * filter. 11 | * Transforms the input by executing all filters in sequence. 12 | * Basically implements composite design pattern. 13 | * 14 | * @author Jarek Lipski (loomchild) 15 | */ 16 | public class CompositeFilter implements Filter { 17 | 18 | private List filterList; 19 | 20 | /** 21 | * Creates composite filter. 
22 | * @param filterList filter list; filters will be applied in the same order 23 | * as they appear on this list 24 | */ 25 | public CompositeFilter(List filterList) { 26 | this.filterList = filterList; 27 | } 28 | 29 | /** 30 | * Applies the composite filter by executing all the configured filters 31 | * is sequence, where output of previous filter is input of the next 32 | * filter. 33 | * @param alignmentList input alignment list 34 | * @return transformed alignment list 35 | */ 36 | public List apply(List alignmentList) { 37 | for (Filter filter : filterList) { 38 | alignmentList = filter.apply(alignmentList); 39 | } 40 | return alignmentList; 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /maligna-ui/examples/align/human/poznan-oracle.al: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Jezioro Malta jest zbiornikiem sztucznym. 6 | Powstało w 1952 r. przez spiętrzenie wód rzeki Cybiny. 7 | 8 | 9 | Der Stausee Malta wurde 1952 durch den Aufstau der Cybina geschaffen. 10 | 11 | 12 | 13 | 14 | Po zakończonej w 1990 r.rekultywacji, jezioro Malta osiągnęło powierzchnię 64,0 ha. 15 | Średnia głębokość akwenu wynosi 3,13 m, ale są miejsca, w których sięga ona 5,0 m. 16 | 17 | 18 | Der Malta - See nimmt eine Fläche von 64 ha ein, seine maximale Tiefe beträgt 5 m (im Bereich der Regattastrecke - 3,7 m). 19 | 20 | 21 | 22 | 23 | Jezioro Rusałka jest sztucznym zbiornikiem wodnym. 24 | 25 | 26 | Der Rusałka - See ist ein künstliches Wasserreservoir. 
27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /maligna-ui/src/main/assembly/bin.xml: -------------------------------------------------------------------------------- 1 | 5 | bin 6 | 7 | zip 8 | 9 | 10 | 11 | ${project.build.directory} 12 | /lib 13 | 14 | *.jar 15 | 16 | 17 | *-javadoc.jar 18 | 19 | 20 | 21 | ${basedir}/src/main/scripts 22 | /bin 23 | 24 | 25 | ${project.basedir}/.. 26 | 27 | README* 28 | LICENSE* 29 | CHANGELOG* 30 | 31 | . 32 | 33 | 34 | ${project.basedir}/examples 35 | examples 36 | 37 | 38 | 39 | 40 | false 41 | /lib 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/selector/DifferenceSelector.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.selector; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import net.loomchild.maligna.comparator.Diff; 7 | import net.loomchild.maligna.coretypes.Alignment; 8 | import net.loomchild.maligna.comparator.Comparator; 9 | import net.loomchild.maligna.filter.Filter; 10 | 11 | /** 12 | * Represents a filter that selects only those alignments from input list 13 | * that are not present in configured reference alingnment (set difference). 14 | * 15 | * @author loomchild 16 | */ 17 | public class DifferenceSelector implements Filter { 18 | 19 | private List rightAlignmentList; 20 | 21 | /** 22 | * Creates difference selector filter. 23 | * @param rightAlignmentList reference alignment list 24 | */ 25 | public DifferenceSelector(List rightAlignmentList) { 26 | this.rightAlignmentList = new ArrayList(rightAlignmentList); 27 | } 28 | 29 | /** 30 | * Filters input alignment leaving only alignments that do not exist in 31 | * configured reference alignment (set difference between alignment lists). 32 | * @param leftAlignmentList Input alignment list. 
33 | * @return List containing selected alignments. 34 | */ 35 | public List apply(List leftAlignmentList) { 36 | Diff diff = Comparator.compare(leftAlignmentList, rightAlignmentList); 37 | return diff.getLeftList(); 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/matrix/FullMatrixIterator.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.matrix; 2 | 3 | import java.util.NoSuchElementException; 4 | 5 | /** 6 | * Represents {@link FullMatrix} iterator. 7 | * 8 | * @author loomchild 9 | * @param matrix data type 10 | */ 11 | public class FullMatrixIterator implements MatrixIterator { 12 | 13 | private FullMatrix matrix; 14 | 15 | private int x, y; 16 | 17 | public FullMatrixIterator(FullMatrix matrix) { 18 | this.matrix = matrix; 19 | beforeFirst(); 20 | } 21 | 22 | public int getX() { 23 | return x; 24 | } 25 | 26 | public int getY() { 27 | return y; 28 | } 29 | 30 | public void beforeFirst() { 31 | x = -1; 32 | y = 0; 33 | } 34 | 35 | public boolean hasNext() { 36 | return !((y >= matrix.getHeight() - 1) && (x >= matrix.getWidth() - 1)); 37 | } 38 | 39 | public void next() { 40 | ++x; 41 | if (x >= matrix.getWidth()) { 42 | ++y; 43 | x = 0; 44 | if (y >= matrix.getHeight()) { 45 | throw new NoSuchElementException(); 46 | } 47 | } 48 | } 49 | 50 | public void afterLast() { 51 | x = matrix.getWidth(); 52 | y = matrix.getHeight() - 1; 53 | } 54 | 55 | public boolean hasPrevious() { 56 | return !((y <= 0) && (x <= 0)); 57 | } 58 | 59 | public void previous() { 60 | --x; 61 | if (x < 0) { 62 | --y; 63 | x = matrix.getWidth() - 1; 64 | if (y < 0) { 65 | throw new NoSuchElementException(); 66 | } 67 | } 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- 
/maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/split/FilterNonWordsSplitAlgorithmDecorator.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.split; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * Represents a split algorithm that splits input segment using 8 | * given algorithm but ignores all punctuation in the output segments. 9 | * To be used together with {@link WordSplitAlgorithm}. 10 | * Decorator design pattern. 11 | * 12 | * @author loomchild 13 | * 14 | */ 15 | public class FilterNonWordsSplitAlgorithmDecorator extends SplitAlgorithm { 16 | 17 | private SplitAlgorithm splitAlgorithm; 18 | 19 | /** 20 | * Creates splitter decorator. 21 | * @param splitAlgorithm split algorithm to be used 22 | */ 23 | public FilterNonWordsSplitAlgorithmDecorator(SplitAlgorithm splitAlgorithm) { 24 | this.splitAlgorithm = splitAlgorithm; 25 | } 26 | 27 | @Override 28 | public List split(String string) { 29 | List segmentList = splitAlgorithm.split(string); 30 | List resultSegmentList = new ArrayList(); 31 | for (String segment : segmentList) { 32 | // Checks whether segment consists only of letters and numbers. 33 | // Assumes that if the first character is a letter or number 34 | // then that's true. 35 | if (Character.isLetterOrDigit(segment.charAt(0))) { 36 | resultSegmentList.add(segment.toLowerCase()); 37 | } 38 | } 39 | return resultSegmentList; 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/split/SplitAlgorithmMock.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.split; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | 7 | /** 8 | * Represents split algorithm mock. 
9 | * Responsible for splitting input segment int susbsegments of given lenght. 10 | * Used for testing. 11 | * 12 | * @author Jarek Lipski (loomchild) 13 | */ 14 | public class SplitAlgorithmMock extends SplitAlgorithm { 15 | 16 | private int charsInSegment; 17 | 18 | /** 19 | * Creates split algorithm splitting input segment into segments of given 20 | * length. 21 | * @param charsInSegment output segment length. 22 | */ 23 | public SplitAlgorithmMock(int charsInSegment) { 24 | assert charsInSegment > 0; 25 | this.charsInSegment = charsInSegment; 26 | } 27 | 28 | /** 29 | * Splits text into segments of given length (the last one can be shorter). 30 | * @param string input segment 31 | * @return output segment list 32 | */ 33 | public List split(String string) { 34 | List segmentList = new ArrayList(); 35 | int start = 0; 36 | for(int end = start + charsInSegment; end < string.length(); 37 | start += charsInSegment, end += charsInSegment) { 38 | String segment = string.substring(start, end); 39 | segmentList.add(segment); 40 | } 41 | String segment = string.substring(start); 42 | segmentList.add(segment); 43 | return segmentList; 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/aligner/align/hmm/viterbi/ViterbiAlgorithmTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.aligner.align.hmm.viterbi; 2 | 3 | import net.loomchild.maligna.calculator.Calculator; 4 | import net.loomchild.maligna.calculator.length.counter.CharCounter; 5 | import net.loomchild.maligna.calculator.length.counter.Counter; 6 | import net.loomchild.maligna.filter.aligner.align.hmm.HmmAlignAlgorithmTest; 7 | import net.loomchild.maligna.matrix.FullMatrixFactory; 8 | import net.loomchild.maligna.calculator.length.NormalDistributionCalculator; 9 | import net.loomchild.maligna.coretypes.CategoryDefaults; 10 | 
import net.loomchild.maligna.matrix.MatrixFactory; 11 | 12 | import org.junit.Before; 13 | 14 | /** 15 | * Represents {@link ViterbiAlgorithm} unit test. 16 | * @author loomchild 17 | * 18 | */ 19 | public class ViterbiAlgorithmTest extends HmmAlignAlgorithmTest { 20 | 21 | private ViterbiAlgorithm algorithm; 22 | 23 | protected ViterbiAlgorithm getAlgorithm() { 24 | return algorithm; 25 | } 26 | 27 | /** 28 | * Constructs algorithm object. It is similar to Gale and Church algorithm. 29 | */ 30 | @Before 31 | public void setUp() { 32 | Counter counter = new CharCounter(); 33 | Calculator calculator = new NormalDistributionCalculator(counter); 34 | MatrixFactory matrixFactory = new FullMatrixFactory(); 35 | 36 | algorithm = new ViterbiAlgorithm(calculator, 37 | CategoryDefaults.BEST_CATEGORY_MAP, matrixFactory); 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/matrix/FullMatrix.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.matrix; 2 | 3 | 4 | /** 5 | * Represents a two-dimensional matrix that contains all the elements. 6 | * This matrix use standard two-dimensional array and occupies 7 | * row number * column number memory, 8 | * which can be quite memory inefficient for sparse matrices. 9 | * On the other hand element access should be fast with this matrix. 10 | * 11 | * @author loomchild 12 | */ 13 | public class FullMatrix implements Matrix { 14 | 15 | private Object[][] dataArray; 16 | 17 | private int width; 18 | 19 | private int height; 20 | 21 | /** 22 | * Creates a matrix. 23 | * @param width matrix width (number of columns), >= 1. 24 | * @param height matrix height (number of rows), >= 1. 
25 | */ 26 | public FullMatrix(int width, int height) { 27 | this.width = width; 28 | this.height = height; 29 | this.dataArray = new Object[width][height]; 30 | } 31 | 32 | public int getWidth() { 33 | return width; 34 | } 35 | 36 | public int getHeight() { 37 | return height; 38 | } 39 | 40 | public int getSize() { 41 | return width * height; 42 | } 43 | 44 | @SuppressWarnings("unchecked") 45 | public T get(int x, int y) { 46 | return (T)dataArray[x][y]; 47 | } 48 | 49 | public void set(int x, int y, T data) { 50 | dataArray[x][y] = data; 51 | } 52 | 53 | public MatrixIterator getIterator() { 54 | return new FullMatrixIterator(this); 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/selector/DifferenceSelectorTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.selector; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import net.loomchild.maligna.coretypes.Alignment; 9 | import net.loomchild.maligna.filter.Filter; 10 | 11 | import org.junit.Test; 12 | 13 | /** 14 | * Represents {@link DifferenceSelector} unit test. 
15 | * @author loomchild 16 | */ 17 | public class DifferenceSelectorTest { 18 | 19 | @Test 20 | public void testSimple() { 21 | 22 | List leftAlignmentList = new ArrayList(); 23 | leftAlignmentList.add(new Alignment(new String[]{"a"}, new String[]{"1"})); 24 | leftAlignmentList.add(new Alignment(new String[]{"b"}, new String[]{})); 25 | leftAlignmentList.add(new Alignment(new String[]{"c"}, new String[]{"3"})); 26 | 27 | List rightAlignmentList = new ArrayList(); 28 | rightAlignmentList.add(new Alignment(new String[]{"a"}, new String[]{"1"})); 29 | rightAlignmentList.add(new Alignment(new String[]{"b"}, new String[]{"2"})); 30 | rightAlignmentList.add(new Alignment(new String[]{"c"}, new String[]{"3"})); 31 | 32 | Filter filter = new DifferenceSelector(rightAlignmentList); 33 | List resultAlignmentList = filter.apply(leftAlignmentList); 34 | 35 | assertEquals(1, resultAlignmentList.size()); 36 | assertEquals(leftAlignmentList.get(1), resultAlignmentList.get(0)); 37 | 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/model/translation/InitialTranslationModel.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.translation; 2 | 3 | import java.io.Writer; 4 | 5 | import net.loomchild.maligna.model.vocabulary.Vocabulary; 6 | 7 | 8 | /** 9 | * Represents initial translation model before first training iteration. 10 | * In this state translation probabilities from any word to any word are 11 | * equal to one. 12 | * This object is immutable. 13 | * 14 | * @author Jarek Lipski (loomchild) 15 | */ 16 | class InitialTranslationModel implements TranslationModel { 17 | 18 | private InitialSourceData translationData;; 19 | 20 | /** 21 | * Creates initial translation model. 
22 | */ 23 | public InitialTranslationModel() { 24 | this.translationData = new InitialSourceData(); 25 | } 26 | 27 | /** 28 | * Always returns {@link InitialSourceData}, which means that for any source 29 | * word it translates to any target word with probability equal to 1. 30 | * @param sourceWid source word id, >= 0. 31 | * @return immutable instance of {@link InitialSourceData} 32 | */ 33 | public SourceData get(int sourceWid) { 34 | assert sourceWid >= 0; 35 | return translationData; 36 | } 37 | 38 | /** 39 | * @throws UnsupportedOperationException always because it is not possible 40 | * to format initial model. 41 | */ 42 | public void format(Writer writer, Vocabulary sourceVocabulary, 43 | Vocabulary targetVocabulary) { 44 | throw new UnsupportedOperationException(); 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /maligna-ui/src/main/java/net/loomchild/maligna/ui/console/command/CommandFactory.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.ui.console.command; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | 8 | public class CommandFactory { 9 | 10 | private static volatile CommandFactory instance; 11 | 12 | private Map commandMap; 13 | 14 | 15 | public static CommandFactory getInstance() 16 | { 17 | if (instance == null) 18 | { 19 | synchronized (CommandFactory.class) 20 | { 21 | if (instance == null) 22 | { 23 | instance = new CommandFactory(); 24 | } 25 | } 26 | } 27 | return instance; 28 | } 29 | 30 | private CommandFactory() 31 | { 32 | initCommandMap(); 33 | } 34 | 35 | private void initCommandMap() 36 | { 37 | commandMap = new HashMap(); 38 | addCommand(new AlignCommand()); 39 | addCommand(new CompareCommand()); 40 | addCommand(new FormatCommand()); 41 | addCommand(new ModelCommand()); 42 | addCommand(new ModifyCommand()); 43 | addCommand(new ParseCommand()); 44 | 
addCommand(new SelectCommand()); 45 | addCommand(new MacroCommand()); 46 | addCommand(new TestCommand()); 47 | } 48 | 49 | private void addCommand(Command command) { 50 | commandMap.put(command.getName(), command); 51 | } 52 | 53 | public Command getCommand(String name) { 54 | return commandMap.get(name); 55 | } 56 | 57 | public Set getCommandNameSet() { 58 | return commandMap.keySet(); 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/selector/IntersectionSelector.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.selector; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import net.loomchild.maligna.comparator.Diff; 7 | import net.loomchild.maligna.coretypes.Alignment; 8 | import net.loomchild.maligna.filter.Filter; 9 | import net.loomchild.maligna.comparator.Comparator; 10 | 11 | /** 12 | * Represents a filter that selects only alignments also occurring in given 13 | * reference alignment list - creating set intersection of reference alignments 14 | * and input alignment. 15 | * 16 | * @author loomchild 17 | */ 18 | public class IntersectionSelector implements Filter { 19 | 20 | private List rightAlignmentList; 21 | 22 | /** 23 | * Creates intersection selector. 24 | * @param rightAlignmentList reference alignment list 25 | */ 26 | public IntersectionSelector(List rightAlignmentList) { 27 | this.rightAlignmentList = new ArrayList(rightAlignmentList); 28 | } 29 | 30 | /** 31 | * Filters input alignment leaving only alignments that exist in 32 | * configured reference alignment (set intersection between alignment lists). 33 | * @param leftAlignmentList Input alignment list. 34 | * @return List containing selected alignments. 
35 | */ 36 | public List apply(List leftAlignmentList) { 37 | Diff diff = Comparator.compare(leftAlignmentList, rightAlignmentList); 38 | return diff.getCommonList(); 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/aligner/align/AlignAlgorithm.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.aligner.align; 2 | 3 | import java.util.List; 4 | 5 | import net.loomchild.maligna.coretypes.Alignment; 6 | import net.loomchild.maligna.coretypes.Category; 7 | import net.loomchild.maligna.filter.aligner.AlignmentImpossibleException; 8 | 9 | /** 10 | * Represents alignment algorithm. 11 | * 12 | * @author Jarek Lipski (loomchild) 13 | */ 14 | public interface AlignAlgorithm { 15 | 16 | /** 17 | * Aligns source segment list with target segment list and returns a 18 | * list of alignments. All segments on the input list will be 19 | * present in resulting alignment in the same order as they were present 20 | * on input lists. 21 | * Alignments can be, one-to-zero, one-to-one, 22 | * many-to-zero, many-to-one, many-to-many (see {@link Category} for lists 23 | * of alignment categories for different aligners). 24 | * If both lists are empty returns empty list. If one of the lists is 25 | * empty returns only many-to-zero alignments (all-to-zero if possible). 26 | * 27 | * @param sourceSegmentList source segment list 28 | * @param targetSegmentList target segment list 29 | * @return alignment list containing all segments. 
30 | * @throws AlignmentImpossibleException when it is impossible to align 31 | * given segments using this aligner 32 | */ 33 | public List align(List sourceSegmentList, 34 | List targetSegmentList); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/selector/ProbabilitySelector.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.selector; 2 | 3 | import static net.loomchild.maligna.util.Util.toScore; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import net.loomchild.maligna.coretypes.Alignment; 9 | import net.loomchild.maligna.filter.Filter; 10 | 11 | 12 | /** 13 | * Selects alignments with probability equal or greater than given threshold. 14 | * 15 | * @author Jarek Lipski (loomchild) 16 | */ 17 | public class ProbabilitySelector implements Filter { 18 | 19 | private double scoreThreshold; 20 | 21 | /** 22 | * Creates selector. 23 | * @param probabilityThreshold Minimum accepted alignment probability. 24 | * From range [0,1]. 25 | */ 26 | public ProbabilitySelector(double probabilityThreshold) { 27 | assert probabilityThreshold >= 0.0f && probabilityThreshold <= 1.0f; 28 | this.scoreThreshold = toScore(probabilityThreshold); 29 | } 30 | 31 | /** 32 | * Selects alignments with probability equal or greater than threshold. 
33 | * @param alignmentList input alignment list 34 | * @return list containing selected alignments 35 | */ 36 | public List apply(List alignmentList) { 37 | List selectedAlignmentList = new ArrayList(); 38 | for (Alignment alignment : alignmentList) { 39 | if (alignment.getScore() <= scoreThreshold) { 40 | selectedAlignmentList.add(alignment); 41 | } 42 | } 43 | return selectedAlignmentList; 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/clean/CleanAlgorithm.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.clean; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import net.loomchild.maligna.filter.modifier.modify.ModifyAlgorithm; 7 | 8 | 9 | /** 10 | * Represents modify algorithm that cleans input segment list from useless 11 | * segments or characters inside segments. 12 | * @author loomchild 13 | */ 14 | public abstract class CleanAlgorithm implements ModifyAlgorithm { 15 | 16 | /** 17 | * Modifies each individual segment by calling {@link #clean(String)} 18 | * method (implemented by this class subclasses) for it. Stores the 19 | * results in output list, ignoring a segment when {@link #clean(String)} 20 | * returns null. 21 | * @param segmentList source segment list 22 | * @return cleaned segment list 23 | */ 24 | public List modify(List segmentList) { 25 | List newSegmentList = new ArrayList(); 26 | for (String segment : segmentList) { 27 | String newSegment = clean(segment); 28 | if (newSegment != null) { 29 | newSegmentList.add(newSegment); 30 | } 31 | } 32 | return newSegmentList; 33 | } 34 | 35 | /** 36 | * Modifies single individual segment. If returns null the segment is 37 | * removed from resulting list. 
38 | * @param segment Sinput segment 39 | * @return modified segment or null if it should be removed from the result 40 | */ 41 | public abstract String clean(String segment); 42 | 43 | } 44 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/aligner/align/hmm/viterbi/ViterbiData.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.aligner.align.hmm.viterbi; 2 | 3 | import net.loomchild.maligna.coretypes.Category; 4 | 5 | /** 6 | * Represents alignment data type stored in a matrix by {@link ViterbiAlgorithm}. 7 | * Includes current alignment category, its score and cumulative score of 8 | * all alignments scores from best path leading to this alignment including 9 | * its score. 10 | * 11 | * @author loomchild 12 | */ 13 | public class ViterbiData { 14 | 15 | private Category category; 16 | 17 | private float score; 18 | 19 | private float totalScore; 20 | 21 | /** 22 | * Creates data. 
23 | * @param category category of an alignment 24 | * @param score score of this alignment 25 | * @param totalScore total score of this alignment including all previous 26 | * alignments on the path 27 | */ 28 | public ViterbiData(Category category, 29 | float score, float totalScore) { 30 | this.category = category; 31 | this.score = score; 32 | this.totalScore = totalScore; 33 | } 34 | 35 | /** 36 | * @return this alignment score 37 | */ 38 | public float getScore() { 39 | return score; 40 | } 41 | 42 | /** 43 | * @return total score of this alignment including all previous alignments 44 | * on the path 45 | */ 46 | public float getTotalScore() { 47 | return totalScore; 48 | } 49 | 50 | /** 51 | * @return this alignment category 52 | */ 53 | public Category getCategory() { 54 | return category; 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/parser/PlaintextParserTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.parser; 2 | 3 | import static net.loomchild.maligna.util.TestUtil.assertAlignmentListEquals; 4 | 5 | import java.io.Reader; 6 | import java.io.StringReader; 7 | import java.util.List; 8 | 9 | import net.loomchild.maligna.coretypes.Alignment; 10 | 11 | import org.junit.Test; 12 | 13 | /** 14 | * Represents {@link PlaintextParser} unit test. 
15 | * @author loomchild 16 | */ 17 | public class PlaintextParserTest { 18 | 19 | public static final String SOURCE_STRING = "aaabbb"; 20 | 21 | public static final String TARGET_STRING = "1122"; 22 | 23 | public static final String[][] SOURCE_SEGMENT_ARRAY = { 24 | new String[] {SOURCE_STRING}, 25 | }; 26 | 27 | public static final String[][] TARGET_SEGMENT_ARRAY = { 28 | new String[] {TARGET_STRING}, 29 | }; 30 | 31 | @Test 32 | public void parseString() { 33 | Parser parser = new PlaintextParser(SOURCE_STRING, TARGET_STRING); 34 | List alignmentList = parser.parse(); 35 | assertAlignmentListEquals(SOURCE_SEGMENT_ARRAY, TARGET_SEGMENT_ARRAY, 36 | alignmentList); 37 | } 38 | 39 | @Test 40 | public void parseReader() { 41 | Reader sourceReader = new StringReader(SOURCE_STRING); 42 | Reader targetReader = new StringReader(TARGET_STRING); 43 | Parser parser = new PlaintextParser(sourceReader, targetReader); 44 | List alignmentList = parser.parse(); 45 | assertAlignmentListEquals(SOURCE_SEGMENT_ARRAY, TARGET_SEGMENT_ARRAY, 46 | alignmentList); 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/util/bind/AlMarshallerUnmarshaller.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.util.bind; 2 | 3 | import java.io.Reader; 4 | import java.io.Writer; 5 | 6 | import net.loomchild.maligna.util.bind.al.Alignmentlist; 7 | 8 | /** 9 | * Represents XML marshaller / unmarshaller of .al native format. 10 | * Uses {@link MarshallerUnmarshaller}. Singleton. 
11 | * 12 | * @author loomchild 13 | */ 14 | public class AlMarshallerUnmarshaller { 15 | 16 | public static final String CONTEXT = "net.loomchild.maligna.util.bind.al"; 17 | 18 | public static final String SCHEMA = "net/loomchild/maligna/res/xml/al.xsd"; 19 | 20 | private static volatile AlMarshallerUnmarshaller instance; 21 | 22 | private MarshallerUnmarshaller marshallerUnmarshaller; 23 | 24 | /** 25 | * @return singleton instance 26 | */ 27 | public static AlMarshallerUnmarshaller getInstance() { 28 | if (instance == null) { 29 | synchronized (AlMarshallerUnmarshaller.class) { 30 | if (instance == null) { 31 | instance = new AlMarshallerUnmarshaller(); 32 | } 33 | } 34 | } 35 | return instance; 36 | } 37 | 38 | private AlMarshallerUnmarshaller() { 39 | this.marshallerUnmarshaller = new MarshallerUnmarshaller(CONTEXT, 40 | SCHEMA); 41 | } 42 | 43 | public void marshal(Alignmentlist al, Writer writer) { 44 | marshallerUnmarshaller.marshal(al, writer); 45 | } 46 | 47 | public Alignmentlist unmarshal(Reader reader) { 48 | return (Alignmentlist)marshallerUnmarshaller.unmarshal(reader); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/selector/IntersectionSelectorTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.selector; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import net.loomchild.maligna.coretypes.Alignment; 9 | import net.loomchild.maligna.filter.Filter; 10 | 11 | import org.junit.Test; 12 | 13 | /** 14 | * Represents {@link IntersectionSelector} unit test. 
15 | * @author loomchild 16 | */ 17 | public class IntersectionSelectorTest { 18 | 19 | @Test 20 | public void testSimple() { 21 | 22 | List leftAlignmentList = new ArrayList(); 23 | leftAlignmentList.add(new Alignment(new String[]{"a"}, new String[]{"1"})); 24 | leftAlignmentList.add(new Alignment(new String[]{"b"}, new String[]{})); 25 | leftAlignmentList.add(new Alignment(new String[]{"c"}, new String[]{"3"})); 26 | 27 | List rightAlignmentList = new ArrayList(); 28 | rightAlignmentList.add(new Alignment(new String[]{"a"}, new String[]{"1"})); 29 | rightAlignmentList.add(new Alignment(new String[]{"b"}, new String[]{"2"})); 30 | rightAlignmentList.add(new Alignment(new String[]{"c"}, new String[]{"3"})); 31 | 32 | Filter filter = new IntersectionSelector(rightAlignmentList); 33 | List resultAlignmentList = filter.apply(leftAlignmentList); 34 | 35 | assertEquals(2, resultAlignmentList.size()); 36 | assertEquals(rightAlignmentList.get(0), resultAlignmentList.get(0)); 37 | assertEquals(rightAlignmentList.get(2), resultAlignmentList.get(1)); 38 | 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/aligner/align/hmm/fb/ForwardBackwardAlgorithmTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.aligner.align.hmm.fb; 2 | 3 | import net.loomchild.maligna.calculator.Calculator; 4 | import net.loomchild.maligna.calculator.length.counter.CharCounter; 5 | import net.loomchild.maligna.calculator.length.counter.Counter; 6 | import net.loomchild.maligna.coretypes.CategoryDefaults; 7 | import net.loomchild.maligna.filter.aligner.align.hmm.HmmAlignAlgorithmTest; 8 | import net.loomchild.maligna.matrix.FullMatrixFactory; 9 | import net.loomchild.maligna.matrix.MatrixFactory; 10 | import net.loomchild.maligna.calculator.length.NormalDistributionCalculator; 11 | 12 | import org.junit.Before; 13 | 14 | 
/** 15 | * Represents {@link ForwardBackwardAlgorithm} unit test. 16 | * @author loomchild 17 | * 18 | */ 19 | public class ForwardBackwardAlgorithmTest extends HmmAlignAlgorithmTest { 20 | 21 | private ForwardBackwardAlgorithm algorithm; 22 | 23 | protected ForwardBackwardAlgorithm getAlgorithm() { 24 | return algorithm; 25 | } 26 | 27 | /** 28 | * Constructs algorithm object. It is similar to Gale and Church algorithm 29 | * but based on Forward Backward method instead of Viterbi method. 30 | */ 31 | @Before 32 | public void setUp() { 33 | Counter counter = new CharCounter(); 34 | Calculator calculator = new NormalDistributionCalculator(counter); 35 | MatrixFactory matrixFactory = new FullMatrixFactory(); 36 | 37 | algorithm = new ForwardBackwardAlgorithm(calculator, 38 | CategoryDefaults.BEST_CATEGORY_MAP, matrixFactory); 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/merge/SeparatorMergeAlgorithm.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.merge; 2 | 3 | import java.util.Iterator; 4 | import java.util.List; 5 | 6 | /** 7 | * Represents and algorithm merging a list of segments into one segment by 8 | * concatenating them. It can insert given separator string between segments. 9 | * 10 | * @author Jarek Lipski (loomchild) 11 | */ 12 | public class SeparatorMergeAlgorithm extends MergeAlgorithm { 13 | 14 | /** 15 | * Default segment separator. 16 | */ 17 | public static final String DEFAULT_SEPARATOR = ""; 18 | 19 | private String separator; 20 | 21 | /** 22 | * Creates merge algorithm. 23 | * @param separator separator 24 | */ 25 | public SeparatorMergeAlgorithm(String separator) { 26 | this.separator = separator; 27 | } 28 | 29 | /** 30 | * Creates merge algoruthm with {@link #DEFAULT_SEPARATOR}. 
31 | */ 32 | public SeparatorMergeAlgorithm() { 33 | this(DEFAULT_SEPARATOR); 34 | } 35 | 36 | /** 37 | * Merges list of segments into one segment by concatenating them and 38 | * inserting separator between. 39 | * @param segmentList input segment list 40 | * @return merged segment 41 | */ 42 | public String merge(List segmentList) { 43 | StringBuilder builder = new StringBuilder(); 44 | Iterator i = segmentList.iterator(); 45 | while (i.hasNext()) { 46 | String segment = i.next(); 47 | builder.append(segment); 48 | if (i.hasNext()) { 49 | builder.append(separator); 50 | } 51 | } 52 | return builder.toString(); 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/aligner/Aligner.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.aligner; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import net.loomchild.maligna.coretypes.Alignment; 7 | import net.loomchild.maligna.filter.Filter; 8 | import net.loomchild.maligna.filter.aligner.align.AlignAlgorithm; 9 | 10 | 11 | /** 12 | * Represents aligner - for each alignment on input list aligns source segments 13 | * with target segments and appends obtained list of alignments to the result. 14 | * This implies that resulting list can have more alignments than input list 15 | * but cannot have less. Does not change alignment contents. 16 | * 17 | * @author Jarek Lipski (loomchild) 18 | */ 19 | public class Aligner implements Filter { 20 | 21 | private AlignAlgorithm algorithm; 22 | 23 | public Aligner(AlignAlgorithm algorithm) { 24 | this.algorithm = algorithm; 25 | } 26 | 27 | /** 28 | * For each alignment on input list aligns source segments with target 29 | * segments, and appends the obtained alignment list to the result. 
30 | * @throws AlignmentImpossibleException when it is not possible to align 31 | * texts 32 | */ 33 | public List apply(List alignmentList) { 34 | List newAlignmentList = new ArrayList(); 35 | for (Alignment alignment : alignmentList) { 36 | List currentAlignmentList = 37 | algorithm.align(alignment.getSourceSegmentList(), 38 | alignment.getTargetSegmentList()); 39 | newAlignmentList.addAll(currentAlignmentList); 40 | } 41 | return newAlignmentList; 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/coretypes/CategoryDefaults.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.coretypes; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | import net.loomchild.maligna.util.Util; 7 | 8 | /** 9 | * Responsible for storing default categories with scores (-ln probability) 10 | * of occurrence of alignment of this category measured experimentally in some 11 | * test corpus. 
12 | */ 13 | public class CategoryDefaults { 14 | 15 | public static final Map BEST_CATEGORY_MAP = 16 | createBestCategoryMap(); 17 | 18 | private static Map createBestCategoryMap() { 19 | Map map = new HashMap(); 20 | 21 | map.put(new Category(1, 1), (float)Util.toScore(0.9)); 22 | map.put(new Category(1, 0), (float)Util.toScore(0.005)); 23 | map.put(new Category(0, 1), (float)Util.toScore(0.005)); 24 | map.put(new Category(2, 1), (float)Util.toScore(0.045)); 25 | map.put(new Category(1, 2), (float)Util.toScore(0.045)); 26 | 27 | return map; 28 | } 29 | 30 | public static final Map MOORE_CATEGORY_MAP = 31 | createMooreCategoryMap(); 32 | 33 | private static Map createMooreCategoryMap() { 34 | Map map = new HashMap(); 35 | 36 | map.put(new Category(1, 1), (float)Util.toScore(0.94)); 37 | map.put(new Category(1, 0), (float)Util.toScore(0.01)); 38 | map.put(new Category(0, 1), (float)Util.toScore(0.01)); 39 | map.put(new Category(2, 1), (float)Util.toScore(0.02)); 40 | map.put(new Category(1, 2), (float)Util.toScore(0.02)); 41 | 42 | return map; 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/parser/AlParserTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.parser; 2 | 3 | import static net.loomchild.maligna.util.TestUtil.assertAlignmentListEquals; 4 | import static net.loomchild.maligna.util.Util.getReader; 5 | import static net.loomchild.maligna.util.Util.getResourceStream; 6 | 7 | import java.io.InputStream; 8 | import java.io.Reader; 9 | import java.util.List; 10 | 11 | import net.loomchild.maligna.coretypes.Alignment; 12 | 13 | import org.junit.Test; 14 | 15 | /** 16 | * Represents {@link AlParser} unit test. 
17 | * @author loomchild 18 | */ 19 | public class AlParserTest { 20 | 21 | public static final String FILE = "net/loomchild/maligna/res/test/simpletext.al"; 22 | 23 | public static final String[][] SOURCE_SEGMENT_ARRAY = { 24 | new String[] {"First sentence. ", "Second sentence."}, 25 | new String[] {}, 26 | new String[] {}, 27 | }; 28 | 29 | public static final String[][] TARGET_SEGMENT_ARRAY = { 30 | new String[] {"Pierwsze zdanie."}, 31 | new String[] {"Drugie zdanie."}, 32 | new String[] {}, 33 | }; 34 | 35 | /** 36 | * Test whether {@link AlParser} is able to parse a test file 37 | * stored in {@value #FILE} into {@link #SOURCE_SEGMENT_ARRAY} and 38 | * {@link #TARGET_SEGMENT_ARRAY}. 39 | * @throws Exception 40 | */ 41 | @Test 42 | public void parse() throws Exception { 43 | InputStream inputStream = getResourceStream(FILE); 44 | Reader reader = getReader(inputStream); 45 | AlParser parser = new AlParser(reader); 46 | List alignmentList = parser.parse(); 47 | assertAlignmentListEquals(SOURCE_SEGMENT_ARRAY, TARGET_SEGMENT_ARRAY, 48 | alignmentList); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/model/translation/MutableSourceDataTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.translation; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.List; 6 | 7 | import org.junit.Test; 8 | 9 | /** 10 | * Represents {@link MutableSourceData} unit test. 11 | * @author loomchild 12 | */ 13 | public class MutableSourceDataTest { 14 | 15 | /** 16 | * Check if {@link MutableSourceData#normalize()} and 17 | * {@link MutableSourceData#sort()} work as expected. 
18 | */ 19 | @Test 20 | public void getPutSortNormalize() { 21 | MutableSourceData data = new MutableSourceData(); 22 | assertEquals(0, data.getTranslationList().size()); 23 | assertEquals(0, data.getTranslationProbability(0), 0.000001); 24 | data.setTranslationProbability(0, 0.6f); 25 | assertEquals(0.6, data.getTranslationProbability(0), 0.000001); 26 | data.setTranslationProbability(1, 1.0f); 27 | assertEquals(1.0, data.getTranslationProbability(1), 0.000001); 28 | data.setTranslationProbability(2, 0.4f); 29 | assertEquals(0.4, data.getTranslationProbability(2), 0.000001); 30 | data.normalize(); 31 | data.sort(); 32 | List targetList = data.getTranslationList(); 33 | assertEquals(3, targetList.size()); 34 | TargetData target = targetList.get(0); 35 | assertEquals(0.5, target.getProbability(), 0.000001); 36 | assertEquals(1, target.getWid()); 37 | target = targetList.get(1); 38 | assertEquals(0.3, target.getProbability(), 0.000001); 39 | assertEquals(0, target.getWid()); 40 | target = targetList.get(2); 41 | assertEquals(0.2, target.getProbability(), 0.000001); 42 | assertEquals(2, target.getWid()); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/modifier/modify/split/WordSplitAlgorithmTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.split; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.List; 6 | 7 | import org.junit.Test; 8 | 9 | 10 | /** 11 | * Represents {@link WordSplitAlgorithm} unit test. 
12 | * @author loomchild 13 | */ 14 | public class WordSplitAlgorithmTest { 15 | 16 | public static final String SPACE = " ab\t 9\net "; 17 | 18 | public static final String[] EXPECTED_SPACE = 19 | new String[] {"ab", "9", "et"}; 20 | 21 | /** 22 | * Checks if splitting on whitespace works as expected and that whitespace 23 | * characters are removed from the output. 24 | */ 25 | @Test 26 | public void splitSpace() { 27 | WordSplitAlgorithm splitter = new WordSplitAlgorithm(); 28 | List wordList = splitter.split(SPACE); 29 | String[] wordArray = wordList.toArray(new String[wordList.size()]); 30 | assertEquals(EXPECTED_SPACE, wordArray); 31 | } 32 | 33 | public static final String PUNCTUATION = 34 | "1. Ja, niżej podpisan(I'm \"batman01\")."; 35 | 36 | public static final String[] EXPECTED_PUNCTUATION = new String[] { 37 | "1", ".", "Ja", ",", "niżej", "podpisan", "(", "I", "'", "m", 38 | "\"", "batman01", "\"", ")", "."}; 39 | 40 | /** 41 | * Checks if splitting after punctuation characters works as expected. 
42 | */ 43 | @Test 44 | public void splitPunctuation() { 45 | WordSplitAlgorithm splitter = new WordSplitAlgorithm(); 46 | List wordList = splitter.split(PUNCTUATION); 47 | String[] wordArray = wordList.toArray(new String[wordList.size()]); 48 | assertEquals(EXPECTED_PUNCTUATION, wordArray); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/calculator/length/PoissonDistributionCalculatorTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.calculator.length; 2 | 3 | import static net.loomchild.maligna.calculator.length.PoissonDistributionCalculator.factorial; 4 | import static net.loomchild.maligna.calculator.length.PoissonDistributionCalculator.poissonDistribution; 5 | import static org.junit.Assert.assertEquals; 6 | 7 | import org.junit.Test; 8 | 9 | /** 10 | * Represents {@link PoissonDistributionCalculator} unit test. 11 | * @author loomchild 12 | */ 13 | public class PoissonDistributionCalculatorTest { 14 | 15 | /** 16 | * Tests factorial calculation by 17 | * {@link PoissonDistributionCalculator#factorial(int)}. 18 | */ 19 | @Test 20 | public void testFactorial() { 21 | assertEquals(1.0f, Math.exp(factorial(1)), 0.01f); 22 | assertEquals(2.0f, Math.exp(factorial(2)), 0.01f); 23 | assertEquals(6.0f, Math.exp(factorial(3)), 0.01f); 24 | assertEquals(24.0f, Math.exp(factorial(4)), 0.01f); 25 | } 26 | 27 | /** 28 | * Tests Poisson distribution points calculation by 29 | * {@link PoissonDistributionCalculator#poissonDistribution(float, int)} 30 | * using some manually calculated values. 
31 | */ 32 | @Test 33 | public void testPoissonDistribution() { 34 | assertEquals(0.6065f, Math.exp(-poissonDistribution(0.5f, 0)), 0.0001f); 35 | assertEquals(0.3679f, Math.exp(-poissonDistribution(1.0f, 0)), 0.0001f); 36 | assertEquals(0.3679f, Math.exp(-poissonDistribution(1.0f, 1)), 0.0001f); 37 | assertEquals(0.1839f, Math.exp(-poissonDistribution(1.0f, 2)), 0.0001f); 38 | assertEquals(0.2707f, Math.exp(-poissonDistribution(2.0f, 1)), 0.0001f); 39 | assertEquals(0.1805f, Math.exp(-poissonDistribution(2.0f, 3)), 0.0001f); 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/util/bind/TmxMarshallerUnmarshaller.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.util.bind; 2 | 3 | import java.io.Reader; 4 | import java.io.Writer; 5 | 6 | import net.loomchild.maligna.util.bind.tmx.Tmx; 7 | 8 | /** 9 | * Represents XML marshaller / unmarshaller of .tmx format. 10 | * Uses {@link MarshallerUnmarshaller}. Singleton. 11 | * 12 | * @author loomchild 13 | */ 14 | public class TmxMarshallerUnmarshaller { 15 | 16 | public static final String CONTEXT = "net.loomchild.maligna.util.bind.tmx"; 17 | 18 | /** 19 | * List of XML schemas used for validation. 20 | * Order of appearance on this array is important! 
21 | */ 22 | public static final String[] SCHEMA_ARRAY = new String[] { 23 | "net/loomchild/maligna/res/xml/xml.xsd", "net/loomchild/maligna/res/xml/tmx.xsd" 24 | }; 25 | 26 | private static volatile TmxMarshallerUnmarshaller instance; 27 | 28 | private MarshallerUnmarshaller marshallerUnmarshaller; 29 | 30 | /** 31 | * @return singleton instance, created lazily on the first call 32 | */ 33 | public static TmxMarshallerUnmarshaller getInstance() { 34 | if (instance == null) { 35 | synchronized (TmxMarshallerUnmarshaller.class) { 36 | if (instance == null) { 37 | instance = new TmxMarshallerUnmarshaller(); 38 | } 39 | } 40 | } 41 | return instance; 42 | } 43 | /** Private constructor; builds the underlying {@link MarshallerUnmarshaller} from {@link #CONTEXT} and {@link #SCHEMA_ARRAY}. */ 44 | private TmxMarshallerUnmarshaller() { 45 | this.marshallerUnmarshaller = new MarshallerUnmarshaller(CONTEXT, 46 | SCHEMA_ARRAY); 47 | } 48 | /** Writes the given TMX object to the writer by delegating to the underlying {@link MarshallerUnmarshaller}. */ 49 | public void marshal(Tmx tmx, Writer writer) { 50 | marshallerUnmarshaller.marshal(tmx, writer); 51 | } 52 | /** Reads a document from the reader via the underlying {@link MarshallerUnmarshaller} and casts the result to {@link Tmx}. */ 53 | public Tmx unmarshal(Reader reader) { 54 | return (Tmx)marshallerUnmarshaller.unmarshal(reader); 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/model/language/LanguageModelUtilTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.language; 2 | 3 | import static net.loomchild.maligna.model.Util.createWidList; 4 | import static org.junit.Assert.assertEquals; 5 | 6 | import java.io.StringReader; 7 | import java.util.List; 8 | 9 | import org.junit.Test; 10 | 11 | /** 12 | * Represents {@link LanguageModelUtil} unit test. 13 | * @author loomchild 14 | */ 15 | public class LanguageModelUtilTest { 16 | 17 | public String LANGUAGE_MODEL = 18 | "3 2\n" + 19 | "1 3\n" + 20 | "2 0\n" + 21 | " "; 22 | 23 | /** 24 | * Tests if model training produces expected probabilities. 
25 | */ 26 | @Test 27 | public void train() { 28 | int[][] widArray = new int[][] { 29 | new int[] {1, 2, 1}, new int[] {1}, new int[] {2}, new int[] {}, 30 | }; 31 | List< List > widList = createWidList(widArray); 32 | LanguageModel model = LanguageModelUtil.train(widList); 33 | assertEquals(0.6f, model.getWordProbability(1), 0.01f); 34 | assertEquals(0.4f, model.getWordProbability(2), 0.01f); 35 | assertEquals(0.0f, model.getWordProbability(0), 0.01f); 36 | assertEquals(0.2f, model.getSingletonWordProbability(), 0.01f); 37 | } 38 | 39 | /** 40 | * Tests if parsing of simple test model {@link #LANGUAGE_MODEL} 41 | * works as expected. 42 | */ 43 | @Test 44 | public void testParse() { 45 | StringReader reader = new StringReader(LANGUAGE_MODEL); 46 | LanguageModel languageModel = LanguageModelUtil.parse(reader); 47 | assertEquals(0.2f, languageModel.getSingletonWordProbability(), 0.0001f); 48 | assertEquals(0.6f, languageModel.getWordProbability(1), 0.0001f); 49 | assertEquals(0.0f, languageModel.getWordProbability(2), 0.0001f); 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/split/SentenceSplitAlgorithm.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.split; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | import java.io.StringReader; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | import net.loomchild.maligna.util.ImpossibleException; 10 | 11 | 12 | /** 13 | *

Represents simple sentence splitter using hardcoded rules.

14 | * 15 | *

Splitting does not omit any characters. Uses {@link SimpleSplitter}.

16 | * 17 | *

For more accurate sentence segmentation see {@link SrxSplitAlgorithm}.

18 | * 19 | * @author Jarek Lipski (loomchild) 20 | */ 21 | public class SentenceSplitAlgorithm extends SplitAlgorithm { 22 | 23 | /** 24 | * Splits input segment to a list of sentences. Splitting occurs 25 | * after end-of-line character and after end of sentence character (.?!), 26 | * if the next character is capital letter. 27 | * 28 | * @param string input segment 29 | * @return list of sentences 30 | */ 31 | public List split(String string) { 32 | Reader stringReader = new StringReader(string); 33 | List segmentList = null; 34 | try { 35 | segmentList = split(stringReader); 36 | } catch (IOException e) { 37 | throw new ImpossibleException("IOException reading StringReader", e); 38 | } 39 | return segmentList; 40 | } 41 | /** Reads all segments produced by a {@link SimpleSplitter} over the given reader and returns them in iteration order. */ 42 | private List split(Reader reader) throws IOException { 43 | List segmentList = new ArrayList(); 44 | SimpleSplitter splitter = new SimpleSplitter(reader); 45 | while(splitter.hasNext()) { 46 | String segment = splitter.next(); 47 | segmentList.add(segment); 48 | } 49 | return segmentList; 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/selector/OneToOneSelectorTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.selector; 2 | 3 | import static net.loomchild.maligna.util.TestUtil.assertAlignmentListEquals; 4 | import static net.loomchild.maligna.util.TestUtil.createAlignmentList; 5 | import static net.loomchild.maligna.util.TestUtil.filterSegmentArray; 6 | 7 | import java.util.List; 8 | 9 | import net.loomchild.maligna.coretypes.Alignment; 10 | import net.loomchild.maligna.filter.Filter; 11 | 12 | import org.junit.Test; 13 | 14 | /** 15 | * Represents {@link OneToOneSelector} unit test. 
16 | * @author loomchild 17 | */ 18 | public class OneToOneSelectorTest { 19 | 20 | public static final String[][] SOURCE_ARRAY = new String[][] { 21 | new String[] {"aa", "bb"}, 22 | new String[] {}, 23 | new String[] {"cc"}, 24 | new String[] {}, 25 | new String[] {"dd"}, 26 | new String[] {"ee", "ff"}, 27 | }; 28 | 29 | public static final String[][] TARGET_ARRAY = new String[][] { 30 | new String[] {"11"}, 31 | new String[] {"22"}, 32 | new String[] {"33"}, 33 | new String[] {}, 34 | new String[] {"44"}, 35 | new String[] {"55", "66"}, 36 | }; 37 | 38 | public static final int[] RESULT_INDEXES = new int[] { 39 | 2, 4 40 | }; 41 | 42 | /** 43 | * Checks if selector leaves only and all one to one alignments. 44 | */ 45 | @Test 46 | public void compare() { 47 | List alignmentList = createAlignmentList( 48 | SOURCE_ARRAY, TARGET_ARRAY); 49 | Filter filter = new OneToOneSelector(); 50 | List resultAlignmentList = filter.apply(alignmentList); 51 | assertAlignmentListEquals( 52 | filterSegmentArray(SOURCE_ARRAY, RESULT_INDEXES), 53 | filterSegmentArray(TARGET_ARRAY, RESULT_INDEXES), 54 | resultAlignmentList); 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/formatter/AlFormatterTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.formatter; 2 | 3 | import static net.loomchild.maligna.util.TestUtil.assertAlignmentListEquals; 4 | import static net.loomchild.maligna.util.TestUtil.createAlignmentList; 5 | 6 | import java.io.Reader; 7 | import java.io.StringReader; 8 | import java.io.StringWriter; 9 | import java.util.List; 10 | 11 | import net.loomchild.maligna.coretypes.Alignment; 12 | import net.loomchild.maligna.parser.AlParser; 13 | 14 | import org.junit.Test; 15 | 16 | /** 17 | * Represents {@link AlFormatter} unit test. 
18 | * @author loomchild 19 | */ 20 | public class AlFormatterTest { 21 | 22 | public static final String[][] SOURCE_ARRAY = new String[][]{ 23 | new String[]{"Ala ma kota kot ma\tale nie wie.\nDrugie.", 24 | "Burza mózgów zawsze dobrze robi."}, 25 | new String[]{}, 26 | new String[]{}, 27 | }; 28 | 29 | public static final String[][] TARGET_ARRAY = new String[][]{ 30 | new String[]{"Wasserreservoir, Wasserreservoir..."}, 31 | new String[]{}, 32 | new String[]{"Immer nur Wasser"}, 33 | }; 34 | 35 | /** 36 | * Tests whether alignment formatted by {@link AlFormatter} can be 37 | * successfully parsed by {@link AlParser}. 38 | */ 39 | @Test 40 | public void testFormatParse() { 41 | List alignmentList = createAlignmentList(SOURCE_ARRAY, 42 | TARGET_ARRAY); 43 | StringWriter writer = new StringWriter(); 44 | Formatter formatter = new AlFormatter(writer); 45 | formatter.format(alignmentList); 46 | Reader reader = new StringReader(writer.toString()); 47 | AlParser parser = new AlParser(reader); 48 | List resultAlignmentList = parser.parse(); 49 | assertAlignmentListEquals(SOURCE_ARRAY, TARGET_ARRAY, 50 | resultAlignmentList); 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/macro/GaleAndChurchMacro.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.macro; 2 | 3 | import java.util.List; 4 | 5 | import net.loomchild.maligna.calculator.Calculator; 6 | import net.loomchild.maligna.calculator.length.counter.CharCounter; 7 | import net.loomchild.maligna.calculator.length.counter.Counter; 8 | import net.loomchild.maligna.coretypes.Alignment; 9 | import net.loomchild.maligna.filter.Filter; 10 | import net.loomchild.maligna.filter.aligner.Aligner; 11 | import net.loomchild.maligna.filter.aligner.align.hmm.HmmAlignAlgorithmFactory; 12 | import 
net.loomchild.maligna.filter.aligner.align.hmm.adaptive.AdaptiveBandAlgorithm; 13 | import net.loomchild.maligna.calculator.length.NormalDistributionCalculator; 14 | import net.loomchild.maligna.filter.aligner.align.AlignAlgorithm; 15 | import net.loomchild.maligna.filter.aligner.align.hmm.viterbi.ViterbiAlgorithmFactory; 16 | 17 | /** 18 | * Represents macro to align a text using Gale and Church algorithm. 19 | * Actual implementation can be slightly different but the result should be 20 | * very similar. 21 | * 22 | * @see "A Program for Aligning Sentences in Bilingual Corpora, 23 | * Gale, W.A., Church, K.W." 24 | * @author loomchild 25 | */ 26 | public class GaleAndChurchMacro implements Macro { 27 | 28 | public List apply(List alignmentList) { 29 | 30 | Counter counter = new CharCounter(); 31 | Calculator calculator = new NormalDistributionCalculator(counter); 32 | 33 | HmmAlignAlgorithmFactory algorithmFactory = 34 | new ViterbiAlgorithmFactory(); 35 | 36 | AlignAlgorithm algorithm = 37 | new AdaptiveBandAlgorithm(algorithmFactory, calculator); 38 | 39 | Filter filter = new Aligner(algorithm); 40 | 41 | return filter.apply(alignmentList); 42 | 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/split/SrxSplitAlgorithm.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.split; 2 | 3 | 4 | import java.io.Reader; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import net.loomchild.segment.TextIterator; 9 | import net.loomchild.segment.srx.SrxDocument; 10 | import net.loomchild.segment.srx.SrxParser; 11 | import net.loomchild.segment.srx.SrxTextIterator; 12 | import net.loomchild.segment.srx.io.SrxAnyParser; 13 | 14 | 15 | /** 16 | * Represents a sentence splitter using rules defined in SRX format. 17 | * Uses external segment library. 
18 | * 19 | * @see Segment Project 20 | * @see SRX Standard 21 | * @author Jarek Lipski (loomchild) 22 | */ 23 | public class SrxSplitAlgorithm extends SplitAlgorithm { 24 | 25 | private SrxDocument document; 26 | 27 | private String languageCode; 28 | 29 | /** 30 | * Creates a SRX splitter using given rules and selecting the ones to apply 31 | * using given language code. 32 | * @param reader reader containing SRX rules 33 | * @param languageCode language code used to select the rules to apply 34 | */ 35 | public SrxSplitAlgorithm(Reader reader, String languageCode) { 36 | SrxParser parser = new SrxAnyParser(); 37 | this.document = parser.parse(reader); 38 | this.languageCode = languageCode; 39 | } 40 | 41 | public List split(String string) { 42 | TextIterator textIterator = 43 | new SrxTextIterator(document, languageCode, string); 44 | List segmentList = new ArrayList(); 45 | while(textIterator.hasNext()) { 46 | String segment = textIterator.next(); 47 | segmentList.add(segment); 48 | } 49 | return segmentList; 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/aligner/AlignerTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.aligner; 2 | 3 | import java.util.List; 4 | 5 | import net.loomchild.maligna.coretypes.Alignment; 6 | import net.loomchild.maligna.filter.aligner.align.AlignAlgorithm; 7 | import net.loomchild.maligna.filter.aligner.align.AlignAlgorithmMock; 8 | import net.loomchild.maligna.util.TestUtil; 9 | 10 | import org.junit.Test; 11 | 12 | /** 13 | * Represents unit test of {@link Aligner} class. Checks if it applies 14 | * {@link AlignAlgorithm} correctly using {@link AlignAlgorithmMock}. 
15 | * @author loomchild 16 | */ 17 | public class AlignerTest { 18 | 19 | public static final String[][] SOURCE_SEGMENT_ARRAY = { 20 | new String[] {"a", "b", "c", "d"}, 21 | new String[] {"e", "f"} 22 | }; 23 | 24 | public static final String[][] TARGET_SEGMENT_ARRAY = { 25 | new String[] {"1", "2", "3"}, 26 | new String[] {"4"} 27 | }; 28 | 29 | public static final String[][] EXPECTED_SOURCE_SEGMENT_ARRAY = { 30 | new String[] {"a", "b"}, 31 | new String[] {"c", "d"}, 32 | new String[] {"e", "f"}, 33 | }; 34 | 35 | public static final String[][] EXPECTED_TARGET_SEGMENT_ARRAY = { 36 | new String[] {"1", "2"}, 37 | new String[] {"3"}, 38 | new String[] {"4"}, 39 | }; 40 | 41 | /** 42 | * Checks if {@link Aligner} uses {@link AlignAlgorithm} correctly. 43 | */ 44 | @Test 45 | public void testAlign() { 46 | AlignAlgorithm algorithm = new AlignAlgorithmMock(2); 47 | Aligner aligner = new Aligner(algorithm); 48 | List alignmentList = TestUtil.createAlignmentList( 49 | SOURCE_SEGMENT_ARRAY, TARGET_SEGMENT_ARRAY); 50 | List resultAlignmentList = aligner.apply(alignmentList); 51 | TestUtil.assertAlignmentListEquals(EXPECTED_SOURCE_SEGMENT_ARRAY, 52 | EXPECTED_TARGET_SEGMENT_ARRAY, resultAlignmentList); 53 | } 54 | 55 | } 56 | 57 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/macro/PoissonMacro.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.macro; 2 | 3 | import java.util.List; 4 | 5 | import net.loomchild.maligna.coretypes.Alignment; 6 | import net.loomchild.maligna.filter.aligner.align.hmm.HmmAlignAlgorithmFactory; 7 | import net.loomchild.maligna.filter.aligner.align.hmm.adaptive.AdaptiveBandAlgorithm; 8 | import net.loomchild.maligna.calculator.Calculator; 9 | import net.loomchild.maligna.calculator.length.PoissonDistributionCalculator; 10 | import 
net.loomchild.maligna.calculator.length.counter.Counter; 11 | import net.loomchild.maligna.calculator.length.counter.SplitCounter; 12 | import net.loomchild.maligna.filter.Filter; 13 | import net.loomchild.maligna.filter.aligner.Aligner; 14 | import net.loomchild.maligna.filter.aligner.align.AlignAlgorithm; 15 | import net.loomchild.maligna.filter.aligner.align.hmm.fb.ForwardBackwardAlgorithmFactory; 16 | 17 | /** 18 | * Uses algorithm similar to Gale and Church (see {@link GaleAndChurchMacro}), 19 | * but instead of normal distribution uses Poisson distribution and 20 | * measures length of sentence in words instead of characters as in original. 21 | * 22 | * Seems to give better results than Gale and Church algorithm. 23 | * 24 | * @author loomchild 25 | */ 26 | public class PoissonMacro implements Macro { 27 | 28 | public List apply(List alignmentList) { 29 | 30 | Counter counter = new SplitCounter(); 31 | Calculator calculator = 32 | new PoissonDistributionCalculator(counter, alignmentList); 33 | 34 | HmmAlignAlgorithmFactory algorithmFactory = 35 | new ForwardBackwardAlgorithmFactory(); 36 | 37 | AlignAlgorithm algorithm = 38 | new AdaptiveBandAlgorithm(algorithmFactory, calculator); 39 | 40 | Filter filter = new Aligner(algorithm); 41 | 42 | return filter.apply(alignmentList); 43 | 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/formatter/AlFormatter.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.formatter; 2 | 3 | import java.io.Writer; 4 | import java.util.List; 5 | 6 | import net.loomchild.maligna.coretypes.Alignment; 7 | import net.loomchild.maligna.util.bind.AlMarshallerUnmarshaller; 8 | import net.loomchild.maligna.util.bind.al.Alignmentlist; 9 | import net.loomchild.maligna.util.bind.al.Segmentlist; 10 | 11 | /** 12 | * Represents formatter to native .AL format. 
13 | * 14 | * This format preserves all information about alignments including scores. 15 | * Alignments are stored using a writer defined in constructor. 16 | * 17 | * @author Jarek Lipski (loomchild) 18 | */ 19 | public class AlFormatter implements Formatter { 20 | 21 | private Writer writer; 22 | 23 | /** 24 | * Creates formatter. 25 | * @param writer writer to which the output will be dumped. 26 | */ 27 | public AlFormatter(Writer writer) { 28 | this.writer = writer; 29 | } 30 | 31 | /** 32 | * Formats alignments to a writer preserving all parameters. 33 | */ 34 | public void format(List alignmentList) { 35 | Alignmentlist al = new Alignmentlist(); 36 | for (Alignment alignment : alignmentList) { 37 | net.loomchild.maligna.util.bind.al.Alignment a = new net.loomchild.maligna.util.bind.al.Alignment(); 38 | a.setScore((double)alignment.getScore()); 39 | a.setSourcelist(createSegmentList( 40 | alignment.getSourceSegmentList())); 41 | a.setTargetlist(createSegmentList( 42 | alignment.getTargetSegmentList())); 43 | al.getAlignment().add(a); 44 | } 45 | AlMarshallerUnmarshaller.getInstance().marshal(al, writer); 46 | } 47 | 48 | private Segmentlist createSegmentList(List segmentList) { 49 | Segmentlist sl = new Segmentlist(); 50 | for (String segment : segmentList) { 51 | sl.getSegment().add(segment); 52 | } 53 | return sl; 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/parser/AlParser.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.parser; 2 | 3 | import java.io.Reader; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import net.loomchild.maligna.coretypes.Alignment; 8 | import net.loomchild.maligna.util.bind.AlMarshallerUnmarshaller; 9 | import net.loomchild.maligna.util.bind.al.Alignmentlist; 10 | import net.loomchild.maligna.util.bind.al.Segmentlist; 11 | 12 | /** 13 | * 
Represents parser of a native .al format. 14 | * Parses a document configured in constructor. 15 | * 16 | * @author loomchild 17 | */ 18 | public class AlParser implements Parser { 19 | 20 | private Reader reader; 21 | 22 | /** 23 | * Constructs parser. 24 | * @param reader input document to be parsed 25 | */ 26 | public AlParser(Reader reader) { 27 | this.reader = reader; 28 | } 29 | 30 | /** 31 | * Parses a document into a list of alignments. 32 | * Retrieves all information stored in this format including score. 33 | */ 34 | public List parse() { 35 | List alignmentList = new ArrayList(); 36 | Alignmentlist al = 37 | AlMarshallerUnmarshaller.getInstance().unmarshal(reader); 38 | for (net.loomchild.maligna.util.bind.al.Alignment a : al.getAlignment()) { 39 | List sourceSegmentList = 40 | createSegmentList(a.getSourcelist()); 41 | List targetSegmentList = 42 | createSegmentList(a.getTargetlist()); 43 | float score = a.getScore().floatValue(); 44 | Alignment alignment = new Alignment(sourceSegmentList, 45 | targetSegmentList, score); 46 | alignmentList.add(alignment); 47 | } 48 | return alignmentList; 49 | } 50 | 51 | private List createSegmentList(Segmentlist sl) { 52 | List segmentList = new ArrayList(); 53 | for (String s : sl.getSegment()) { 54 | segmentList.add(s); 55 | } 56 | return segmentList; 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/formatter/HtmlFormatter.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.formatter; 2 | 3 | import java.io.PrintWriter; 4 | import java.io.Writer; 5 | import java.util.List; 6 | 7 | import net.loomchild.maligna.coretypes.Alignment; 8 | 9 | /** 10 | * Represents HTML formatter that produces human-readable output. 11 | * 12 | * The output is stored using configured writer in two columns of HTML table. 
13 | * 14 | * @author loomchild 15 | */ 16 | public class HtmlFormatter implements Formatter { 17 | 18 | private PrintWriter writer; 19 | 20 | /** 21 | * Constructs a formatter 22 | * @param writer writer that will be used as output 23 | */ 24 | public HtmlFormatter(Writer writer) { 25 | this.writer = new PrintWriter(writer, true); 26 | } 27 | 28 | /** 29 | * Formats the alignment into full HTML page containing a table with 30 | * two columns representing source and target texts. 31 | * 32 | * @param alignmentList input alignment list 33 | */ 34 | public void format(List alignmentList) { 35 | writer.println(""); 36 | 37 | writer.println(""); 38 | writer.println(""); 39 | writer.println(""); 40 | 41 | writer.println(""); 42 | writer.println(""); 43 | for (Alignment alignment : alignmentList) { 44 | writer.println(""); 45 | formatStrings(alignment.getSourceSegmentList()); 46 | formatStrings(alignment.getTargetSegmentList()); 47 | writer.println(""); 48 | } 49 | writer.println("
"); 50 | writer.println(""); 51 | writer.println(""); 52 | } 53 | 54 | private void formatStrings(List stringList) { 55 | writer.println(""); 56 | for (String string : stringList) { 57 | writer.println("

" + string + "

"); 58 | } 59 | writer.println(""); 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/modifier/modify/merge/SeparatorMergeAlgorithmTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.merge; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.Arrays; 6 | import java.util.Collections; 7 | import java.util.List; 8 | 9 | import org.junit.Before; 10 | import org.junit.Test; 11 | 12 | 13 | /** 14 | * Represents {@link SeparatorMergeAlgorithm} unit test. 15 | * @author loomchild 16 | */ 17 | public class SeparatorMergeAlgorithmTest { 18 | 19 | private MergeAlgorithm merger; 20 | 21 | @Before 22 | public void setUp() { 23 | merger = new SeparatorMergeAlgorithm(" "); 24 | } 25 | 26 | /** 27 | * Check if merging empty list returns empty segment. 28 | */ 29 | @Test 30 | public void mergeEmpty() { 31 | List list = Collections.emptyList(); 32 | String segment = merger.merge(list); 33 | assertEquals("", segment); 34 | } 35 | 36 | /** 37 | * Checks if merging a list containing just one segment 38 | * returns the same segment. 39 | */ 40 | @Test 41 | public void mergeSingleton() { 42 | List list = Collections.singletonList("ala"); 43 | String segment = merger.merge(list); 44 | assertEquals("ala", segment); 45 | } 46 | 47 | /** 48 | * Test merging with separator. 49 | */ 50 | @Test 51 | public void merge() { 52 | List list = Arrays.asList(new String[] {"ala", "ma", " kota"}); 53 | String segment = merger.merge(list); 54 | assertEquals("ala ma kota", segment); 55 | } 56 | 57 | /** 58 | * Tests merging without a separator - if the result will be 59 | * exactly the same as string contatenation. 
60 | */ 61 | @Test 62 | public void mergeNoSeparator() { 63 | MergeAlgorithm emptyMerger = new SeparatorMergeAlgorithm(""); 64 | List list = Arrays.asList(new String[] {"ala", "ma", " kota"}); 65 | String segment = emptyMerger.merge(list); 66 | assertEquals("alama kota", segment); 67 | } 68 | 69 | 70 | } 71 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/meta/IgnoreInfiniteProbabilityAlignmentsFilterDecorator.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.meta; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import net.loomchild.maligna.coretypes.Alignment; 7 | import net.loomchild.maligna.filter.Filter; 8 | 9 | /** 10 | * Represents a filter decorator ignoring all alignments with score 11 | * equal to {@link Float#NEGATIVE_INFINITY}. Can be used to force 12 | * certain alignments / segmentations because they were checked by human. 13 | * 14 | * Note: 15 | * Decorators are classes that enhance or change behavior of underlying class. 16 | * 17 | * @author loomchild 18 | */ 19 | public class IgnoreInfiniteProbabilityAlignmentsFilterDecorator implements Filter { 20 | 21 | private Filter filter; 22 | 23 | /** 24 | * Creates decorator. 25 | * @param filter filter to be decorated 26 | */ 27 | public IgnoreInfiniteProbabilityAlignmentsFilterDecorator(Filter filter) { 28 | this.filter = filter; 29 | } 30 | 31 | /** 32 | * Iterates over a list of alignments and if the alignment has 33 | * score equal to {@link Float#NEGATIVE_INFINITY} then it 34 | * is copied to resulting list. Otherwise alignment is stored 35 | * in a helper list to which underlying filter will be applied 36 | * when the next ignored alignment is encountered. 
37 | */ 38 | public List apply(List alignmentList) { 39 | List resultList = new ArrayList(); 40 | 41 | List currentList = new ArrayList(); 42 | 43 | for (Alignment alignment : alignmentList) { 44 | if (alignment.getScore() == Float.NEGATIVE_INFINITY) { 45 | resultList.addAll(filter.apply(currentList)); 46 | currentList.clear(); 47 | resultList.add(alignment); 48 | } else { 49 | currentList.add(alignment); 50 | } 51 | } 52 | resultList.addAll(filter.apply(currentList)); 53 | 54 | return resultList; 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/coretypes/AlignmentTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.coretypes; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | 11 | /** 12 | * Represents {@link Alignment} class test. 13 | * @author loomchild 14 | */ 15 | public class AlignmentTest { 16 | 17 | private List sourceSegmentList; 18 | 19 | private List targetSegmentList; 20 | 21 | @Before 22 | public void setUp() { 23 | sourceSegmentList = new ArrayList(); 24 | targetSegmentList = new ArrayList(); 25 | } 26 | 27 | /** 28 | * Tests whether after calling the constructor lists stored in 29 | * {@link Alignment} are copies of the arguments. 30 | */ 31 | @Test 32 | public void contructorListCopying() { 33 | Alignment alignment = new Alignment(sourceSegmentList, 34 | targetSegmentList, 2.0f); 35 | checkAlignment(alignment); 36 | } 37 | 38 | /** 39 | * Tests whether after calling {@link Alignment#addSourceSegmentList(List)} 40 | * and {@link Alignment#addTargetSegmentList(List)} methods, lists stored in 41 | * {@link Alignment} are copies of the arguments. 
42 | */ 43 | @Test 44 | public void methodListCopying() { 45 | Alignment alignment = new Alignment(); 46 | alignment.addSourceSegmentList(sourceSegmentList); 47 | alignment.addTargetSegmentList(targetSegmentList); 48 | alignment.setScore(2.0f); 49 | checkAlignment(alignment); 50 | } 51 | 52 | private void checkAlignment(Alignment alignment) { 53 | assertEquals(0, alignment.getSourceSegmentList().size()); 54 | sourceSegmentList.add("a"); 55 | assertEquals(0, alignment.getSourceSegmentList().size()); 56 | assertEquals(0, alignment.getTargetSegmentList().size()); 57 | targetSegmentList.add("c"); 58 | assertEquals(0, alignment.getTargetSegmentList().size()); 59 | assertEquals(2.0f, alignment.getScore(), 0.000000001f); 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/macro/MooreMacroTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.macro; 2 | 3 | import java.util.List; 4 | 5 | import net.loomchild.maligna.coretypes.Alignment; 6 | import net.loomchild.maligna.util.TestUtil; 7 | 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | 11 | public class MooreMacroTest { 12 | 13 | private MooreMacro macro; 14 | 15 | @Before 16 | public void setUp() { 17 | this.macro = new MooreMacro(); 18 | } 19 | 20 | /** 21 | * Tests if when aligning three to one no {@link NullPointerException} 22 | * will be thrown, as it was the case. 
23 | */ 24 | @Test 25 | public void testPreservesAllSegments() { 26 | String[] sourceSegments = new String[]{ 27 | "He had given up attending to matters of practical importance; he had lost all desire to do so.", 28 | "Nothing that any landlady could do had a real terror for him.", 29 | "But to be stopped on the stairs, to be forced to listen to her trivial, irrelevant gossip, to pestering demands for payment, threats and complaints, and to rack his brains for excuses, to prevaricate, to lie—no, rather than that, he would creep down the stairs like a cat and slip out unseen." 30 | }; 31 | 32 | String[] targetSegments = new String[]{ 33 | "Aber auf der Treppe stehenzubleiben, allerlei Gewäsch über allen möglichen ihm ganz gleichgültigen Alltagskram, all diese Mahnungen ans Bezahlen, die Drohungen und Klagen anzuhören und dabei selbst sich herauszuwinden, sich zu entschuldigen, zu lügen – nein, da war es schon besser, wie eine Katze auf der Treppe vorbeizuschlüpfen und sich, ohne von jemand gesehen zu werden, flink davonzumachen." 34 | }; 35 | 36 | List alignmentList = TestUtil.createAlignmentList( 37 | new String[][]{sourceSegments}, 38 | new String[][]{targetSegments}); 39 | 40 | List result = macro.apply(alignmentList); 41 | 42 | TestUtil.assertAlignmentListContains(sourceSegments, targetSegments, result); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/calculator/meta/MinimumCalculator.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.calculator.meta; 2 | 3 | import java.util.List; 4 | 5 | import net.loomchild.maligna.calculator.Calculator; 6 | 7 | /** 8 | * Represents conditional calculator. 9 | * If score calculated by given testCalculator is equal or less than the 10 | * given threshold returns given minimum score, 11 | * otherwise returns score calculated by calculator. 
12 | * 13 | * TODO: generalize to n calculators, simplify that no default is defined - 14 | * just if score returned by any calculator is zero then do not try calculate 15 | * others because score by definition cannot be less than zero. 16 | * 17 | * @author loomchild 18 | * 19 | */ 20 | public class MinimumCalculator implements Calculator { 21 | 22 | private static final float DEFAULT_MINIMUM_SCORE = 0.0f; 23 | 24 | private Calculator testCalculator; 25 | 26 | private Calculator calculator; 27 | 28 | private float scoreThreshold; 29 | 30 | private float minimumScore; 31 | 32 | public MinimumCalculator(Calculator testCalculator, 33 | Calculator calculator, float scoreThreshold, float minumumScore) { 34 | this.testCalculator = testCalculator; 35 | this.calculator = calculator; 36 | this.scoreThreshold = scoreThreshold; 37 | this.minimumScore = minumumScore; 38 | } 39 | 40 | public MinimumCalculator(Calculator testCalculator, 41 | Calculator calculator, float scoreThreshold) { 42 | this(testCalculator,calculator, scoreThreshold, DEFAULT_MINIMUM_SCORE); 43 | } 44 | 45 | public float calculateScore(List sourceSegmentList, 46 | List targetSegmentList) { 47 | 48 | float testScore = 49 | testCalculator.calculateScore(sourceSegmentList, targetSegmentList); 50 | float score; 51 | 52 | if (testScore <= scoreThreshold) { 53 | score = minimumScore; 54 | } else { 55 | score = calculator.calculateScore(sourceSegmentList, targetSegmentList); 56 | } 57 | 58 | return score; 59 | 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/matrix/BandMatrixIterator.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.matrix; 2 | 3 | import java.util.NoSuchElementException; 4 | 5 | /** 6 | * Represents {@link BandMatrix} iterator. 
7 | * 8 | * @author loomchild 9 | * @param matrix data type 10 | */ 11 | public class BandMatrixIterator implements MatrixIterator { 12 | 13 | private BandMatrix matrix; 14 | 15 | private int x, y, maxX, minX; 16 | 17 | public BandMatrixIterator(BandMatrix matrix) { 18 | this.matrix = matrix; 19 | beforeFirst(); 20 | } 21 | 22 | public int getX() { 23 | return x; 24 | } 25 | 26 | public int getY() { 27 | return y; 28 | } 29 | 30 | public void beforeFirst() { 31 | x = -1; 32 | y = 0; 33 | calculateMinMaxX(); 34 | } 35 | 36 | public boolean hasNext() { 37 | return !((y >= matrix.getHeight() - 1) && (x >= matrix.getWidth() - 1)); 38 | } 39 | 40 | public void next() { 41 | ++x; 42 | if (x > maxX) { 43 | ++y; 44 | calculateMinMaxX(); 45 | x = minX; 46 | if (y >= matrix.getHeight()) { 47 | throw new NoSuchElementException(); 48 | } 49 | } 50 | } 51 | 52 | public void afterLast() { 53 | x = matrix.getWidth(); 54 | y = matrix.getHeight() - 1; 55 | calculateMinMaxX(); 56 | } 57 | 58 | public boolean hasPrevious() { 59 | return !((y <= 0) && (x <= 0)); 60 | } 61 | 62 | public void previous() { 63 | --x; 64 | if (x < minX) { 65 | --y; 66 | calculateMinMaxX(); 67 | x = maxX; 68 | if (y < 0) { 69 | throw new NoSuchElementException(); 70 | } 71 | } 72 | } 73 | 74 | /** 75 | * Calculates minimum and maximum x position (column number) at current 76 | * y position (row number) and stores then in {@link #minX} and 77 | * {@link #maxX}. 
78 | */ 79 | private void calculateMinMaxX() { 80 | int diagonalX = matrix.getDiagonalX(y); 81 | minX = Math.max(0, diagonalX - matrix.getBandRadius()); 82 | maxX = Math.min(matrix.getWidth() - 1, 83 | diagonalX + matrix.getBandRadius()); 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/formatter/TmxFormatterTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.formatter; 2 | 3 | import static net.loomchild.maligna.util.TestUtil.assertAlignmentListEquals; 4 | import static net.loomchild.maligna.util.TestUtil.createAlignmentList; 5 | 6 | import java.io.Reader; 7 | import java.io.StringReader; 8 | import java.io.StringWriter; 9 | import java.util.List; 10 | 11 | import net.loomchild.maligna.coretypes.Alignment; 12 | import net.loomchild.maligna.parser.TmxParser; 13 | 14 | import org.junit.Test; 15 | 16 | /** 17 | * Represents {@link TmxFormatter} class test. 18 | * 19 | * @author loomchild 20 | */ 21 | public class TmxFormatterTest { 22 | 23 | public static final String SOURCE_LANGUAGE = "pl"; 24 | 25 | public static final String TARGET_LANGUAGE = "de"; 26 | 27 | public static final String[][] EXPECTED_SOURCE_ARRAY = new String[][]{ 28 | new String[]{"Ala ma kota kot ma\tale nie wie.\nDrugie.Burza mózgów zawsze " + 29 | "dobrze robi."}, 30 | new String[]{}, 31 | }; 32 | 33 | public static final String[][] EXPECTED_TARGET_ARRAY = new String[][]{ 34 | new String[]{"Wasserreservoir, Wasserreservoir..."}, 35 | new String[]{"Immer nur Wasser"}, 36 | }; 37 | 38 | /** 39 | * Tests whether alignment formatted by {@link TmxFormatter} can be 40 | * successfully parsed by {@link TmxParser}. 
41 | */ 42 | @Test 43 | public void testFormatParse() { 44 | List alignmentList = createAlignmentList( 45 | AlFormatterTest.SOURCE_ARRAY, AlFormatterTest.TARGET_ARRAY); 46 | StringWriter writer = new StringWriter(); 47 | Formatter formatter = new TmxFormatter(writer, SOURCE_LANGUAGE, 48 | TARGET_LANGUAGE); 49 | formatter.format(alignmentList); 50 | Reader reader = new StringReader(writer.toString()); 51 | TmxParser parser = new TmxParser(reader, SOURCE_LANGUAGE, 52 | TARGET_LANGUAGE); 53 | List resultAlignmentList = parser.parse(); 54 | assertAlignmentListEquals(EXPECTED_SOURCE_ARRAY, EXPECTED_TARGET_ARRAY, 55 | resultAlignmentList); 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/formatter/PlaintextFormatter.java: --------------------------------------------------------------------------------
package net.loomchild.maligna.formatter;

import static net.loomchild.maligna.util.Util.merge;

import java.io.PrintWriter;
import java.io.Writer;
import java.util.List;

import net.loomchild.maligna.coretypes.Alignment;

/**
 * Represents a formatter writing to separate source and target plaintext files.
 *
 * For each alignment one line is written to each file, containing all
 * segments of that alignment merged together. Subsequent lines in source
 * and target files correspond to each other, so the number of lines is
 * equal in both files.
 *
 * @author loomchild
 */
public class PlaintextFormatter implements Formatter {

    private final PrintWriter sourceWriter;

    private final PrintWriter targetWriter;

    /**
     * Creates formatter. Both writers are wrapped in auto-flushing
     * {@link PrintWriter}s.
     * @param sourceWriter source file writer
     * @param targetWriter target file writer
     */
    public PlaintextFormatter(Writer sourceWriter, Writer targetWriter) {
        this.sourceWriter = new PrintWriter(sourceWriter, true);
        this.targetWriter = new PrintWriter(targetWriter, true);
    }

    /**
     * Formats alignments using the configured source and target writer;
     * every alignment produces exactly one line in each of them.
     * @param alignmentList input alignment list
     */
    public void format(List<Alignment> alignmentList) {
        for (Alignment alignment : alignmentList) {
            printSegmentList(sourceWriter, alignment.getSourceSegmentList());
            printSegmentList(targetWriter, alignment.getTargetSegmentList());
        }
    }

    /**
     * Merges segments and replaces end-of-line characters with spaces to make
     * sure resulting files have the same number of lines.
     * @param writer writer receiving the line
     * @param segmentList segments forming one line
     */
    private void printSegmentList(PrintWriter writer,
            List<String> segmentList) {
        String segment = merge(segmentList);
        // A carriage return is a line terminator too (alone or as part of
        // "\r\n"), so it has to go as well or the line count can still
        // diverge between the two files. "\r\n" is collapsed first so that
        // it yields a single space.
        segment = segment.replace("\r\n", " ")
                .replace('\n', ' ')
                .replace('\r', ' ');
        writer.println(segment);
    }

}
62 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/matrix/MatrixIterator.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.matrix; 2 | 3 | import java.util.NoSuchElementException; 4 | 5 | 6 | /** 7 | * Represents generic matrix iterator. 8 | * Iterates the matrix from top left to bottom right corner, increasing 9 | * row number first, and if it reaches maximum increasing column number 10 | * ([0,0], [1,0], ... [n,0], [0,1],...). 11 | * Some of the elements on the matrix may be ignored if it does not 12 | * store them, but the overall order must be preserved. 13 | * Also enables iterating the matrix in reverse order.
14 | * 15 | * @author loomchild 16 | * @param data type stored in the matrix 17 | */ 18 | public interface MatrixIterator { 19 | 20 | /** 21 | * @return x position of the iterator (column) 22 | */ 23 | public int getX(); 24 | 25 | /** 26 | * @return y position of the iterator (row) 27 | */ 28 | public int getY(); 29 | 30 | /** 31 | * Resets the iterator - sets its position to before first element. 32 | */ 33 | public void beforeFirst(); 34 | 35 | /** 36 | * @return true if iterator has next element (hasn't reached bottom 37 | * right corner) 38 | */ 39 | public boolean hasNext(); 40 | 41 | /** 42 | * Advances the iterator to the next element. If this is not possible 43 | * because iterator hasn't got the next element ({@link #hasNext()} 44 | * returns false) it throws an exception. 45 | * 46 | * @throws NoSuchElementException when there are no more elements 47 | */ 48 | public void next(); 49 | 50 | /** 51 | * Sets the position to after last element - subsequent calls to 52 | * {@link #hasNext()} will return false. 53 | */ 54 | public void afterLast(); 55 | 56 | /** 57 | * @return true if iterator has previous element (hasn't reached top left 58 | * corner). 59 | */ 60 | public boolean hasPrevious() ; 61 | 62 | /** 63 | * Moves the iterator to the previous element. If this is not possible 64 | * because iterator hasn't got the previous element ({@link #hasPrevious()} 65 | * returns false) it throws an exception.
66 | * 67 | * @throws NoSuchElementException when there is not previous elements 68 | */ 69 | public void previous(); 70 | 71 | } 72 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/formatter/PresentationFormatterTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.formatter; 2 | 3 | import static net.loomchild.maligna.util.TestUtil.createAlignmentList; 4 | import static org.junit.Assert.assertEquals; 5 | 6 | import java.io.StringWriter; 7 | import java.util.Collections; 8 | import java.util.List; 9 | 10 | import net.loomchild.maligna.coretypes.Alignment; 11 | 12 | import org.junit.Test; 13 | 14 | /** 15 | * Represents {@link PresentationFormatter} test. 16 | * 17 | * @author loomchild 18 | */ 19 | public class PresentationFormatterTest { 20 | 21 | public static final String LINE_SEPARATOR = 22 | System.getProperty("line.separator"); 23 | 24 | public static final String EXPECTED = 25 | "Ala ma kota kot ma | Wasserreservoir, Was" + LINE_SEPARATOR + 26 | "ale nie wie. | serreservoir... " + LINE_SEPARATOR + 27 | "Drugie. | " + LINE_SEPARATOR + 28 | " | " + LINE_SEPARATOR + 29 | "Burza mózgów zawsze | " + LINE_SEPARATOR + 30 | "dobrze robi. | " + LINE_SEPARATOR + 31 | "_____________________|_____________________" + LINE_SEPARATOR + 32 | "_____________________|_____________________" + LINE_SEPARATOR + 33 | " | Immer nur Wasser " + LINE_SEPARATOR; 34 | 35 | /** 36 | * Tests if formatting empty text returns empty output. 37 | */ 38 | @Test 39 | public void formatEmpty() { 40 | StringWriter writer = new StringWriter(); 41 | Formatter formatter = new PresentationFormatter(writer, 9); 42 | List alignmentList = Collections.emptyList(); 43 | formatter.format(alignmentList); 44 | assertEquals("", writer.toString()); 45 | } 46 | 47 | /** 48 | * Tests if the output is the same as {@link #EXPECTED}. 
49 | */ 50 | @Test 51 | public void format() { 52 | List alignmentList = createAlignmentList( 53 | AlFormatterTest.SOURCE_ARRAY, AlFormatterTest.TARGET_ARRAY); 54 | StringWriter writer = new StringWriter(); 55 | Formatter formatter = new PresentationFormatter(writer, 43); 56 | formatter.format(alignmentList); 57 | assertEquals(EXPECTED + " ", writer.toString() + " "); 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/aligner/align/AlignAlgorithmMock.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.aligner.align; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | 7 | import net.loomchild.maligna.coretypes.Alignment; 8 | import net.loomchild.maligna.filter.aligner.AlignmentImpossibleException; 9 | 10 | /** 11 | * Align algorithm mock used for unit testing. 12 | * Returns alignments with given number of source and target segments 13 | * (if possible). 14 | * 15 | * @author Jarek Lipski (loomchild) 16 | */ 17 | public class AlignAlgorithmMock implements AlignAlgorithm { 18 | 19 | private int maxSegments; 20 | 21 | /** 22 | * Creates aligner. 23 | * @param maxSegments number of source / target segments in each alignment 24 | */ 25 | public AlignAlgorithmMock(int maxSegments) { 26 | this.maxSegments = maxSegments; 27 | } 28 | 29 | /** 30 | * Aligns source segments to target segments. Resulting 31 | * alignments will always be of n-n category (if there are enough segments), 32 | * where n is configured max segments. 
33 | * 34 | * @param sourceSegmentList source segment list 35 | * @param targetSegmentList target segment list 36 | * @return alignment list 37 | * @throws AlignmentImpossibleException never thrown because alignment is 38 | * always possible with this aligner 39 | */ 40 | public List align(List sourceSegmentList, 41 | List targetSegmentList) { 42 | Iterator sourceIterator = sourceSegmentList.iterator(); 43 | Iterator targetIterator = targetSegmentList.iterator(); 44 | int maxSize = Math.max(sourceSegmentList.size(), 45 | targetSegmentList.size()); 46 | List alignmentList = new ArrayList(); 47 | Alignment alignment = null; 48 | for (int i = 0; i < maxSize; ++i) { 49 | if (i % maxSegments == 0) { 50 | alignment = new Alignment(); 51 | alignmentList.add(alignment); 52 | } 53 | if (sourceIterator.hasNext()) { 54 | alignment.addSourceSegment(sourceIterator.next()); 55 | } 56 | if (targetIterator.hasNext()) { 57 | alignment.addTargetSegment(targetIterator.next()); 58 | } 59 | } 60 | return alignmentList; 61 | } 62 | 63 | 64 | } 65 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/aligner/UnifyAlignerTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.aligner; 2 | 3 | import static net.loomchild.maligna.util.TestUtil.assertAlignmentListEquals; 4 | import static net.loomchild.maligna.util.TestUtil.createAlignmentList; 5 | 6 | import java.util.List; 7 | 8 | import net.loomchild.maligna.coretypes.Alignment; 9 | 10 | import org.junit.Test; 11 | 12 | /** 13 | * Represents {@link UnifyAligner} unit test. 
14 | * @author loomchild 15 | * 16 | */ 17 | public class UnifyAlignerTest { 18 | 19 | public static final String[][] REFERENCE_SOURCE_SEGMENT_ARRAY = { 20 | new String[] {"", ""}, 21 | new String[] {"", "", ""}, 22 | new String[] {""}, 23 | }; 24 | 25 | public static final String[][] REFERENCE_TARGET_SEGMENT_ARRAY = { 26 | new String[] {""}, 27 | new String[] {""}, 28 | new String[] {"", ""}, 29 | }; 30 | 31 | public static final String[][] SOURCE_SEGMENT_ARRAY = { 32 | new String[] {"a", "b", "c", "d"}, 33 | new String[] {"e", "f"} 34 | }; 35 | 36 | public static final String[][] TARGET_SEGMENT_ARRAY = { 37 | new String[] {"1", "2", "3"}, 38 | new String[] {"4"} 39 | }; 40 | 41 | public static final String[][] EXPECTED_SOURCE_SEGMENT_ARRAY = { 42 | new String[] {"a", "b"}, 43 | new String[] {"c", "d", "e"}, 44 | new String[] {"f"}, 45 | }; 46 | 47 | public static final String[][] EXPECTED_TARGET_SEGMENT_ARRAY = { 48 | new String[] {"1"}, 49 | new String[] {"2"}, 50 | new String[] {"3", "4"}, 51 | }; 52 | 53 | /** 54 | * Checks whether unify aligner works as expected using 55 | * {@link #REFERENCE_SOURCE_SEGMENT_ARRAY}, 56 | * {@link #REFERENCE_TARGET_SEGMENT_ARRAY}. 
57 | */ 58 | @Test 59 | public void testAlign() { 60 | List referenceAlignmentList = createAlignmentList( 61 | REFERENCE_SOURCE_SEGMENT_ARRAY, REFERENCE_TARGET_SEGMENT_ARRAY); 62 | UnifyAligner aligner = new UnifyAligner(referenceAlignmentList); 63 | List alignmentList = createAlignmentList( 64 | SOURCE_SEGMENT_ARRAY, TARGET_SEGMENT_ARRAY); 65 | List resultAlignmentList = aligner.apply(alignmentList); 66 | assertAlignmentListEquals(EXPECTED_SOURCE_SEGMENT_ARRAY, 67 | EXPECTED_TARGET_SEGMENT_ARRAY, resultAlignmentList); 68 | } 69 | 70 | } 71 | 72 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/parser/TmxParserTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.parser; 2 | 3 | import java.io.InputStream; 4 | import java.io.Reader; 5 | import java.util.List; 6 | 7 | import net.loomchild.maligna.coretypes.Alignment; 8 | import net.loomchild.maligna.util.Util; 9 | import net.loomchild.maligna.util.TestUtil; 10 | 11 | import org.junit.Test; 12 | 13 | 14 | /** 15 | * Represents {@link TmxParser} unit test. 16 | * @author loomchild 17 | */ 18 | public class TmxParserTest { 19 | 20 | public static final String FILE = "net/loomchild/maligna/res/test/simpletext.tmx"; 21 | 22 | public static final String SOURCE_LANGUAGE = "en"; 23 | 24 | public static final String TARGET_LANGUAGE = "pl"; 25 | 26 | public static final String[][] SOURCE_SEGMENT_ARRAY = { 27 | new String[] {"First sentence. "}, 28 | new String[] {"Second sentence."}, 29 | }; 30 | 31 | public static final String[][] TARGET_SEGMENT_ARRAY = { 32 | new String[] {"Pierwsze zdanie."}, 33 | new String[] {}, 34 | }; 35 | 36 | /** 37 | * Test if parsing {@value #FILE} works as expected. 
38 | * @throws Exception 39 | */ 40 | @Test 41 | public void parseCorrect() throws Exception { 42 | InputStream inputStream = Util.getResourceStream(FILE); 43 | Reader reader = Util.getReader(inputStream); 44 | TmxParser parser = new TmxParser(reader, SOURCE_LANGUAGE, 45 | TARGET_LANGUAGE); 46 | List alignmentList = parser.parse(); 47 | TestUtil.assertAlignmentListEquals(SOURCE_SEGMENT_ARRAY, TARGET_SEGMENT_ARRAY, 48 | alignmentList); 49 | } 50 | 51 | public static final String BAD_SOURCE_LANGUAGE = "de"; 52 | 53 | /** 54 | * Test if parsing {@value #FILE} but with a source language 55 | * {@value #BAD_SOURCE_LANGUAGE} throws an exception because it contains 56 | * more than one variant in this language in one translation unit. 57 | * 58 | * @see TmxParser 59 | * @throws Exception 60 | */ 61 | @Test(expected=TmxParseException.class) 62 | public void parseBadVariantCount() throws Exception { 63 | InputStream inputStream = Util.getResourceStream(FILE); 64 | Reader reader = Util.getReader(inputStream); 65 | TmxParser parser = new TmxParser(reader, BAD_SOURCE_LANGUAGE, 66 | TARGET_LANGUAGE); 67 | parser.parse(); 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/model/language/LanguageModelUtil.java: --------------------------------------------------------------------------------
package net.loomchild.maligna.model.language;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.List;

import net.loomchild.maligna.model.ModelParseException;

/**
 * Represents {@link LanguageModel} utility methods - training and parsing.
 *
 * @author Jarek Lipski (loomchild)
 */
public class LanguageModelUtil {

    /**
     * Trains language model by adding all words from given segment list
     * (training corpus) to it and after that calculating the probabilities
     * by calling {@link MutableLanguageModel#normalize()}.
     * @param segmentList training corpus; each segment is a list of word ids
     * @return trained language model
     */
    public static LanguageModel train(List<List<Integer>> segmentList) {
        MutableLanguageModel model = new MutableLanguageModel();

        for (List<Integer> segment : segmentList) {
            for (int wid : segment) {
                model.addWordOccurence(wid);
            }
        }
        model.normalize();
        return model;
    }

    /**
     * Parses language model from a reader. Uses simple plaintext format
     * where each non-blank line consists of an integer word id, whitespace
     * and an integer occurrence count. Blank lines (empty or consisting of
     * whitespace only) are skipped; leading / trailing whitespace and runs
     * of whitespace between the two fields are accepted. After all lines
     * are read the model is normalized.
     * The reader is wrapped in a {@link BufferedReader} but not closed.
     *
     * @param reader reader from which model should be read
     * @return parsed language model
     * @throws ModelParseException when a line does not consist of exactly
     *         two parts, a part is not an integer or reading fails
     */
    public static LanguageModel parse(Reader reader) {
        try {
            BufferedReader bufferedReader = new BufferedReader(reader);
            MutableLanguageModel languageModel = new MutableLanguageModel();
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                String content = line.trim();
                // Splitting an empty string yields one (empty) element, so
                // blank lines have to be filtered out before splitting.
                if (content.length() == 0) {
                    continue;
                }
                String[] parts = content.split("\\s+");
                if (parts.length != 2) {
                    throw new ModelParseException("Bad number of line parts.");
                }
                int wid = Integer.parseInt(parts[0]);
                int count = Integer.parseInt(parts[1]);
                languageModel.addWordOccurence(wid, count);
            }
            languageModel.normalize();
            return languageModel;
        } catch (NumberFormatException e) {
            throw new ModelParseException("Part format error", e);
        } catch (IOException e) {
            throw new ModelParseException("IO error", e);
        }
    }

}
66 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/parser/PlaintextParser.java: 
-------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.parser; 2 | 3 | import static net.loomchild.maligna.util.Util.readAll; 4 | 5 | import java.io.Reader; 6 | import java.util.Collections; 7 | import java.util.List; 8 | 9 | import net.loomchild.maligna.coretypes.Alignment; 10 | 11 | /** 12 | * Represents plaintext document parser into an alignment. 13 | * 14 | * Can be constructed from strings as well as files. 15 | * The whole content of each input file (STRING) is treated as a single segment. 16 | * Always returns alignment list containing just one alignment. 17 | * 18 | * @author loomchild 19 | */ 20 | public class PlaintextParser implements Parser { 21 | 22 | private Reader sourceReader; 23 | 24 | private Reader targetReader; 25 | 26 | private String sourceString; 27 | 28 | private String targetString; 29 | 30 | 31 | /** 32 | * Constructs a parser from source and target string. 33 | * @param sourceString source segment 34 | * @param targetString target segment 35 | */ 36 | public PlaintextParser(String sourceString, String targetString) { 37 | this.sourceString = sourceString; 38 | this.targetString = targetString; 39 | } 40 | 41 | /** 42 | * Constructs parser from source and target reader. 
43 | * @param sourceReader source text document 44 | * @param targetReader target text document 45 | */ 46 | public PlaintextParser(Reader sourceReader, Reader targetReader) { 47 | this.sourceReader = sourceReader; 48 | this.targetReader = targetReader; 49 | } 50 | 51 | /** 52 | * Parses input documents into an alignment list 53 | * @return alignment list containing just one alignment with one source 54 | * and one target segment 55 | */ 56 | public List parse() { 57 | if (sourceString == null) { 58 | sourceString = readAll(sourceReader); 59 | } 60 | if (targetString == null) { 61 | targetString = readAll(targetReader); 62 | } 63 | 64 | List sourceSegmentList = Collections.singletonList(sourceString); 65 | List targetSegmentList = Collections.singletonList(targetString); 66 | Alignment alignment = new Alignment(sourceSegmentList, targetSegmentList); 67 | List alignmentList = Collections.singletonList(alignment); 68 | 69 | return alignmentList; 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/Modifier.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import net.loomchild.maligna.coretypes.Alignment; 7 | import net.loomchild.maligna.filter.Filter; 8 | import net.loomchild.maligna.filter.modifier.modify.ModifyAlgorithm; 9 | 10 | /** 11 | *

Represents a filter manipulating source or target segments in an 12 | * alignment list.

13 | *

The modification can be for example merging segments (merge package), 14 | * splitting segments (split package) or changing segment contents 15 | * (clean package).

16 | *

Applies separate algorithms ({@link ModifyAlgorithm}) to source 17 | * and target segments in each alignment on input list.

18 | * @author loomchild 19 | */ 20 | public class Modifier implements Filter { 21 | 22 | private ModifyAlgorithm sourceAlgorithm; 23 | 24 | private ModifyAlgorithm targetAlgorithm; 25 | 26 | /** 27 | * Creates modifies using two separate source and target segment 28 | * modification algorithms. 29 | * @param sourceAlgorithm source segment modification algorithm 30 | * @param targetAlgorithm target segment modification algorithm 31 | */ 32 | public Modifier(ModifyAlgorithm sourceAlgorithm, 33 | ModifyAlgorithm targetAlgorithm) { 34 | this.sourceAlgorithm = sourceAlgorithm; 35 | this.targetAlgorithm = targetAlgorithm; 36 | } 37 | 38 | /** 39 | * Iterates over input alignment list and applies source algorithm 40 | * to source segments and target algorithm to target segments to each 41 | * alignment. 42 | * @param alignmentList input alignment list 43 | * @return list containing alignments with modified segments 44 | */ 45 | public List apply(List alignmentList) { 46 | List newAlignmentList = new ArrayList(); 47 | for(Alignment alignment : alignmentList) { 48 | List sourceSegmentList = 49 | sourceAlgorithm.modify(alignment.getSourceSegmentList()); 50 | List targetSegmentList = 51 | targetAlgorithm.modify(alignment.getTargetSegmentList()); 52 | Alignment newAlignment = new Alignment(sourceSegmentList, 53 | targetSegmentList, alignment.getScore()); 54 | newAlignmentList.add(newAlignment); 55 | } 56 | return newAlignmentList; 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/aligner/align/AlignAlgorithmMockTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.aligner.align; 2 | 3 | import static net.loomchild.maligna.util.TestUtil.assertAlignmentEquals; 4 | import static org.junit.Assert.assertEquals; 5 | 6 | import java.util.ArrayList; 7 | import java.util.Collections; 8 | import 
java.util.List; 9 | 10 | import net.loomchild.maligna.coretypes.Alignment; 11 | 12 | import org.junit.Test; 13 | 14 | /** 15 | * Represents {@link AlignAlgorithmMock} unit test. 16 | * @author loomchild 17 | */ 18 | public class AlignAlgorithmMockTest { 19 | 20 | /** 21 | * Checks if aligning empty lists returns empty list. 22 | */ 23 | @Test 24 | public void alignEmpty() { 25 | AlignAlgorithm aligner = new AlignAlgorithmMock(2); 26 | List segmentList = Collections.emptyList(); 27 | List alignmentList = aligner.align(segmentList, segmentList); 28 | assertEquals(0, alignmentList.size()); 29 | } 30 | 31 | /** 32 | * Checks whether mock aligner works as described. 33 | */ 34 | @Test 35 | public void align() { 36 | AlignAlgorithm aligner = new AlignAlgorithmMock(2); 37 | String[][] sourceArray = new String[][]{ 38 | new String[]{"a", "b"}, new String[]{"c","d"}, 39 | new String[]{"e", "f"} 40 | }; 41 | String[][] targetArray = new String[][]{ 42 | new String[]{"1", "2"}, new String[]{"3"}, new String[]{} 43 | }; 44 | assert sourceArray.length == targetArray.length; 45 | int alignmentCount = sourceArray.length; 46 | List sourceList = combine(sourceArray); 47 | List targetList = combine(targetArray); 48 | List alignmentList = aligner.align(sourceList, targetList); 49 | assertEquals(alignmentCount, alignmentList.size()); 50 | for (int i = 0; i < alignmentCount; ++i) { 51 | assertAlignmentEquals(sourceArray[i], targetArray[i], 52 | alignmentList.get(i)); 53 | } 54 | } 55 | 56 | /** 57 | * Creates a list of strings containing all strings from input 58 | * two dimensional array. 
59 | * @param array array 60 | * @return list 61 | */ 62 | private List combine(String[][] array) { 63 | List list = new ArrayList(); 64 | for (String[] group : array) { 65 | for (String element : group) { 66 | list.add(element); 67 | } 68 | } 69 | return list; 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /maligna-ui/src/main/java/net/loomchild/maligna/ui/console/command/MacroCommand.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.ui.console.command; 2 | 3 | import java.util.List; 4 | 5 | import net.loomchild.maligna.coretypes.Alignment; 6 | import net.loomchild.maligna.filter.Filter; 7 | import net.loomchild.maligna.filter.macro.GaleAndChurchMacro; 8 | import net.loomchild.maligna.filter.macro.MooreMacro; 9 | import net.loomchild.maligna.filter.macro.PoissonMacro; 10 | import net.loomchild.maligna.filter.macro.TranslationMacro; 11 | import net.loomchild.maligna.filter.meta.FilterDecorators; 12 | import net.loomchild.maligna.formatter.AlFormatter; 13 | import net.loomchild.maligna.formatter.Formatter; 14 | import net.loomchild.maligna.parser.AlParser; 15 | import net.loomchild.maligna.parser.Parser; 16 | import net.loomchild.maligna.ui.console.command.exception.MissingParameterException; 17 | import net.loomchild.maligna.ui.console.command.exception.UnknownParameterException; 18 | import net.loomchild.maligna.filter.macro.PoissonTranslationMacro; 19 | 20 | import org.apache.commons.cli.CommandLine; 21 | import org.apache.commons.cli.Options; 22 | 23 | public class MacroCommand extends AbstractCommand { 24 | 25 | protected void initOptions(Options options) { 26 | options.addOption("c", "class", true, "Macro class. 
Valid values are: galechurch, moore, poisson, translation, poisson-translation."); 27 | } 28 | 29 | protected void run(CommandLine commandLine) { 30 | String cls = commandLine.getOptionValue('c'); 31 | if (cls == null) { 32 | throw new MissingParameterException("class"); 33 | } 34 | 35 | Filter filter; 36 | if (cls.equals("galechurch")) { 37 | filter = new GaleAndChurchMacro(); 38 | } else if (cls.equals("moore")) { 39 | filter = new MooreMacro(); 40 | } else if (cls.equals("poisson")) { 41 | filter = new PoissonMacro(); 42 | } else if (cls.equals("translation")) { 43 | filter = new TranslationMacro(); 44 | } else if (cls.equals("poisson-translation")) { 45 | filter = new PoissonTranslationMacro(); 46 | } else { 47 | throw new UnknownParameterException("class"); 48 | } 49 | 50 | filter = FilterDecorators.decorate(filter); 51 | 52 | Parser parser = new AlParser(getIn()); 53 | Formatter formatter = new AlFormatter(getOut()); 54 | List alignmentList = parser.parse(); 55 | alignmentList = filter.apply(alignmentList); 56 | formatter.format(alignmentList); 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/meta/IgnoreInfiniteProbabilityAlignmentsFilterDecoratorTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.meta; 2 | 3 | import static net.loomchild.maligna.util.TestUtil.assertAlignmentListEquals; 4 | import static net.loomchild.maligna.util.TestUtil.createAlignmentList; 5 | 6 | import java.util.List; 7 | 8 | import net.loomchild.maligna.coretypes.Alignment; 9 | import net.loomchild.maligna.filter.Filter; 10 | import net.loomchild.maligna.filter.aligner.Aligner; 11 | import net.loomchild.maligna.filter.aligner.align.onetoone.OneToOneAlgorithm; 12 | 13 | import org.junit.Test; 14 | 15 | /** 16 | * Represents {@link IgnoreInfiniteProbabilityAlignmentsFilterDecorator} 17 | * unit test. 
18 | * @author loomchild 19 | */ 20 | public class IgnoreInfiniteProbabilityAlignmentsFilterDecoratorTest { 21 | 22 | public static final String[][] SOURCE_SEGMENT_ARRAY = { 23 | new String[] {"A", "B"}, 24 | new String[] {"X", "Y"}, 25 | new String[] {"C", "D"}, 26 | }; 27 | 28 | public static final String[][] TARGET_SEGMENT_ARRAY = { 29 | new String[] {"1", "2"}, 30 | new String[] {"9", "8"}, 31 | new String[] {"3", "4"}, 32 | }; 33 | 34 | public static final String[][] EXPECTED_SOURCE_SEGMENT_ARRAY = { 35 | new String[] {"A"}, 36 | new String[] {"B"}, 37 | new String[] {"X", "Y"}, 38 | new String[] {"C"}, 39 | new String[] {"D"}, 40 | }; 41 | 42 | public static final String[][] EXPECTED_TARGET_SEGMENT_ARRAY = { 43 | new String[] {"1"}, 44 | new String[] {"2"}, 45 | new String[] {"9", "8"}, 46 | new String[] {"3"}, 47 | new String[] {"4"}, 48 | }; 49 | 50 | /** 51 | * Tests whether alignments with infinite score are ignored and 52 | * the ones without are not by using {@link OneToOneAlgorithm} aligner. 53 | */ 54 | @Test 55 | public void testIgnoreInfiniteProbability() { 56 | Filter oneToOneAligner = new Aligner(new OneToOneAlgorithm()); 57 | Filter filter = 58 | new IgnoreInfiniteProbabilityAlignmentsFilterDecorator(oneToOneAligner); 59 | 60 | List alignmentList = createAlignmentList( 61 | SOURCE_SEGMENT_ARRAY, TARGET_SEGMENT_ARRAY); 62 | 63 | // Mark middle alignment as fixed. 
64 | alignmentList.get(1).setScore(Float.NEGATIVE_INFINITY); 65 | 66 | List resultAlignmentList = filter.apply(alignmentList); 67 | assertAlignmentListEquals(EXPECTED_SOURCE_SEGMENT_ARRAY, 68 | EXPECTED_TARGET_SEGMENT_ARRAY, resultAlignmentList); 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/maven,eclipse,intellij,java 2 | 3 | ### Maven ### 4 | target/ 5 | pom.xml.tag 6 | pom.xml.releaseBackup 7 | pom.xml.versionsBackup 8 | pom.xml.next 9 | release.properties 10 | dependency-reduced-pom.xml 11 | buildNumber.properties 12 | .mvn/timing.properties 13 | 14 | 15 | ### Eclipse ### 16 | *.pydevproject 17 | .metadata 18 | .gradle 19 | bin/ 20 | tmp/ 21 | *.tmp 22 | *.bak 23 | *.swp 24 | *~.nib 25 | local.properties 26 | .settings/ 27 | .loadpath 28 | 29 | # Eclipse Core 30 | .project 31 | 32 | # External tool builders 33 | .externalToolBuilders/ 34 | 35 | # Locally stored "Eclipse launch configurations" 36 | *.launch 37 | 38 | # CDT-specific 39 | .cproject 40 | 41 | # JDT-specific (Eclipse Java Development Tools) 42 | .classpath 43 | 44 | # Java annotation processor (APT) 45 | .factorypath 46 | 47 | # PDT-specific 48 | .buildpath 49 | 50 | # sbteclipse plugin 51 | .target 52 | 53 | # TeXlipse plugin 54 | .texlipse 55 | 56 | 57 | ### Intellij ### 58 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio 59 | 60 | *.iml 61 | 62 | ## Directory-based project format: 63 | .idea/ 64 | # if you remove the above rule, at least ignore the following: 65 | 66 | # User-specific stuff: 67 | # .idea/workspace.xml 68 | # .idea/tasks.xml 69 | # .idea/dictionaries 70 | 71 | # Sensitive or high-churn files: 72 | # .idea/dataSources.ids 73 | # .idea/dataSources.xml 74 | # .idea/sqlDataSources.xml 75 | # .idea/dynamic.xml 76 | # .idea/uiDesigner.xml 77 | 
78 | # Gradle: 79 | # .idea/gradle.xml 80 | # .idea/libraries 81 | 82 | # Mongo Explorer plugin: 83 | # .idea/mongoSettings.xml 84 | 85 | ## File-based project format: 86 | *.ipr 87 | *.iws 88 | 89 | ## Plugin-specific files: 90 | 91 | # IntelliJ 92 | /out/ 93 | 94 | # mpeltonen/sbt-idea plugin 95 | .idea_modules/ 96 | 97 | # JIRA plugin 98 | atlassian-ide-plugin.xml 99 | 100 | # Crashlytics plugin (for Android Studio and IntelliJ) 101 | com_crashlytics_export_strings.xml 102 | crashlytics.properties 103 | crashlytics-build.properties 104 | 105 | 106 | ### Java ### 107 | *.class 108 | 109 | # Mobile Tools for Java (J2ME) 110 | .mtj.tmp/ 111 | 112 | # Package Files # 113 | *.jar 114 | *.war 115 | *.ear 116 | 117 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 118 | hs_err_pid* 119 | 120 | private/ 121 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/comparator/Diff.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.comparator; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import net.loomchild.maligna.coretypes.Alignment; 7 | 8 | 9 | /** 10 | * Represents the result of alignment compare. 11 | * Contains common alignment list and non matching, corresponding left and right 12 | * alignment list groups. Number of elements on leftList and rightList is equal. 
13 | * 14 | * @author Jarek Lipski (loomchild) 15 | */ 16 | public class Diff { 17 | 18 | private List commonList; 19 | 20 | private List> leftGroupList; 21 | 22 | private List> rightGroupList; 23 | 24 | private List leftList; 25 | 26 | private List rightList; 27 | 28 | public Diff(List commonList, List> leftGroupList, 29 | List> rightGroupList) { 30 | 31 | if (leftGroupList.size() != rightGroupList.size()) { 32 | throw new IllegalArgumentException( 33 | "Left and right list lengths must be equal."); 34 | } 35 | 36 | this.commonList = commonList; 37 | this.leftGroupList = leftGroupList; 38 | this.rightGroupList = rightGroupList; 39 | 40 | this.leftList = mergeGroups(leftGroupList); 41 | this.rightList = mergeGroups(rightGroupList); 42 | 43 | } 44 | 45 | private List mergeGroups(List> groupList) { 46 | List list = new ArrayList(); 47 | for (List alignmentList : groupList) { 48 | list.addAll(alignmentList); 49 | } 50 | return list; 51 | } 52 | 53 | /** 54 | * @return Returns list common to left and right alignments. 55 | */ 56 | public List getCommonList() { 57 | return commonList; 58 | } 59 | 60 | /** 61 | * @return Returns list of alignment groups occurring only in left list. 62 | */ 63 | public List> getLeftGroupList() { 64 | return leftGroupList; 65 | } 66 | 67 | /** 68 | * @return Returns list of alignment groups occurring only in right list. 69 | */ 70 | public List> getRightGroupList() { 71 | return rightGroupList; 72 | } 73 | 74 | /** 75 | * @return Returns list of alignments occurring only in left list. 76 | */ 77 | public List getLeftList() { 78 | return leftList; 79 | } 80 | 81 | /** 82 | * @return Returns list of alignments occurring only in right list. 
83 | */ 84 | public List getRightList() { 85 | return rightList; 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/selector/FractionSelector.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.selector; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import net.loomchild.maligna.coretypes.Alignment; 8 | import net.loomchild.maligna.filter.Filter; 9 | 10 | 11 | /** 12 | * Represents a filter that selects given fraction of most probable 13 | * alignments. 14 | * 15 | * @author Jarek Lipski (loomchild) 16 | */ 17 | public class FractionSelector implements Filter { 18 | 19 | private float fraction; 20 | 21 | /** 22 | * Creates filter. 23 | * @param fraction fraction that will be left after filtering, [0,1] 24 | */ 25 | public FractionSelector(float fraction) { 26 | assert fraction >= 0.0f && fraction <= 1.0f; 27 | this.fraction = fraction; 28 | } 29 | 30 | /** 31 | * Selects most probable alignments from input list and leaves only 32 | * given fraction of the best ones. For example if list has 100 alignments 33 | * and the fraction was set to 0.8, then the resulting list will have 34 | * 80 alignments with highest probability (lowest score). 35 | * Does not change alignments order. 36 | * Resulting list can have few more elements if they have equal score. 37 | * 38 | * TODO: why return more elements sometimes - maybe fix it so the number 39 | * is always correct, by keeping only calculated number of identical 40 | * elements? 
41 | * @param alignmentList input alignment list 42 | * @return filtered alignment list 43 | */ 44 | public List apply(List alignmentList) { 45 | float threshold = calculateThreshold(alignmentList); 46 | List filteredAlignmentList = new ArrayList(); 47 | for (Alignment alignment : alignmentList) { 48 | if (alignment.getScore() <= threshold) { 49 | filteredAlignmentList.add(alignment); 50 | } 51 | } 52 | return filteredAlignmentList; 53 | } 54 | 55 | private float calculateThreshold(List alignmentList) { 56 | float[] scoreArray = new float[alignmentList.size()]; 57 | int index = 0; 58 | for (Alignment alignment : alignmentList) { 59 | scoreArray[index] = alignment.getScore(); 60 | ++index; 61 | } 62 | Arrays.sort(scoreArray); 63 | float firstFiltered = fraction * (float)scoreArray.length - 0.5f; 64 | float threshold; 65 | if (firstFiltered < 0.0f) { 66 | threshold = Float.NEGATIVE_INFINITY; 67 | } else { 68 | threshold = scoreArray[(int)firstFiltered]; 69 | } 70 | return threshold; 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /maligna-ui/src/main/java/net/loomchild/maligna/ui/console/Maligna.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.ui.console; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.Collections; 6 | import java.util.List; 7 | 8 | import net.loomchild.maligna.ui.console.command.Command; 9 | import net.loomchild.maligna.ui.console.command.CommandFactory; 10 | import net.loomchild.maligna.util.Version; 11 | 12 | import org.apache.commons.logging.Log; 13 | import org.apache.commons.logging.LogFactory; 14 | 15 | public class Maligna { 16 | 17 | private static final Log log = LogFactory.getLog(Maligna.class); 18 | 19 | public static final String MAIN_COMMAND_NAME = "maligna"; 20 | 21 | 22 | public static void main(String[] args) { 23 | Maligna maligna = new Maligna(); 24 | 
maligna.run(args); 25 | } 26 | 27 | public static void printSignature() { 28 | String signature = "mALIGNa"; 29 | if (Version.getInstance().getVersion() != null) { 30 | signature += " " + Version.getInstance().getVersion(); 31 | } 32 | if (Version.getInstance().getDate() != null) { 33 | signature += ", " + Version.getInstance().getDate(); 34 | } 35 | signature += "."; 36 | System.out.println(signature); 37 | } 38 | 39 | private void run(String[] args) { 40 | try { 41 | if (args.length == 0) { 42 | printUsage(); 43 | } else { 44 | String commandName = args[0]; 45 | if (commandName.equals("-h") || commandName.equals("--help")) { 46 | printHelp(); 47 | } else { 48 | Command command = 49 | CommandFactory.getInstance().getCommand(commandName); 50 | if (command == null) { 51 | printUsage(); 52 | } else { 53 | String[] commandArgs = 54 | Arrays.copyOfRange(args, 1, args.length); 55 | command.run(commandArgs); 56 | } 57 | } 58 | } 59 | } catch (Exception e) { 60 | log.fatal("Unknown exception.", e); 61 | } 62 | } 63 | 64 | private void printUsage() { 65 | System.out.println("Unknown command. 
Use maligna -h for help."); 66 | } 67 | 68 | private void printHelp() { 69 | printSignature(); 70 | System.out.println("Syntax: "); 71 | System.out.println(" maligna [command options...]"); 72 | System.out.println("Available commands are: "); 73 | List commandNameList = 74 | new ArrayList(CommandFactory.getInstance().getCommandNameSet()); 75 | Collections.sort(commandNameList); 76 | System.out.println(" " + Arrays.toString(commandNameList.toArray())); 77 | System.out.println("To get help on specific command options use maligna -h."); 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/progress/WriterProgressObserver.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.progress; 2 | 3 | import java.io.IOException; 4 | import java.io.Writer; 5 | 6 | /** 7 | *

8 | * Represents progress observer that uses {@link Writer} to record the events. 9 | *

10 | * 11 | *

12 | * It needs to be registered via 13 | * {@link ProgressManager#registerProgressObserver(ProgressObserver)} method 14 | * before use. 15 | *

16 | * 17 | * @author loomchild 18 | */ 19 | public class WriterProgressObserver implements ProgressObserver { 20 | 21 | /** 22 | * Character that will be used to indicate the progress. 23 | */ 24 | public static final char PROGRESS_CHAR = '.'; 25 | 26 | private Writer writer; 27 | 28 | private int size; 29 | 30 | private int index; 31 | 32 | /** 33 | * Creates progress observer. 34 | * 35 | * @param writer writer to use to communicate events. 36 | * @param size size of progress bar in characters 37 | */ 38 | public WriterProgressObserver(Writer writer, int size) { 39 | this.writer = writer; 40 | this.size = size; 41 | reset(); 42 | } 43 | 44 | /** 45 | * Writes new character if necessary. 46 | */ 47 | @Override 48 | public void completeTask(ProgressMeter progressMeter) { 49 | int newIndex = (int)(size * progressMeter.getProgress()); 50 | if (newIndex > index) { 51 | updateIndex(newIndex); 52 | } 53 | } 54 | 55 | private void updateIndex(int newIndex) { 56 | assert newIndex > index && newIndex <= size; 57 | 58 | for (; index < newIndex; ++index) { 59 | write(PROGRESS_CHAR); 60 | } 61 | } 62 | 63 | /** 64 | * Writes task name. 65 | */ 66 | @Override 67 | public void registerProgressMeter(ProgressMeter progressMeter) { 68 | write(progressMeter.getName() + " [" + progressMeter.getTotalTasks() + " ops] "); 69 | } 70 | 71 | /** 72 | * Resets its index for the next task and writes EOL character. 
73 | */ 74 | @Override 75 | public void unregisterProgressMeter(ProgressMeter progressMeter) { 76 | reset(); 77 | write('\n'); 78 | } 79 | 80 | private void reset() { 81 | this.index = 0; 82 | } 83 | 84 | private void write(char character) { 85 | try { 86 | writer.write(character); 87 | writer.flush(); 88 | } catch (IOException e) { 89 | throw new RuntimeException(e); 90 | } 91 | } 92 | 93 | private void write(String string) { 94 | try { 95 | writer.write(string); 96 | writer.flush(); 97 | } catch (IOException e) { 98 | throw new RuntimeException(e); 99 | } 100 | } 101 | 102 | } 103 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/filter/modifier/modify/clean/UnifyRareWordsCleanAlgorithm.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.modifier.modify.clean; 2 | 3 | import static net.loomchild.maligna.model.vocabulary.VocabularyUtil.DEFAULT_TOKENIZE_ALGORITHM; 4 | 5 | import java.util.List; 6 | 7 | import net.loomchild.maligna.model.vocabulary.Vocabulary; 8 | import net.loomchild.maligna.filter.modifier.modify.split.SplitAlgorithm; 9 | import net.loomchild.maligna.model.vocabulary.VocabularyUtil; 10 | 11 | 12 | /** 13 | *

Represents clean algorithm changing all words in segments that are not 14 | * present in given vocabulary to given string (in other words replacing 15 | * all unknown words with predefined string).

16 | * 17 | *

To split segments into words uses given splitting algorithm or default, 18 | * simple one.

19 | * 20 | * @author loomchild 21 | */ 22 | public class UnifyRareWordsCleanAlgorithm extends CleanAlgorithm { 23 | 24 | public static final String DEFAULT_OTHER_WORD = "{OTHER}"; 25 | 26 | private Vocabulary vocabulary; 27 | 28 | private SplitAlgorithm splitAlgorithm; 29 | 30 | private String otherWord; 31 | 32 | /** 33 | * Creates algorithm. 34 | * @param vocabulary vocabulary containing known words 35 | * @param splitAlgorithm algorithm used to split segment into words 36 | * @param otherWord string that will be used to replace unknown words 37 | */ 38 | public UnifyRareWordsCleanAlgorithm(Vocabulary vocabulary, 39 | SplitAlgorithm splitAlgorithm, String otherWord) { 40 | this.vocabulary = vocabulary; 41 | this.splitAlgorithm = splitAlgorithm; 42 | this.otherWord = otherWord; 43 | } 44 | 45 | /** 46 | * Creates algorithm with default tokenize algorithm 47 | * ({@link VocabularyUtil#DEFAULT_TOKENIZE_ALGORITHM}) 48 | * and default unknown word replacement ({@link #DEFAULT_OTHER_WORD}). 49 | * @param vocabulary known words vocabulary 50 | */ 51 | public UnifyRareWordsCleanAlgorithm(Vocabulary vocabulary) { 52 | this(vocabulary, DEFAULT_TOKENIZE_ALGORITHM, DEFAULT_OTHER_WORD); 53 | } 54 | 55 | /** 56 | * Cleans a segment. Result contains all words separated by a single space. 
57 | * @param segment 58 | * @return cleaned segment 59 | */ 60 | public String clean(String segment) { 61 | List wordList = splitAlgorithm.split(segment); 62 | StringBuilder resultSegment = new StringBuilder(); 63 | for (String word : wordList) { 64 | if (resultSegment.length() > 0) { 65 | resultSegment.append(" "); 66 | } 67 | if (vocabulary.containsWord(word)) { 68 | resultSegment.append(word); 69 | } else { 70 | resultSegment.append(otherWord); 71 | } 72 | } 73 | return resultSegment.toString(); 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /maligna-ui/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | net.loomchild 6 | maligna-ui 7 | 3.0.2-SNAPSHOT 8 | jar 9 | 10 | maligna-ui 11 | 12 | 13 | UTF-8 14 | yyyy-MM-dd HH:mm 15 | true 16 | 17 | 18 | 19 | 20 | 21 | net.loomchild 22 | maligna 23 | ${project.version} 24 | 25 | 26 | 27 | net.loomchild 28 | maligna 29 | ${project.version} 30 | test-jar 31 | 32 | 33 | 34 | junit 35 | junit 36 | 4.13.1 37 | 38 | 39 | 40 | io.takari.junit 41 | takari-cpsuite 42 | 1.2.7 43 | 44 | 45 | 46 | commons-cli 47 | commons-cli 48 | 1.2 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | org.apache.maven.plugins 59 | maven-compiler-plugin 60 | 3.13.0 61 | 62 | ${maven.compiler.release} 63 | 64 | 65 | 66 | 67 | maven-assembly-plugin 68 | 3.3.0 69 | 70 | maligna-${project.version} 71 | false 72 | 73 | 0755 74 | 75 | 76 | src/main/assembly/bin.xml 77 | 78 | 79 | 80 | 81 | package 82 | 83 | single 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /maligna/src/main/java/net/loomchild/maligna/model/length/MutableLengthModel.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.model.length; 2 | 3 | import java.io.PrintWriter; 4 | import java.io.Writer; 5 | import 
java.util.ArrayList; 6 | 7 | /** 8 | * Represents length model that can be changed after it was created. 9 | * After the model has been populated {@link #normalize()} method must 10 | * be called to calculate the probabilities. 11 | * 12 | * @author loomchild 13 | */ 14 | public class MutableLengthModel implements LengthModel { 15 | 16 | private ArrayList lengthProbabilityArray; 17 | 18 | float meanLength; 19 | 20 | int totalLength; 21 | 22 | int lengthOccurenceCount; 23 | 24 | /** 25 | * Creates empty length model. 26 | */ 27 | public MutableLengthModel() { 28 | this.lengthProbabilityArray = new ArrayList(); 29 | this.meanLength = 0.0f; 30 | this.lengthOccurenceCount = 0; 31 | this.totalLength = 0; 32 | } 33 | 34 | public float getLengthProbability(int length) { 35 | assert length >= 0; 36 | if (length < lengthProbabilityArray.size()) { 37 | return lengthProbabilityArray.get(length); 38 | } else { 39 | return 0; 40 | } 41 | } 42 | 43 | public float getMeanLength() { 44 | return meanLength; 45 | } 46 | 47 | /** 48 | * Adds occurrence of segment length to the model. 49 | * @param length segment length 50 | */ 51 | public void addLengthOccurence(int length) { 52 | assert length >= 0; 53 | ensureSize(length + 1); 54 | lengthProbabilityArray.set(length, lengthProbabilityArray.get(length) + 1); 55 | ++lengthOccurenceCount; 56 | totalLength += length; 57 | } 58 | 59 | /** 60 | * Calculates the occurrence probabilities. This method should be called 61 | * after model has been populated. 62 | */ 63 | public void normalize() { 64 | for (int i = 0; i < lengthProbabilityArray.size(); ++i) { 65 | float probability = lengthProbabilityArray.get(i) / 66 | (float)lengthOccurenceCount; 67 | lengthProbabilityArray.set(i, probability); 68 | } 69 | meanLength = (float)totalLength / (float)lengthOccurenceCount; 70 | } 71 | 72 | /** 73 | * Ensures that length probability array has given size by expanding 74 | * it with zeros if required. 
75 | * @param size 76 | */ 77 | private void ensureSize(int size) { 78 | int currentSize = lengthProbabilityArray.size(); 79 | if (size > currentSize) { 80 | lengthProbabilityArray.ensureCapacity(size); 81 | for (int i = currentSize; i < size; ++i) { 82 | lengthProbabilityArray.add(0.0f); 83 | } 84 | } 85 | } 86 | 87 | public void format(Writer writer) { 88 | PrintWriter printWriter = new PrintWriter(writer, true); 89 | for (int i = 0; i < lengthProbabilityArray.size(); ++i) { 90 | printWriter.println(i + "\t" + lengthProbabilityArray.get(i)); 91 | } 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # ChangeLog 2 | 3 | ## version 0.0 2005-11 4 | 5 | * Project inception - first version based on Gale and Church article. 6 | 7 | ## version 0.9 2005-12-16 8 | 9 | * Functional version used by me in sentence alignment for Exprimo project; hard to use. 10 | 11 | ## version 1.0 2006-01-30 12 | 13 | * First public release to the users. A little clumsy text interface, added documentation and examples. 14 | 15 | ## version 1.9 2006-02 16 | 17 | * Conceptual preparation for second version. Initial work and integration with sentence splitter. 18 | 19 | ## version 2.0 2006-07 20 | 21 | * New system architecture enabling swapping of alignment algorithm. 22 | * Gale and Church re-implementation using the new architecture. 23 | * Added support for multiple parsers and formatters. Added TXT parser and TMX formatter. 24 | * Integration with segment SRX-rule based sentence splitter. 25 | 26 | ## version 2.1 2007-02 27 | 28 | * Created a tool for calculating precision and recall of alignment to test its quality. 29 | * TMX parser. 30 | 31 | ## version 2.2 2007-03 32 | 33 | * Improved architecture based solely on filters (like UNIX programs). 34 | * New text interface based on filters. 
## version 2.3 2008-04

* Official name changed to mALIGNa. Put project on sourceforge.net (around January).
* Removed UNKNOWN_WORD and UNKNOWN_WID, replacing them with the UnifyAligner concept.
* Created maligna.sh script gathering all maligna commands.

## version 2.4 2008-08

* Created Oracle calculator - calculator that uses human-supplied alignment suggestions.
* Created meta-calculators - Minimum, Maximum and Composite.
* Cleaned up text interface.
* Improved, official support for Windows - fixed .bat scripts, fixed presentation formatter.

## version 2.5 2009-04

* Segment SRX splitter library changed license to MIT and became part of mALIGNa distribution.
* Translated Readme and other documents to English based on Jimmy O'Regan's work. JavaDocs are still in Polish.
* Removed dependency on loomchild-util.

## version 2.6 2010-12

* English is the main maligna language - translated all JavaDocs and user documentation; improved documentation overall.
* API change: Moved Alignment and Category to coretypes package, changed Category role a little.
* Bug fixes for issues in ViterbiAlgorithm found by Anna Mündelein.
* Created IgnoreInfiniteProbabilityAlignmentsFilterDecorator. Now alignments with -INF score are preserved by all commands.

## version 2.7 2011-10

* Added progress meter to improve user interaction and as a basis for Okapi library integration.
65 | 66 | ## version 3.0 2015-09 67 | 68 | * Rename root package from net.sourceforge.align to net.loomchild.maligna 69 | * Migrate to Github 70 | * Migrate to Maven build 71 | * Publish maligna JAR on Maven Central 72 | 73 | -------------------------------------------------------------------------------- /maligna/src/test/java/net/loomchild/maligna/filter/meta/CompositeFilterTest.java: -------------------------------------------------------------------------------- 1 | package net.loomchild.maligna.filter.meta; 2 | 3 | import static net.loomchild.maligna.util.TestUtil.assertAlignmentListEquals; 4 | import static net.loomchild.maligna.util.TestUtil.createAlignmentList; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | import net.loomchild.maligna.coretypes.Alignment; 10 | import net.loomchild.maligna.filter.Filter; 11 | import net.loomchild.maligna.filter.aligner.Aligner; 12 | import net.loomchild.maligna.filter.aligner.align.AlignAlgorithm; 13 | import net.loomchild.maligna.filter.aligner.align.AlignAlgorithmMock; 14 | import net.loomchild.maligna.filter.modifier.Modifier; 15 | import net.loomchild.maligna.filter.modifier.modify.merge.MergeAlgorithm; 16 | import net.loomchild.maligna.filter.modifier.modify.merge.SeparatorMergeAlgorithm; 17 | import net.loomchild.maligna.filter.modifier.modify.split.SplitAlgorithm; 18 | import net.loomchild.maligna.filter.modifier.modify.split.SplitAlgorithmMock; 19 | 20 | import org.junit.Test; 21 | 22 | /** 23 | * Represents {@link CompositeFilter} unit test. 
* @author loomchild
*/
public class CompositeFilterTest {

    public static final String[][] SOURCE_SEGMENT_ARRAY = {
        new String[] {"abcdef"}
    };

    public static final String[][] TARGET_SEGMENT_ARRAY = {
        new String[] {"12345"}
    };

    public static final String[][] EXPECTED_SOURCE_SEGMENT_ARRAY = {
        new String[] {"ab"},
        new String[] {"cd"},
        new String[] {"ef"},
    };

    public static final String[][] EXPECTED_TARGET_SEGMENT_ARRAY = {
        new String[] {"12"},
        new String[] {"34"},
        new String[] {"5"},
    };

    /**
     * Creates a composite filter consisting of
     * {@link SplitAlgorithmMock} filter, {@link AlignAlgorithmMock} filter and
     * {@link SeparatorMergeAlgorithm}, applies the filter and checks
     * if the results are correct.
     */
    @Test
    public void testRunAllFilters() {
        SplitAlgorithm splitAlgorithm = new SplitAlgorithmMock(1);
        AlignAlgorithm alignAlgorithm = new AlignAlgorithmMock(2);
        MergeAlgorithm mergeAlgorithm = new SeparatorMergeAlgorithm();
        // Filters run in list order: split, align, then merge.
        List<Filter> filterList = new ArrayList<Filter>();
        filterList.add(new Modifier(splitAlgorithm, splitAlgorithm));
        filterList.add(new Aligner(alignAlgorithm));
        filterList.add(new Modifier(mergeAlgorithm, mergeAlgorithm));
        CompositeFilter composite = new CompositeFilter(filterList);
        List<Alignment> alignmentList = createAlignmentList(
                SOURCE_SEGMENT_ARRAY, TARGET_SEGMENT_ARRAY);
        List<Alignment> resultAlignmentList = composite.apply(alignmentList);
        assertAlignmentListEquals(EXPECTED_SOURCE_SEGMENT_ARRAY,
                EXPECTED_TARGET_SEGMENT_ARRAY, resultAlignmentList);
    }

}
--------------------------------------------------------------------------------
/maligna/src/main/java/net/loomchild/maligna/calculator/content/OracleCalculator.java:
--------------------------------------------------------------------------------
package net.loomchild.maligna.calculator.content;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import net.loomchild.maligna.coretypes.Alignment;
import net.loomchild.maligna.calculator.Calculator;

/**
 * Represents calculator using reference alignments to calculate score.
 * Returns successScore if measured alignment is present in reference
 * alignments, failureScore otherwise.
 *
 * @author loomchild
 */
public class OracleCalculator implements Calculator {

    public static final float DEFAULT_FAILURE_SCORE = Float.POSITIVE_INFINITY;

    public static final float DEFAULT_SUCCESS_SCORE = 0.0f;

    /** Private copy of the reference alignments, used for membership tests. */
    private final Set<Alignment> alignmentSet;

    private final float successScore;

    private final float failureScore;

    /**
     * Creates oracle calculator. The reference alignments are copied into
     * an internal hash set, so later changes to the passed collection do not
     * affect this calculator.
     *
     * @param alignmentCollection reference alignment
     * @param failureScore score returned when calculated alignment is not
     *         in the reference alignment
     * @param successScore score returned when calculated alignment is in
     *         the reference alignment
     */
    public OracleCalculator(Collection<Alignment> alignmentCollection,
            float failureScore, float successScore) {
        this.alignmentSet = new HashSet<Alignment>(alignmentCollection);
        this.failureScore = failureScore;
        this.successScore = successScore;
    }

    /**
     * Creates oracle calculator. Success score is equal to
     * {@link #DEFAULT_SUCCESS_SCORE}.
     *
     * @param alignmentCollection reference alignment
     * @param failureScore score returned when calculated alignment is not
     *         in the reference alignment
     */
    public OracleCalculator(Collection<Alignment> alignmentCollection,
            float failureScore) {
        this(alignmentCollection, failureScore, DEFAULT_SUCCESS_SCORE);
    }

    /**
     * Creates oracle calculator. Success score is equal to
     * {@link #DEFAULT_SUCCESS_SCORE} and failure score is equal to
     * {@link #DEFAULT_FAILURE_SCORE}.
     *
     * @param alignmentCollection reference alignment
     */
    public OracleCalculator(Collection<Alignment> alignmentCollection) {
        this(alignmentCollection, DEFAULT_FAILURE_SCORE, DEFAULT_SUCCESS_SCORE);
    }

    /**
     * Returns successScore if measured alignment is present in reference
     * alignments, failureScore otherwise. An alignment is built from the
     * given segment lists and looked up in the reference set, so the result
     * depends on the equals / hashCode implementation of {@link Alignment}.
     *
     * @param sourceSegmentList source segments of the measured alignment
     * @param targetSegmentList target segments of the measured alignment
     * @return successScore or failureScore
     */
    public float calculateScore(List<String> sourceSegmentList,
            List<String> targetSegmentList) {
        Alignment alignment = new Alignment(sourceSegmentList, targetSegmentList);
        if (alignmentSet.contains(alignment)) {
            return successScore;
        } else {
            return failureScore;
        }
    }

}

--------------------------------------------------------------------------------
/maligna/src/main/java/net/loomchild/maligna/calculator/length/LengthCalculator.java:
--------------------------------------------------------------------------------
package net.loomchild.maligna.calculator.length;

import java.util.ArrayList;
import java.util.List;

import net.loomchild.maligna.calculator.Calculator;
import net.loomchild.maligna.calculator.length.counter.Counter;


/**
 * Represents calculator that computes alignment probability based only on
 * segment length. Implements part of the {@link Calculator} functionality
 * and provides utility functions to inheriting concrete length-based
 * calculators.
 *
 * @author loomchild
 */
public abstract class LengthCalculator implements Calculator {

    private final Counter counter;

    /**
     * Creates a calculator.
     * @param counter segment length counter (for example character count,
     *         word count)
     */
    public LengthCalculator(Counter counter) {
        this.counter = counter;
    }

    /**
     * Calculates alignment score first by computing lengths of all the segments
     * and later passing the results and control to the subclasses to do
     * the actual score calculation.
     *
     * @param sourceSegmentList source segments
     * @param targetSegmentList target segments
     * @return score computed by {@link #calculateLengthScore(List, List)}
     */
    public float calculateScore(List<String> sourceSegmentList,
            List<String> targetSegmentList) {
        List<Integer> sourceLengthList = calculateLengthList(sourceSegmentList);
        List<Integer> targetLengthList = calculateLengthList(targetSegmentList);
        return calculateLengthScore(sourceLengthList, targetLengthList);
    }

    /**
     * Calculates and returns lengths of subsequent segments. Segments for
     * which the counter reports a length that is not positive are skipped,
     * so the result may be shorter than the input list.
     *
     * @param segmentList segment list
     * @return list of positive lengths of the given segments, in input order
     */
    protected List<Integer> calculateLengthList(List<String> segmentList) {
        List<Integer> lengthList = new ArrayList<Integer>();
        for (String segment : segmentList) {
            int length = counter.calculateLength(segment);
            // Only positive lengths are passed on to the subclasses.
            if (length > 0) {
                lengthList.add(length);
            }
        }
        return lengthList;
    }

    /**
     * Utility function to calculate total length of the segments.
     * Returns sum of the lengths on the input list. Used by subclasses.
     *
     * @param lengthList list containing lengths
     * @return sum of lengths on the list, 0 for an empty list
     */
    protected int calculateTotalLength(List<Integer> lengthList) {
        int totalLength = 0;
        for (int length : lengthList) {
            totalLength += length;
        }
        return totalLength;
    }

    /**
     * Abstract method implemented by subclasses to compute the actual score.
     *
     * @param sourceLengthList lengths of source segments
     * @param targetLengthList lengths of target segments
     * @return source to target segments alignment score, >= 0
     */
    protected abstract float calculateLengthScore(List<Integer> sourceLengthList,
            List<Integer> targetLengthList);

}

--------------------------------------------------------------------------------
/maligna/src/main/java/net/loomchild/maligna/util/bind/MarshallerUnmarshaller.java:
--------------------------------------------------------------------------------
package net.loomchild.maligna.util.bind;

import static net.loomchild.maligna.util.Util.getResourceStream;

import java.io.InputStream;
import java.io.Reader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;

import javax.xml.XMLConstants;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Marshaller;
import javax.xml.bind.Unmarshaller;
import javax.xml.transform.Source;
import javax.xml.transform.stream.StreamSource;
import javax.xml.validation.Schema;
import javax.xml.validation.SchemaFactory;

import net.loomchild.maligna.util.ResourceNotFoundException;

import org.xml.sax.SAXException;

/**
 * Represents configurable, validating XML marshaller / unmarshaller.
 * Implemented using JAXB technology.
 *
 * NOTE(review): a single JAXB Marshaller and Unmarshaller instance is shared
 * by all calls; confirm callers do not use one object from several threads.
 *
 * @author loomchild
 */
public class MarshallerUnmarshaller {

    private Marshaller marshaller;

    private Unmarshaller unmarshaller;

    /**
     * Creates marshaller / unmarshaller validating against the given schemas.
     * Schema resources are opened with {@code getResourceStream}, compiled
     * into one {@link Schema} and closed again before the constructor
     * returns, whether or not construction succeeded.
     *
     * @param context context path passed to
     *         {@link JAXBContext#newInstance(String)}
     * @param schemaNameArray names of the XML Schema resources to validate
     *         against
     * @throws BindException when a JAXB error occurs, a schema cannot be
     *         parsed or a schema resource cannot be found
     */
    public MarshallerUnmarshaller(String context,
            String[] schemaNameArray) {
        // Every opened stream is tracked so that it can be closed even when
        // a later schema is missing or schema compilation fails.
        List<InputStream> schemaStreamList = new ArrayList<InputStream>();
        try {
            List<Source> sourceList = new ArrayList<Source>();
            for (String schemaName : schemaNameArray) {
                InputStream schemaStream = getResourceStream(schemaName);
                schemaStreamList.add(schemaStream);
                sourceList.add(new StreamSource(schemaStream));
            }
            SchemaFactory schemaFactory = SchemaFactory.newInstance(
                    XMLConstants.W3C_XML_SCHEMA_NS_URI);
            Schema schema = schemaFactory.newSchema(
                    sourceList.toArray(new Source[sourceList.size()]));
            JAXBContext jaxbcontext = JAXBContext.newInstance(context);
            unmarshaller = jaxbcontext.createUnmarshaller();
            unmarshaller.setSchema(schema);
            unmarshaller.setEventHandler(new QuietValidationEventHandler());
            marshaller = jaxbcontext.createMarshaller();
            marshaller.setSchema(schema);
            marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, true);
        } catch (JAXBException e) {
            throw new BindException("JAXB error", e);
        } catch (SAXException e) {
            throw new BindException("Error parsing XML Schema", e);
        } catch (ResourceNotFoundException e) {
            throw new BindException("Resource not found", e);
        } finally {
            // The schema sources have been parsed by now (or construction
            // failed), so the underlying streams are no longer needed.
            for (InputStream schemaStream : schemaStreamList) {
                if (schemaStream != null) {
                    try {
                        schemaStream.close();
                    } catch (java.io.IOException ignored) {
                        // Best effort: a failure to close an already consumed
                        // stream must not mask the real outcome.
                    }
                }
            }
        }
    }

    /**
     * Creates marshaller / unmarshaller validating against a single schema.
     *
     * @param context context path passed to
     *         {@link JAXBContext#newInstance(String)}
     * @param schemaName name of the XML Schema resource to validate against
     * @throws BindException when a JAXB error occurs, the schema cannot be
     *         parsed or the schema resource cannot be found
     */
    public MarshallerUnmarshaller(String context, String schemaName) {
        this(context, new String[] {schemaName});
    }

    /**
     * Marshals the object as formatted XML to the given writer.
     *
     * @param object object to be marshalled
     * @param writer destination of the XML output
     * @throws BindException when JAXB marshalling fails
     */
    public void marshal(Object object, Writer writer) {
        try {
            marshaller.marshal(object, writer);
        } catch (JAXBException e) {
            throw new BindException("JAXB marshalling error", e);
        }
    }

    /**
     * Unmarshals an object from XML read from the given reader.
     *
     * @param reader source of the XML input
     * @return unmarshalled object
     * @throws BindException when JAXB unmarshalling fails
     */
    public Object unmarshal(Reader reader) {
        try {
            return unmarshaller.unmarshal(reader);
        } catch (JAXBException e) {
            throw new BindException("JAXB unmarshalling error", e);
        }
    }


}

--------------------------------------------------------------------------------