├── .editorconfig
├── .gitignore
├── CREDITS
├── LICENSE.txt
├── README.md
├── RELEASE-HOWTO.md
├── changelog.txt
├── collatex-core
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── eu
│ │ │ └── interedition
│ │ │ └── collatex
│ │ │ ├── CollationAlgorithm.java
│ │ │ ├── CollationAlgorithmFactory.java
│ │ │ ├── Token.java
│ │ │ ├── VariantGraph.java
│ │ │ ├── Witness.java
│ │ │ ├── dekker
│ │ │ ├── DekkerAlgorithm.kt
│ │ │ ├── InspectableCollationAlgorithm.java
│ │ │ ├── Match.kt
│ │ │ ├── PhraseMatchDetector.kt
│ │ │ ├── TranspositionDetector.kt
│ │ │ ├── Tuple.java
│ │ │ ├── astar
│ │ │ │ ├── AstarAlgorithm.java
│ │ │ │ └── Cost.java
│ │ │ ├── editgraphaligner
│ │ │ │ ├── EditGraphAligner.java
│ │ │ │ └── MatchCube.java
│ │ │ ├── island
│ │ │ │ ├── Archipelago.java
│ │ │ │ ├── Coordinate.java
│ │ │ │ ├── Island.kt
│ │ │ │ ├── IslandCollection.java
│ │ │ │ ├── IslandCompetition.java
│ │ │ │ ├── IslandConflictResolver.java
│ │ │ │ ├── IslandDepthAndSizeComparator.java
│ │ │ │ ├── IslandSelection.java
│ │ │ │ ├── IslandSizeComparator.java
│ │ │ │ └── package-info.java
│ │ │ ├── legacy
│ │ │ │ ├── IslandPositionComparator.java
│ │ │ │ ├── MatchTable.java
│ │ │ │ ├── MatchTableImpl.java
│ │ │ │ ├── MatchTableLinker.java
│ │ │ │ ├── MatchTableSelection.java
│ │ │ │ └── MatchTableSerializer.java
│ │ │ ├── package-info.java
│ │ │ └── token_index
│ │ │ │ ├── Block.kt
│ │ │ │ ├── TokenIndex.kt
│ │ │ │ └── TokenIndexToMatches.kt
│ │ │ ├── matching
│ │ │ ├── EditDistance.java
│ │ │ ├── EditDistanceTokenComparator.java
│ │ │ ├── EqualityTokenComparator.java
│ │ │ ├── Matches.java
│ │ │ ├── StrictEqualityTokenComparator.java
│ │ │ └── package-info.java
│ │ │ ├── medite
│ │ │ ├── AlignmentDecisionGraph.java
│ │ │ ├── Matches.java
│ │ │ ├── MediteAlgorithm.java
│ │ │ ├── SuffixTree.java
│ │ │ └── package-info.java
│ │ │ ├── needlemanwunsch
│ │ │ ├── NeedlemanWunschAlgorithm.java
│ │ │ ├── NeedlemanWunschScorer.java
│ │ │ └── package-info.java
│ │ │ ├── package-info.java
│ │ │ ├── simple
│ │ │ ├── SimpleCollation.java
│ │ │ ├── SimplePatternTokenizer.java
│ │ │ ├── SimpleToken.java
│ │ │ ├── SimpleTokenNormalizers.java
│ │ │ ├── SimpleVariantGraphSerializer.java
│ │ │ ├── SimpleWitness.java
│ │ │ ├── SimpleWitnessTeiBuilder.java
│ │ │ └── package-info.java
│ │ │ ├── suffixarray
│ │ │ ├── Algorithm.java
│ │ │ ├── BPR.java
│ │ │ ├── CharSequenceAdapter.java
│ │ │ ├── DeepShallow.java
│ │ │ ├── DensePositiveDecorator.java
│ │ │ ├── DensePositiveMapper.java
│ │ │ ├── DivSufSort.java
│ │ │ ├── ExtraTrailingCellsDecorator.java
│ │ │ ├── GenericArrayAdapter.java
│ │ │ ├── ISuffixArrayBuilder.java
│ │ │ ├── ISymbolMapper.java
│ │ │ ├── MinMax.java
│ │ │ ├── QSufSort.java
│ │ │ ├── SAIS.java
│ │ │ ├── Skew.java
│ │ │ ├── SuffixArrays.java
│ │ │ ├── SuffixData.java
│ │ │ ├── Tools.java
│ │ │ └── Traversals.java
│ │ │ ├── suffixtree
│ │ │ ├── ActivePoint.java
│ │ │ ├── Cursor.java
│ │ │ ├── Edge.java
│ │ │ ├── Node.java
│ │ │ ├── Sequence.java
│ │ │ ├── SequenceTerminal.java
│ │ │ ├── Suffix.java
│ │ │ ├── SuffixTree.java
│ │ │ └── Utils.java
│ │ │ └── util
│ │ │ ├── GreedyStringTilingAlgorithm.java
│ │ │ ├── ParallelSegmentationApparatus.java
│ │ │ ├── StreamUtil.java
│ │ │ ├── VariantGraphRanking.java
│ │ │ ├── VariantGraphTraversal.kt
│ │ │ ├── VertexMatch.java
│ │ │ └── package-info.java
│ └── javadoc
│ │ └── overview.html
│ └── test
│ └── java
│ └── eu
│ └── interedition
│ └── collatex
│ ├── AbstractTest.java
│ ├── ScriptEngineTest.java
│ ├── VariantGraphTest.java
│ ├── dekker
│ ├── AlignmentTest.java
│ ├── BeckettTest.java
│ ├── DarwinTest.java
│ ├── DekkerAlgorithmTest.java
│ ├── SpencerHoweTest.java
│ ├── TranspositionGraphTest.java
│ ├── TranspositionRenderingTest.java
│ ├── VariantGraphRankerTest.java
│ ├── VariantGraphTest.java
│ ├── editgraphaligner
│ │ └── EditGraphMultiWitnessAlignerTest.java
│ ├── island
│ │ └── IslandTest.java
│ ├── legacy
│ │ ├── HermansTest.java
│ │ ├── IslandConflictResolverTest.java
│ │ ├── MatchTableLinkerTest.java
│ │ └── MatchTableTest.java
│ └── token_index
│ │ ├── TokenComparatorTest.java
│ │ ├── TokenIndexTest.java
│ │ └── VariantGraphMatcher.java
│ ├── lab
│ ├── CollateXLaboratory.java
│ ├── MatchMatrixCellStatus.java
│ ├── MatchMatrixTableModel.java
│ ├── MatchTableCell.java
│ └── WitnessPanel.java
│ ├── matching
│ ├── MatchesTest.java
│ └── NearMatcherTest.java
│ ├── medite
│ ├── MediteTest.java
│ └── SuffixTreeTest.java
│ ├── needlemanwunsch
│ └── NeedlemanWunschTest.java
│ ├── output
│ └── AlignmentTableTest.java
│ └── simple
│ ├── SimpleWitnessTeiBuilderTest.java
│ └── SimpleWitnessTest.java
├── collatex-pythonport
├── .gitignore
├── .pydevproject
├── AUTHORS.rst
├── CONTRIBUTING.rst
├── ClusterShell
│ ├── RangeSet.py
│ └── __init__.py
├── HISTORY.rst
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── RELEASE.txt
├── TODO.txt
├── collatex
│ ├── HTML.py
│ ├── __init__.py
│ ├── astar.py
│ ├── block.py
│ ├── core_classes.py
│ ├── core_functions.py
│ ├── display_module.py
│ ├── edit_graph_aligner.py
│ ├── exceptions.py
│ ├── experimental_astar_aligner.py
│ ├── extended_suffix_array.py
│ ├── linsuffarr.py
│ ├── near_matching.py
│ ├── suffix_based_scorer.py
│ ├── tokenindex.py
│ └── transposition_handling.py
├── docs
│ ├── Makefile
│ ├── authors.rst
│ ├── conf.py
│ ├── contributing.rst
│ ├── history.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── make.bat
│ ├── readme.rst
│ └── usage.rst
├── installing_pygraphviz.txt
├── near_matching_analysis.ipynb
├── old_readme
├── requirements.txt
├── setup.cfg
├── setup.py
├── tests
│ ├── __init__.py
│ ├── test_alignment.py
│ ├── test_alignment_table_rendering.py
│ ├── test_astar_edit_graph_aligner.py
│ ├── test_beckett.py
│ ├── test_collatex.py
│ ├── test_collatex_block_witnesses.py
│ ├── test_collatex_dekker_algorithm.py
│ ├── test_collatex_linsuffarr.py
│ ├── test_collatex_simple.py
│ ├── test_decision_tree_aligner.py
│ ├── test_edit_graph_aligner.py
│ ├── test_export_alignment_table_as_tei.py
│ ├── test_export_alignment_table_as_xml.py
│ ├── test_near_matching.py
│ ├── test_near_matching_pretokenized.py
│ ├── test_suffix_based_scorer.py
│ ├── test_suffix_edit_graph_aligner.py
│ ├── test_tokenindex.py
│ ├── test_tokenized_json.py
│ ├── test_variant_graph.py
│ └── test_witness_tokens.py
├── tox.ini
└── use_cases
│ ├── darwin.py
│ └── darwin_chapter1_para1.json
├── collatex-servlet
├── .gitignore
├── pom.xml
└── src
│ └── main
│ ├── java
│ └── eu
│ │ └── interedition
│ │ └── collatex
│ │ ├── http
│ │ ├── CollateApplication.java
│ │ └── CollateResource.java
│ │ └── io
│ │ ├── IOExceptionMapper.java
│ │ ├── SimpleCollationJSONMessageBodyReader.java
│ │ ├── VariantGraphDotMessageBodyWriter.java
│ │ ├── VariantGraphGraphMLMessageBodyWriter.java
│ │ ├── VariantGraphJSONMessageBodyWriter.java
│ │ ├── VariantGraphSVGMessageBodyWriter.java
│ │ └── VariantGraphTEIMessageBodyWriter.java
│ ├── resources
│ └── static
│ │ ├── collatex-console.js
│ │ ├── collatex.css
│ │ ├── collatex.js
│ │ ├── collatex.png
│ │ ├── darwin.html
│ │ ├── favicon.ico
│ │ ├── index.html
│ │ ├── yui-3.8.1-head.html
│ │ ├── yui-3.8.1-modules.css
│ │ └── yui-3.8.1-modules.js
│ └── webapp
│ └── WEB-INF
│ └── web.xml
├── collatex-tools
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── eu
│ │ │ └── interedition
│ │ │ └── collatex
│ │ │ └── tools
│ │ │ ├── CollateX.java
│ │ │ ├── CollationPipe.java
│ │ │ ├── CollationServer.java
│ │ │ ├── JsonProcessor.java
│ │ │ └── PluginScript.java
│ └── resources
│ │ └── static
│ │ ├── collatex-console.js
│ │ ├── collatex.css
│ │ ├── collatex.js
│ │ ├── collatex.png
│ │ ├── darwin.html
│ │ ├── favicon.ico
│ │ ├── index.html
│ │ ├── yui-3.8.1-head.html
│ │ ├── yui-3.8.1-modules.css
│ │ └── yui-3.8.1-modules.js
│ └── test
│ ├── fixtures
│ ├── base-2w.json
│ └── base-4w.json
│ └── java
│ └── eu
│ └── interedition
│ └── collatex
│ └── tools
│ └── CollectionPipeTest.java
├── docs
├── _layouts
│ └── default.html
├── images
│ ├── html2_output.png
│ ├── svg_output.png
│ └── svg_simple_output.png
├── pythonport.md
└── style.css
├── logging.properties
├── pom.xml
├── site
├── Gruntfile.js
├── collatex.js
├── collatex.less
├── google-code-prettify
│ ├── prettify-sunburst-theme.less
│ ├── prettify.js
│ └── prettify.less
├── grunt
│ ├── aliases.yaml
│ ├── browserify.js
│ ├── clean.js
│ ├── connect.js
│ ├── cssmin.js
│ ├── jade.js
│ ├── less.js
│ ├── uglify.js
│ └── watch.js
├── htdocs
│ ├── about
│ │ └── index.html
│ ├── collatex.css
│ ├── collatex.js
│ ├── doc
│ │ └── index.html
│ ├── download
│ │ └── index.html
│ ├── favicon.ico
│ ├── images
│ │ ├── aligner.png
│ │ ├── analyzer.png
│ │ ├── tokenizer.png
│ │ ├── variant-graph-collatex.png
│ │ ├── variant-graph-schmidt.png
│ │ └── variant-graph-snippet.png
│ └── index.html
├── package.json
└── templates
│ ├── about
│ └── index.jade
│ ├── doc
│ ├── index.jade
│ └── usage-instructions.txt
│ ├── download
│ └── index.jade
│ ├── index.jade
│ └── page.jade
└── tmp
└── .gitignore
/.editorconfig:
--------------------------------------------------------------------------------
1 | # http://editorconfig.org/
2 |
3 | root = true
4 |
5 | [*]
6 | charset = utf-8
7 | end_of_line = lf
8 | trim_trailing_whitespace = true
9 | insert_final_newline = false
10 | indent_style = space
11 | indent_size = 4
12 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .idea
3 | target
4 | *.iml
5 | *.ipr
6 | *.iws
7 | *.ipynb
8 | *.egg-info
9 | Digraph.gv*
10 | collatex-pythonport/build/*
11 | collatex-pythonport/dist/*
12 |
13 | .classpath
14 | .eggs
15 | .project
16 | .settings/
17 | bin/
18 | site/vendor
19 | node_modules
20 | /.pydevproject
21 |
--------------------------------------------------------------------------------
/CREDITS:
--------------------------------------------------------------------------------
1 | This software includes code from the Shingle Cloud Library,
2 | Copyright (C) 2009 Arno Mittelbach, Lasse Lehmann. The Shingle Cloud Library
3 | is free software and redistributed under the terms of the GNU Lesser General
4 | Public License.
5 |
6 | This software includes code from JSuffixArrays, a project which provides the
7 | implementation of a number of algorithms for computing Suffix Arrays over
8 | sequences of symbols of arbitrary size (not only strings).
9 | Copyright (C) 2014 Carrot Search s.c. JSuffixArrays is licensed under the
10 | Apache Software License, Version 2.0.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [CollateX](http://collatex.net/) is software designed to
2 |
3 | 1. read **multiple (≥ 2) versions of a text**, splitting each version into parts (tokens) to be compared,
4 | 1. **identify similarities of and differences between the versions** (including moved/transposed segments) by aligning tokens, and
5 | 1. output the alignment results in a **variety of formats for further processing**, for instance
6 | 1. to support **the production of a critical apparatus** or the stemmatical analysis of a text's genesis.
7 |
8 | It resembles software used to compute differences between files (e.g. [diff](http://en.wikipedia.org/wiki/Diff)) or tools for [sequence alignment](http://en.wikipedia.org/wiki/Sequence_alignment) which are commonly used in Bioinformatics. While CollateX shares some of the techniques and algorithms with those tools, it mainly aims for a flexible and configurable approach to the problem of finding similarities and differences in texts, sometimes trading computational soundness or complexity for the user's ability to influence results.
9 |
10 | As such it is primarily designed for use cases in disciplines like [Philology](http://en.wikipedia.org/wiki/Philology) or – more specifically – the field of [Textual Criticism](http://en.wikipedia.org/wiki/Textual_criticism) where the assessment of findings is based on interpretation and therefore can be supported by computational means but is not necessarily computable.
11 |
12 | Please go to <http://collatex.net/> for further information.
13 |
--------------------------------------------------------------------------------
/RELEASE-HOWTO.md:
--------------------------------------------------------------------------------
1 | ## Setup GPG
2 |
3 | Releasing artifacts to Maven Central requires signing them. Make sure gpg is set up:
4 |
5 | [http://central.sonatype.org/pages/working-with-pgp-signatures.html](http://central.sonatype.org/pages/working-with-pgp-signatures.html).
6 |
7 | ## Setup Maven
8 |
9 | In `$HOME/.m2/settings.xml`, add credentials for accessing
10 | [Sonatype's OSS Repository](https://oss.sonatype.org/):
11 |
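12 |     <settings>
13 |       <servers>
14 |         <server>
15 |           <id>ossrh-interedition</id>
16 |           <username>interedition</username>
17 |           <password>...</password>
18 |         </server>
19 |       </servers>
20 |     </settings>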
21 |
22 | ## Update changelog
23 |
24 | Edit `changelog.txt`.
25 |
26 | ## Update POMs and site to reflect new release version
27 |
28 | mvn versions:set -DnewVersion=1.2.3
29 |
30 | Edit `site/grunt/jade.js`, update `dist.options.data.version` and regenerate site via
31 |
32 | grunt deploy
33 |
34 | from the `site/` directory.
35 |
36 | ## Deploy artifacts to staging area
37 |
38 | Activate the `release` profile in order to attach Javadocs and sources to build artifacts.
39 |
40 | mvn clean deploy -P release
41 |
42 | ## Commit and push released version to VCS and tag release
43 |
44 | Push to GitHub and create a release.
45 |
46 | ## Release successfully staged artifacts
47 |
48 | mvn nexus-staging:release
49 |
50 | ## Start new development cycle by updating to SNAPSHOT version
51 |
52 | mvn versions:set -DnewVersion=1.2.4-SNAPSHOT
53 |
--------------------------------------------------------------------------------
/collatex-core/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 |
5 | eu.interedition
6 | collatex
7 | 1.8-SNAPSHOT
8 |
9 | collatex-core
10 | 1.8-SNAPSHOT
11 | CollateX Core
12 | A Java library for collating textual sources, for example, to produce an apparatus.
13 |
14 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithmFactory.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex;
21 |
22 | import eu.interedition.collatex.dekker.DekkerAlgorithm;
23 | import eu.interedition.collatex.dekker.editgraphaligner.EditGraphAligner;
24 | import eu.interedition.collatex.medite.MediteAlgorithm;
25 | import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm;
26 | import eu.interedition.collatex.util.GreedyStringTilingAlgorithm;
27 | import eu.interedition.collatex.util.VertexMatch;
28 |
29 | import java.util.Comparator;
30 | import java.util.SortedSet;
31 | import java.util.function.Function;
32 |
33 | /**
34 | * @author Gregor Middell
35 | * @author Ronald Haentjens Dekker
36 | */
37 | public class CollationAlgorithmFactory {
38 |
39 | /* TODO: THIS SHOULD BE THE DEFAULT! */
40 | public static CollationAlgorithm dekker(Comparator comparator) {
41 | return new DekkerAlgorithm(comparator);
42 | }
43 |
44 | /* TODO: THIS SHOULD NOT BE THE DEFAULT! */
45 | // THIS WAS INSPIRED BY THE PYTHON VERSION OF COLLATEX
46 | // HOWEVER IT DOES NOT HAVE TRANSPOSITION DETECTION!
47 | public static CollationAlgorithm editGraphDekker(Comparator comparator) {
48 | return new EditGraphAligner(comparator);
49 | }
50 |
51 | public static CollationAlgorithm needlemanWunsch(Comparator comparator) {
52 | return new NeedlemanWunschAlgorithm(comparator);
53 | }
54 |
55 | public static CollationAlgorithm greedyStringTiling(Comparator comparator, int minimumTileLength) {
56 | return new GreedyStringTilingAlgorithm(comparator, minimumTileLength);
57 | }
58 |
59 | public static CollationAlgorithm medite(Comparator comparator, Function, Integer> matchEvaluator) {
60 | return new MediteAlgorithm(comparator, matchEvaluator);
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/Token.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex;
21 |
22 |
23 | /**
24 | * The normalized version of the token.
25 | */
26 | public interface Token {
27 | Witness getWitness();
28 | }
29 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/Witness.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex;
21 |
22 | import java.util.Comparator;
23 |
24 | /**
25 | * IWitness
26 | *
27 | * Representation of a single textual witness
28 | */
29 | public interface Witness {
30 |
31 | String getSigil();
32 |
33 | Comparator SIGIL_COMPARATOR = Comparator.comparing(Witness::getSigil);
34 | }
35 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/InspectableCollationAlgorithm.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.dekker;
2 |
3 | import eu.interedition.collatex.CollationAlgorithm;
4 |
5 | import java.util.List;
6 |
7 | /**
8 | * Created by ronalddekker on 08/10/15.
9 | */
10 | public interface InspectableCollationAlgorithm extends CollationAlgorithm {
11 | List> getPhraseMatches();
12 |
13 | List> getTranspositions();
14 |
15 | /*
16 | * This check disables transposition rendering in the variant
17 | * graph when the variant graph contains more then two witnesses.
18 | * Transposition detection is done in a progressive manner
19 | * (witness by witness). When viewing the resulting graph
20 | * containing the variation for all witnesses
21 | * the detected transpositions can look strange, since segments
22 | * may have split into smaller or larger parts.
23 | */
24 | void setMergeTranspositions(boolean b);
25 | }
26 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/Match.kt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 | package eu.interedition.collatex.dekker
20 |
21 | import eu.interedition.collatex.Token
22 | import eu.interedition.collatex.VariantGraph
23 | import java.util.*
24 | import java.util.function.Function
25 | import java.util.function.Predicate
26 | import java.util.stream.Collectors
27 |
/**
 * Pairs a variant graph vertex with a (possibly absent) token.
 *
 * Two matches are equal when both their vertices and their tokens are equal.
 *
 * @author [Gregor Middell](http://gregor.middell.net/)
 */
class Match(val vertex: VariantGraph.Vertex, val token: Token?) {

    override fun hashCode(): Int = Objects.hash(vertex, token)

    override fun equals(obj: Any?): Boolean {
        // For null or a non-Match argument the former identity fallback could
        // never succeed, so returning false directly is equivalent.
        if (obj !is Match) return false
        return vertex == obj.vertex && token == obj.token
    }

    override fun toString(): String = "{$vertex; $token}"

    companion object {
        /**
         * Pairs the vertices and tokens positionally into matches. Pairing
         * stops as soon as the shorter of the two lists is exhausted.
         */
        fun createPhraseMatch(vertices: List<VariantGraph.Vertex>, tokens: List<Token>): List<Match> {
            return vertices.zip(tokens) { vertex, token -> Match(vertex, token) }
        }

        /**
         * Returns a predicate accepting every match whose vertex is neither
         * the start nor the end vertex of [graph].
         */
        fun createNoBoundaryMatchPredicate(graph: VariantGraph): Predicate<Match> {
            return Predicate { input: Match -> input.vertex != graph.start && input.vertex != graph.end }
        }

        /** Maps a phrase match (a list of matches) to the list of its tokens. */
        @JvmField
        val PHRASE_MATCH_TO_TOKENS = //
            Function { input: List<Match> -> input.stream().map { m: Match -> m.token }.collect(Collectors.toList()) }
    }
}
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/Tuple.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex.dekker;
21 |
22 | import java.util.Objects;
23 |
/**
 * An unordered pair of two values of the same type: {@code (a, b)} equals
 * {@code (b, a)}.
 *
 * <p>{@link #equals(Object)} and {@link #hashCode()} are both order-independent
 * and null-safe, so instances can be used reliably in hash-based collections.
 *
 * @param <T> type of both members
 * @author Gregor Middell
 */
public class Tuple<T> {

    public final T left;
    public final T right;

    /**
     * @param left  first member (may be {@code null})
     * @param right second member (may be {@code null})
     */
    public Tuple(T left, T right) {
        this.left = left;
        this.right = right;
    }

    /**
     * Two tuples are equal when they hold the same two members, in either order.
     *
     * <p>The previous implementation checked each member independently against
     * both members of the other tuple, which made {@code (a, a).equals((a, b))}
     * true while {@code (a, b).equals((a, a))} was false.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Tuple)) {
            return false;
        }
        final Tuple<?> other = (Tuple<?>) obj;
        final boolean sameOrder = Objects.equals(left, other.left) && Objects.equals(right, other.right);
        final boolean swapped = Objects.equals(left, other.right) && Objects.equals(right, other.left);
        return sameOrder || swapped;
    }

    /**
     * Order-independent hash, so that tuples which are equal under the
     * unordered {@link #equals(Object)} also share a hash code.
     */
    @Override
    public int hashCode() {
        return Objects.hashCode(left) + Objects.hashCode(right);
    }
}
51 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/astar/AstarAlgorithm.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.dekker.astar;
2 |
3 | import java.util.*;
4 |
5 |
6 | /*
7 | * Implementation of the a* algorithm to find the optimal
8 | * solution in a decision tree.
9 | *
10 | * @author: Ronald Haentjens Dekker
11 | */
12 | public abstract class AstarAlgorithm> {
13 | // The map of navigated nodes.
14 | protected Map cameFrom;
15 |
16 | protected List aStar(N startNode, C startCost) {
17 | // The set of nodes already evaluated.
18 | Set closed = new HashSet<>();
19 | cameFrom = new HashMap<>();
20 |
21 | // Cost from start along best known path.
22 | Map gScore = new HashMap<>();
23 | gScore.put(startNode, startCost);
24 |
25 | // Estimated total cost from start to goal through y.
26 | final Map fScore = new HashMap<>();
27 | fScore.put(startNode, gScore.get(startNode).plus(heuristicCostEstimate(startNode)));
28 |
29 | // The set of tentative nodes to be evaluated, initially containing the start node
30 | Comparator comp = Comparator.comparing(fScore::get);
31 | PriorityQueue open = new PriorityQueue<>(10, comp);
32 | open.add(startNode);
33 |
34 | while(!open.isEmpty()) {
35 | N current = open.poll();
36 | if (isGoal(current)) {
37 | return reconstructPath(cameFrom, current);
38 | }
39 | closed.add(current);
40 | for (N neighbor : neighborNodes(current)) {
41 | if (closed.contains(neighbor)) {
42 | continue;
43 | }
44 | C tentativeGScore = gScore.get(current).plus(distBetween(current, neighbor));
45 | if (!open.contains(neighbor)||tentativeGScore.compareTo(gScore.get(neighbor))<0) {
46 | cameFrom.put(neighbor, current);
47 | gScore.put(neighbor, tentativeGScore);
48 | fScore.put(neighbor, gScore.get(neighbor).plus(heuristicCostEstimate(neighbor)));
49 | if (!open.contains(neighbor)) {
50 | open.add(neighbor);
51 | }
52 | }
53 | }
54 | }
55 | throw new IllegalStateException("No node found that suits goal condition!");
56 | }
57 |
58 | protected List reconstructPath(Map cameFrom, N current) {
59 | ArrayList path = new ArrayList<>();
60 | do {
61 | path.add(0, current);
62 | current = cameFrom.get(current);
63 | } while (current != null);
64 | return path;
65 | }
66 |
67 | protected abstract boolean isGoal(N node);
68 |
69 | protected abstract Iterable neighborNodes(N current);
70 |
71 | protected abstract C heuristicCostEstimate(N node);
72 |
73 | protected abstract C distBetween(N current, N neighbor);
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/astar/Cost.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.dekker.astar;
2 |
/**
 * Generic cost value object for use with the a* algorithm.
 *
 * <p>A cost is comparable to other costs of its own type and can be added
 * to them via {@link #plus(Cost)}.
 *
 * @param <T> the concrete cost type (self type)
 * @author Ronald Haentjens Dekker
 */
public abstract class Cost<T extends Cost<T>> implements Comparable<T> {

    /**
     * @param other the cost to add to this one
     * @return the combined cost
     */
    protected abstract T plus(T other);

}
13 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/island/Coordinate.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex.dekker.island;
21 |
22 | import eu.interedition.collatex.dekker.Match;
23 |
24 | import java.util.Objects;
25 |
26 | public class Coordinate implements Comparable {
27 | public int row;
28 | public int column;
29 | public Match match;
30 |
31 | public Coordinate(int row, int column) {
32 | this.column = column;
33 | this.row = row;
34 | this.match = null; //TODO: this constructor should be removed
35 | }
36 |
37 | // row -> position in witness (0+)
38 | // column -> rank in variant graph (0+)
39 | // match -> combination of witness token and vertex
40 | public Coordinate(int row, int column, Match match) {
41 | this.column = column;
42 | this.row = row;
43 | this.match = match;
44 | }
45 |
46 | Coordinate(Coordinate other) {
47 | this(other.row, other.column);
48 | }
49 |
50 | public int getRow() {
51 | return row;
52 | }
53 |
54 | public int getColumn() {
55 | return column;
56 | }
57 |
58 | public boolean sameColumn(Coordinate c) {
59 | return c.column == column;
60 | }
61 |
62 | public boolean sameRow(Coordinate c) {
63 | return c.row == row;
64 | }
65 |
66 | public boolean bordersOn(Coordinate c) {
67 | return (Math.abs(this.row - c.getRow()) == 1) && (Math.abs(this.column - c.getColumn()) == 1);
68 | }
69 |
70 | @Override
71 | public boolean equals(Object o) {
72 | if (o != null & o instanceof Coordinate) {
73 | final Coordinate c = (Coordinate) o;
74 | return (this.row == c.getRow() && this.column == c.getColumn());
75 | }
76 | return super.equals(o);
77 | }
78 |
79 | @Override
80 | public int hashCode() {
81 | return Objects.hash(row, column);
82 | }
83 |
84 | @Override
85 | public int compareTo(Coordinate o) {
86 | final int result = column - o.column;
87 | return (result == 0 ? row - o.row : result);
88 | }
89 |
90 | @Override
91 | public String toString() {
92 | return "(" + row + "," + column + ")";
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/island/IslandCompetition.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex.dekker.island;
21 |
/**
 * Classification of an island with respect to competition with other islands.
 *
 * <p>NOTE(review): the first constant is spelled with a capital "I" in place of
 * the "L" of "Line" ({@code ...IdealIine}); this looks like a typo but the
 * name is part of the public API and is therefore kept as is.
 */
public enum IslandCompetition {
    CompetingIslandAndOnIdealIine,
    CompetingIsland,
    NonCompetingIsland
}
26 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/island/IslandDepthAndSizeComparator.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.dekker.island;
2 |
3 | import java.util.Comparator;
4 |
5 | /**
6 | * Created by ronalddekker on 09/10/15.
7 | */
8 | public class IslandDepthAndSizeComparator implements Comparator {
9 | @Override
10 | public int compare(Island o1, Island o2) {
11 | int depthComparison = o2.getDepth() - o1.getDepth();
12 | if (depthComparison != 0) {
13 | return depthComparison;
14 | }
15 | return o2.size() - o1.size();
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/island/IslandSelection.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.dekker.island;
2 |
3 | import java.util.List;
4 |
5 | /**
6 | * Created by ronalddekker on 09/10/15.
7 | */
8 | public interface IslandSelection {
9 | /*
10 | * Return whether a coordinate overlaps with an already committed coordinate
11 | */
12 | boolean doesCoordinateOverlapWithCommittedCoordinate(Coordinate coordinate);
13 |
14 | /*
15 | * Return whether an island overlaps with an already committed island
16 | */
17 | boolean isIslandPossibleCandidate(Island island);
18 |
19 | /*
20 | * Commit an island
21 | * Island will be part of the final alignment
22 | */
23 | void addIsland(Island isl);
24 |
25 | boolean doesCandidateLayOnVectorOfCommittedIsland(Island island);
26 |
27 | int size();
28 |
29 | List getIslands();
30 |
31 | boolean containsCoordinate(int row, int column);
32 |
33 | List getPossibleIslands();
34 | }
35 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/island/IslandSizeComparator.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.dekker.island;
2 |
3 | import java.util.Comparator;
4 |
5 | /**
6 | * Created by ronalddekker on 09/10/15.
7 | */
8 | public class IslandSizeComparator implements Comparator {
9 | @Override
10 | public int compare(Island o1, Island o2) {
11 | return o2.size() - o1.size();
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/island/package-info.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | /**
21 | * An island is a data structure that contains a series of matches between tokens and variant graph vertices.
22 | * Islands can conflict (overlap) with each other. This package contains a conflict resolver class to deal with this.
23 | *
24 | * @see eu.interedition.collatex.dekker.legacy.MatchTableImpl
25 | * @see eu.interedition.collatex.dekker.legacy.MatchTableLinker
26 | *
27 | */
28 | package eu.interedition.collatex.dekker.island;
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/legacy/IslandPositionComparator.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.dekker.legacy;
2 |
3 | import eu.interedition.collatex.dekker.island.Island;
4 |
5 | import java.util.Comparator;
6 |
7 | /**
8 | * Created by ronalddekker on 09/10/15.
9 | */
10 | public class IslandPositionComparator implements Comparator {
11 | @Override
12 | public int compare(Island o1, Island o2) {
13 | return o1.getLeftEnd().compareTo(o2.getLeftEnd());
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/legacy/MatchTable.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.dekker.legacy;
2 |
3 | import eu.interedition.collatex.Token;
4 | import eu.interedition.collatex.VariantGraph;
5 | import eu.interedition.collatex.dekker.island.Island;
6 |
7 | import java.util.List;
8 | import java.util.Set;
9 |
/**
 * A table whose cells are addressed by a row index and a column index and that
 * exposes, per cell, a variant graph vertex and a token, plus the islands found
 * in the table.
 *
 * NOTE(review): the collection-returning methods are declared with raw types here;
 * their element types look like they were lost and should be confirmed against
 * the implementation.
 *
 * Created by ronald on 4/26/15.
 */
public interface MatchTable {
    /**
     * @param rowIndex    the row index of the cell
     * @param columnIndex the column index of the cell
     * @return the variant graph vertex at the given cell
     */
    VariantGraph.Vertex vertexAt(int rowIndex, int columnIndex);

    /**
     * @param rowIndex    the row index of the cell
     * @param columnIndex the column index of the cell
     * @return the token at the given cell
     */
    Token tokenAt(int rowIndex, int columnIndex);

    // Warning: this method reiterates the witness!
    // This method is only meant for the user interface and serialization classes!
    // Use the tokenAt method in all other cases.
    List rowList();

    // NOTE(review): presumably the counterpart of rowList() for the columns; the
    // element type is not visible here. TODO confirm.
    List columnList();

    // Since the coordinates in allMatches are ordered from upper left to lower right,
    // we don't need to check the lower right neighbor.
    // NOTE(review): the remark above describes an implementation detail ("allMatches"
    // is not declared in this interface); it presumably belongs to the implementing class.
    Set getIslands();
}
29 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/legacy/MatchTableLinker.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex.dekker.legacy;
21 |
22 | import eu.interedition.collatex.Token;
23 | import eu.interedition.collatex.VariantGraph;
24 | import eu.interedition.collatex.dekker.island.*;
25 |
26 | import java.util.Comparator;
27 | import java.util.HashMap;
28 | import java.util.Map;
29 | import java.util.logging.Level;
30 | import java.util.logging.Logger;
31 |
32 | public class MatchTableLinker {
33 | static Logger LOG = Logger.getLogger(MatchTableLinker.class.getName());
34 |
35 | public MatchTableLinker() {
36 | super();
37 | }
38 |
39 | public Map link(VariantGraph base, Iterable witness, Comparator comparator) {
40 | // create MatchTable and fill it with matches
41 | LOG.fine("create MatchTable and fill it with matches");
42 | MatchTable table = MatchTableImpl.create(base, witness, comparator);
43 |
44 | // create IslandConflictResolver
45 | LOG.fine("create island conflict resolver");
46 | IslandConflictResolver resolver = new IslandConflictResolver(new MatchTableSelection(table));
47 |
48 | // The IslandConflictResolver createNonConflictingVersion() method
49 | // selects the optimal islands
50 | LOG.fine("select the optimal islands");
51 | IslandSelection preferredIslands = resolver.createNonConflictingVersion();
52 | if (LOG.isLoggable(Level.FINE)) {
53 | LOG.log(Level.FINE, "Number of preferred Islands: {0}", preferredIslands.size());
54 | }
55 |
56 | // Here the result is put in a map
57 | Map map = new HashMap<>();
58 | for (Island island : preferredIslands.getIslands()) {
59 | for (Coordinate c : island) {
60 | map.put(table.tokenAt(c.row, c.column), table.vertexAt(c.row, c.column));
61 | }
62 | }
63 | return map;
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/dekker/package-info.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | /**
21 | * Dekker's implementation of a collation algorithm.
22 | *
23 | * Supports progressive alignment of multiple witnesses including heuristic detection of transpositions.
24 | *
25 | * @see eu.interedition.collatex.dekker.DekkerAlgorithm
26 | *
27 | */
28 | package eu.interedition.collatex.dekker;
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistance.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex.matching;
21 |
/**
 * Computes the Levenshtein edit distance between two strings, where an insertion,
 * a deletion and a substitution each cost 1.
 */
public final class EditDistance {
    /**
     * Upper bound on {@code str1.length() * str2.length()}; larger comparisons are not
     * carried out. The same value doubles as the result returned in that case.
     */
    private static final int MAX_DISTANCE_COMPARISON = 2500;

    /**
     * Returns the edit distance between the two strings.
     *
     * @param str1 the first string; must not be {@code null}
     * @param str2 the second string; must not be {@code null}
     * @return the number of single-character insertions, deletions and substitutions
     *         needed to turn {@code str1} into {@code str2}, or
     *         {@code MAX_DISTANCE_COMPARISON} (2500) when the product of the two
     *         lengths exceeds that limit
     */
    public static int compute(String str1, String str2) {
        final int str1Length = str1.length();
        final int str2Length = str2.length();

        // Multiply as long: an int product can wrap around for very long inputs and
        // would then slip past this guard.
        if ((long) str1Length * str2Length > MAX_DISTANCE_COMPARISON) {
            return MAX_DISTANCE_COMPARISON;
        }
        if (str1Length == 0) {
            return str2Length;
        }
        if (str2Length == 0) {
            return str1Length;
        }

        // Only the previous and the current row of the distance matrix are needed.
        int[] previous = new int[str2Length + 1];
        int[] current = new int[str2Length + 1];
        for (int j = 0; j <= str2Length; j++) {
            previous[j] = j;
        }

        for (int i = 1; i <= str1Length; i++) {
            final char str1Char = str1.charAt(i - 1);
            current[0] = i;
            for (int j = 1; j <= str2Length; j++) {
                final int cost = (str1Char == str2.charAt(j - 1)) ? 0 : 1;
                final int deletion = previous[j] + 1;
                final int insertion = current[j - 1] + 1;
                final int substitution = previous[j - 1] + cost;
                current[j] = Math.min(Math.min(deletion, insertion), substitution);
            }
            final int[] swap = previous;
            previous = current;
            current = swap;
        }

        // After the final swap the last computed row is held in 'previous'.
        return previous[str2Length];
    }

}
89 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistanceTokenComparator.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex.matching;
21 |
22 | import eu.interedition.collatex.Token;
23 | import eu.interedition.collatex.simple.SimpleToken;
24 |
25 | import java.util.Comparator;
26 |
27 | public class EditDistanceTokenComparator implements Comparator {
28 |
29 | private final int threshold;
30 |
31 | public EditDistanceTokenComparator() {
32 | this(1);
33 | }
34 |
35 | public EditDistanceTokenComparator(int threshold) {
36 | this.threshold = threshold;
37 | }
38 |
39 | @Override
40 | public int compare(Token base, Token witness) {
41 | final String baseContent = ((SimpleToken) base).getNormalized();
42 | final String witnessContent = ((SimpleToken) witness).getNormalized();
43 | return (EditDistance.compute(baseContent, witnessContent) <= threshold) ?
44 | 0 : baseContent.compareTo(witnessContent);
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/matching/EqualityTokenComparator.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex.matching;
21 |
22 | import eu.interedition.collatex.Token;
23 | import eu.interedition.collatex.simple.SimpleToken;
24 |
25 | import java.util.Comparator;
26 |
27 | public class EqualityTokenComparator implements Comparator {
28 |
29 | @Override
30 | public int compare(Token base, Token witness) {
31 | final String baseContent = ((SimpleToken) base).getNormalized();
32 | final String witnessContent = ((SimpleToken) witness).getNormalized();
33 | return baseContent.compareTo(witnessContent);
34 | }
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/matching/StrictEqualityTokenComparator.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex.matching;
21 |
22 | import eu.interedition.collatex.Token;
23 | import eu.interedition.collatex.simple.SimpleToken;
24 |
25 | import java.util.Comparator;
26 |
27 | public class StrictEqualityTokenComparator implements Comparator {
28 |
29 | @Override
30 | public int compare(Token base, Token witness) {
31 | final String baseContent = ((SimpleToken) base).getContent();
32 | final String witnessContent = ((SimpleToken) witness).getContent();
33 | return baseContent.compareTo(witnessContent);
34 | }
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/matching/package-info.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | /**
21 | * {@link java.util.Comparator Comparators} for matching tokens.
22 | *
23 | * Implementations base the equality of tokens on strict or on approximate equality of their respective textual contents.
24 | *
25 | * @see eu.interedition.collatex.matching.StrictEqualityTokenComparator
26 | * @see eu.interedition.collatex.matching.EditDistanceTokenComparator
27 | *
28 | */
29 | package eu.interedition.collatex.matching;
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/medite/package-info.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | /**
21 | * Implementation of a collation algorithm inspired by the work of Jean-Gabriel Ganascia and Julien Bourdaillet
22 | * on MEDITE.
23 | *
24 | * @see eu.interedition.collatex.medite.MediteAlgorithm
25 | *
26 | */
27 | package eu.interedition.collatex.medite;
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunsch/NeedlemanWunschScorer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex.needlemanwunsch;
21 |
/**
 * Supplies the scores used for an alignment: one score for a pair of elements and
 * one score for a gap.
 *
 * @param <A> the type of the elements of the first sequence
 * @param <B> the type of the elements of the second sequence
 * @author Gregor Middell
 */
public interface NeedlemanWunschScorer<A, B> {

    /**
     * @param a an element of the first sequence
     * @param b an element of the second sequence
     * @return the score for aligning {@code a} with {@code b}
     */
    float score(A a, B b);

    /**
     * @return the score for a gap
     */
    float gap();
}
31 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunsch/package-info.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | /**
21 | * A version of the Needleman-Wunsch algorithm.
22 | *
23 | * This algorithm strives for global alignment of witnesses and bases the alignment on a configurable scoring of matches vs. differences/gaps.
24 | * It does not try to detect transpositions.
25 | *
26 | * @see eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm
27 | * @see eu.interedition.collatex.needlemanwunsch.NeedlemanWunschScorer
28 | */
29 | package eu.interedition.collatex.needlemanwunsch;
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/package-info.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | /**
21 | * Base package of CollateX containing interfaces for its core concepts and a
22 | * {@link eu.interedition.collatex.CollationAlgorithmFactory factory class} serving as the starting
23 | * point for collation workflows.
24 | *
25 | */
26 | package eu.interedition.collatex;
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleCollation.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex.simple;
21 |
22 | import eu.interedition.collatex.CollationAlgorithm;
23 | import eu.interedition.collatex.VariantGraph;
24 |
25 | import java.util.List;
26 |
27 | public class SimpleCollation {
28 |
29 | private final List witnesses;
30 | private final CollationAlgorithm algorithm;
31 | private final boolean joined;
32 |
33 | public SimpleCollation(List witnesses, CollationAlgorithm algorithm, boolean joined) {
34 | this.witnesses = witnesses;
35 | this.algorithm = algorithm;
36 | this.joined = joined;
37 | }
38 |
39 | public List getWitnesses() {
40 | return witnesses;
41 | }
42 |
43 | public CollationAlgorithm getAlgorithm() {
44 | return algorithm;
45 | }
46 |
47 | public boolean isJoined() {
48 | return joined;
49 | }
50 |
51 | public VariantGraph collate(VariantGraph graph) {
52 | algorithm.collate(graph, witnesses);
53 | if (joined) {
54 | VariantGraph.JOIN.apply(graph);
55 | }
56 | return graph;
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/simple/SimplePatternTokenizer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex.simple;
21 |
22 | import java.util.LinkedList;
23 | import java.util.List;
24 | import java.util.function.Function;
25 | import java.util.regex.Matcher;
26 | import java.util.regex.Pattern;
27 | import java.util.stream.Stream;
28 |
/**
 * Regular-expression based tokenizers that split a string into a stream of tokens.
 * Every token is the text of one successive match of the tokenizer's pattern, so
 * whitespace and punctuation covered by a match stay part of the token.
 *
 * @author Gregor Middell
 * @author Ronald Haentjens Dekker
 */
public class SimplePatternTokenizer {

    /** The punctuation characters recognized by the tokenizers, quoted for use inside a regex. */
    static final String PUNCT = Pattern.quote(".?!,;:");

    /**
     * Creates a tokenizer that returns every successive match of the pattern as a token.
     *
     * @param pattern the pattern describing a single token
     * @return a function mapping an input string to the stream of its tokens, in input order
     */
    static Function<String, Stream<String>> tokenizer(Pattern pattern) {
        return input -> {
            final Matcher matcher = pattern.matcher(input);
            final Stream.Builder<String> tokens = Stream.builder();
            while (matcher.find()) {
                tokens.add(matcher.group());
            }
            return tokens.build();
        };
    }

    /**
     * Splits on whitespace: a token is a run of non-whitespace characters together with
     * the whitespace that follows it (and any whitespace that precedes the first token).
     * The pattern previously ended in a stray literal {@code ]}, so it only matched text
     * that was followed by a closing bracket.
     */
    public static final Function<String, Stream<String>> BY_WHITESPACE = tokenizer(Pattern.compile("\\s*?\\S+\\s*"));

    /**
     * Splits on whitespace and punctuation: a token is a run of other characters together
     * with the adjacent whitespace and punctuation.
     */
    public static final Function<String, Stream<String>> BY_WS_AND_PUNCT = tokenizer(Pattern.compile("[\\s" + PUNCT + "]*?[^\\s" + PUNCT + "]+[\\s" + PUNCT + "]*"));

    /**
     * Splits on whitespace or punctuation: a token is either a run of punctuation or a run
     * of other non-whitespace characters, each together with the whitespace that follows it.
     */
    public static final Function<String, Stream<String>> BY_WS_OR_PUNCT = tokenizer(Pattern.compile("[" + PUNCT + "]+[\\s]*|[^" + PUNCT + "\\s]+[\\s]*"));

}
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleToken.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex.simple;
21 |
22 | import eu.interedition.collatex.Token;
23 | import eu.interedition.collatex.Witness;
24 | import eu.interedition.collatex.util.StreamUtil;
25 | import eu.interedition.collatex.util.VertexMatch;
26 |
27 | import java.util.SortedSet;
28 | import java.util.function.Function;
29 | import java.util.stream.Collectors;
30 |
31 | public class SimpleToken implements Token, Comparable {
32 | private final SimpleWitness witness;
33 | private final String content;
34 | private final String normalized;
35 |
36 | public SimpleToken(SimpleWitness witness, String content, String normalized) {
37 | this.witness = witness;
38 | this.content = content;
39 | this.normalized = normalized;
40 | }
41 |
42 | public String getContent() {
43 | return content;
44 | }
45 |
46 | @Override
47 | public Witness getWitness() {
48 | return witness;
49 | }
50 |
51 | public String getNormalized() {
52 | return normalized;
53 | }
54 |
55 | @Override
56 | public String toString() {
57 | return witness.toString() + ":" + witness.getTokens().indexOf(this) + ":'" + normalized + "'";
58 | }
59 |
60 | public static String toString(Iterable extends Token> tokens) {
61 | return StreamUtil.stream(tokens)
62 | .filter(t -> SimpleToken.class.isAssignableFrom(t.getClass()))
63 | .map(t -> (SimpleToken) t)
64 | .map(SimpleToken::getContent)
65 | .collect(Collectors.joining())
66 | .trim();
67 | }
68 |
69 | @Override
70 | public int compareTo(SimpleToken o) {
71 | return witness.compare(this, o);
72 | }
73 |
74 | public static final Function, Integer> TOKEN_MATCH_EVALUATOR = input -> {
75 | int value = 0;
76 | for (VertexMatch.WithToken match : input) {
77 | value += ((SimpleToken) match.token).getContent().length();
78 | }
79 | return value;
80 | };
81 | }
82 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleTokenNormalizers.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package eu.interedition.collatex.simple;
21 |
22 | import java.util.function.Function;
23 |
/**
 * Ready-made string normalization functions for token content.
 *
 * @author Gregor Middell
 * @author Ronald Haentjens Dekker
 */
public class SimpleTokenNormalizers {

    /**
     * Converts the input to lower case using {@link String#toLowerCase()}.
     */
    public static final Function<String, String> LOWER_CASE = String::toLowerCase;

    /**
     * Removes leading and trailing whitespace using {@link String#trim()}.
     */
    public static final Function<String, String> TRIM_WS = String::trim;

    /**
     * Removes leading and trailing characters for which
     * {@link #isWhitespaceOrPunctuation(char)} holds. An input consisting only of such characters
     * (or an empty input) yields the empty string.
     */
    public static final Function<String, String> TRIM_WS_PUNCT = input -> {
        int start = 0;
        int end = input.length() - 1;
        while (start <= end && isWhitespaceOrPunctuation(input.charAt(start))) {
            start++;
        }
        while (end >= start && isWhitespaceOrPunctuation(input.charAt(end))) {
            end--;
        }
        return input.substring(start, end + 1);
    };

    /**
     * Tells whether a character is whitespace or belongs to one of the Unicode categories
     * {@link Character#START_PUNCTUATION}, {@link Character#END_PUNCTUATION} or
     * {@link Character#OTHER_PUNCTUATION}. Other punctuation categories (e.g. dashes) are not
     * covered.
     *
     * @param c the character to classify
     * @return {@code true} if the character is whitespace or punctuation as described above
     */
    public static boolean isWhitespaceOrPunctuation(char c) {
        if (Character.isWhitespace(c)) {
            return true;
        }
        switch (Character.getType(c)) {
            case Character.START_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
                return true;
            default:
                return false;
        }
    }

    /**
     * Applies {@link #LOWER_CASE} and then {@link #TRIM_WS_PUNCT}.
     */
    public static final Function<String, String> LC_TRIM_WS_PUNCT = LOWER_CASE.andThen(TRIM_WS_PUNCT);

    /**
     * Applies {@link #LOWER_CASE} and then {@link #TRIM_WS}.
     */
    public static final Function<String, String> LC_TRIM_WS = LOWER_CASE.andThen(TRIM_WS);
}
64 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/simple/package-info.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 The Interedition Development Group.
3 | *
4 | * This file is part of CollateX.
5 | *
6 | * CollateX is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * CollateX is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | /**
21 | * Default implementations for core interfaces like witnesses and tokens.
22 | *
23 | * <p>Classes in this package make fundamental assumptions about the nature of the text versions to be collated, e.g. that they
24 | * can be tokenized by whitespace, that tokens might be case insensitive, that punctuation might not matter or that
25 | * XML input adheres to a particular schema.
26 | *
27 | * <p>Users are advised to implement {@link eu.interedition.collatex.Token} and {@link eu.interedition.collatex.Witness}
28 | * themselves and adjust their implementations to the use case at hand where those assumptions do not hold.
29 | */
30 | package eu.interedition.collatex.simple;
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/CharSequenceAdapter.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.suffixarray;
2 |
3 | /**
4 | * An adapter for constructing suffix arrays on character sequences.
5 | *
6 | * @author Michał Nowak (Carrot Search)
7 | * @author Dawid Weiss (Carrot Search)
8 | * @see SuffixArrays#create(CharSequence)
9 | * @see SuffixArrays#create(CharSequence, ISuffixArrayBuilder)
10 | */
11 | final class CharSequenceAdapter {
12 | private final ISuffixArrayBuilder delegate;
13 |
14 | /**
15 | * Last mapped input in {@link #buildSuffixArray(CharSequence)}.
16 | */
17 | int[] input;
18 |
19 | /**
20 | * Construct an adapter with a given underlying suffix array construction strategy.
21 | * The suffix array builder should accept non-negative characters, with a possibly
22 | * large alphabet size.
23 | *
24 | * @see DensePositiveDecorator
25 | */
26 | public CharSequenceAdapter(ISuffixArrayBuilder builder) {
27 | this.delegate = builder;
28 | }
29 |
30 | /**
31 | * Construct a suffix array for a given character sequence.
32 | */
33 | public int[] buildSuffixArray(CharSequence sequence) {
34 | /*
35 | * Allocate slightly more space, some suffix construction strategies need it and
36 | * we don't want to waste space for multiple symbol mappings.
37 | */
38 |
39 | this.input = new int[sequence.length() + SuffixArrays.MAX_EXTRA_TRAILING_SPACE];
40 | for (int i = sequence.length() - 1; i >= 0; i--) {
41 | input[i] = sequence.charAt(i);
42 | }
43 |
44 | final int start = 0;
45 | final int length = sequence.length();
46 |
47 | final ISymbolMapper mapper = new DensePositiveMapper(input, start, length);
48 | mapper.map(input, start, length);
49 |
50 | return delegate.buildSuffixArray(input, start, length);
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DensePositiveDecorator.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.suffixarray;
2 |
3 | /**
4 | * A decorator around {@link ISuffixArrayBuilder} that accepts any input symbols and maps
5 | * it to non-negative, compact (dense) alphabet. Relative symbols order is preserved (changes are
6 | * limited to a constant shift and compaction of symbols). The input is remapped in-place,
7 | * but additional space is required for the mapping.
8 | *
9 | * @author Michał Nowak (Carrot Search)
10 | * @author Dawid Weiss (Carrot Search)
11 | */
12 | public final class DensePositiveDecorator implements ISuffixArrayBuilder {
13 | private final ISuffixArrayBuilder delegate;
14 |
15 | /*
16 | *
17 | */
18 | public DensePositiveDecorator(ISuffixArrayBuilder delegate) {
19 | this.delegate = delegate;
20 | }
21 |
22 | /*
23 | *
24 | */
25 | @Override
26 | public int[] buildSuffixArray(int[] input, final int start, final int length) {
27 | final MinMax minmax = Tools.minmax(input, start, length);
28 |
29 | final ISymbolMapper mapper;
30 | if (minmax.range() > 0x10000) {
31 | throw new RuntimeException("Large symbol space not implemented yet.");
32 | }
33 | mapper = new DensePositiveMapper(input, start, length);
34 |
35 | mapper.map(input, start, length);
36 | try {
37 | return delegate.buildSuffixArray(input, start, length);
38 | } finally {
39 | mapper.undo(input, start, length);
40 | }
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DensePositiveMapper.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.suffixarray;
2 |
3 | /**
4 | * In the "dense" scenario we keep "forward" mapping between original keys (shifted to
5 | * positive indexes) and their new key values. A "reverse" mapping is used to restore
6 | * original values in place of the mapped keys upon exit.
7 | *
8 | * @author Michał Nowak (Carrot Search)
9 | * @author Dawid Weiss (Carrot Search)
10 | */
11 | final class DensePositiveMapper implements ISymbolMapper {
12 | private final int offset;
13 | private final int[] forward;
14 | private final int[] backward;
15 |
16 | /*
17 | *
18 | */
19 | public DensePositiveMapper(int[] input, int start, int length) {
20 | final MinMax minmax = Tools.minmax(input, start, length);
21 | final int min = minmax.min;
22 | final int max = minmax.max;
23 |
24 | final int[] forward = new int[max - min + 1];
25 | final int offset = -min;
26 |
27 | // Mark all symbols present in the alphabet.
28 | final int end = start + length;
29 | for (int i = start; i < end; i++) {
30 | forward[input[i] + offset] = 1;
31 | }
32 |
33 | // Collect present symbols, assign unique codes.
34 | int k = 1;
35 | for (int i = 0; i < forward.length; i++) {
36 | if (forward[i] != 0) {
37 | forward[i] = k++;
38 | }
39 | }
40 |
41 | final int[] backward = new int[k];
42 | for (int i = start; i < end; i++) {
43 | final int v = forward[input[i] + offset];
44 | backward[v] = input[i];
45 | }
46 |
47 | this.offset = offset;
48 | this.forward = forward;
49 | this.backward = backward;
50 | }
51 |
52 | /*
53 | *
54 | */
55 | @Override
56 | public void map(int[] input, final int start, final int length) {
57 | for (int i = start, l = length; l > 0; l--, i++) {
58 | input[i] = forward[input[i] + offset];
59 | }
60 | }
61 |
62 | /*
63 | *
64 | */
65 | @Override
66 | public void undo(int[] input, final int start, final int length) {
67 | for (int i = start, l = length; l > 0; l--, i++) {
68 | input[i] = backward[input[i]];
69 | }
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ExtraTrailingCellsDecorator.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.suffixarray;
2 |
3 | /**
4 | * A decorator around {@link ISuffixArrayBuilder} that:
5 | *
6 | *
provides extra space after the input for end-of-string markers
7 | *
shifts the input to zero-based positions.
8 | *
9 | *
10 | * @author Michał Nowak (Carrot Search)
11 | * @author Dawid Weiss (Carrot Search)
12 | */
13 | public final class ExtraTrailingCellsDecorator implements ISuffixArrayBuilder {
14 | private final ISuffixArrayBuilder delegate;
15 | private final int extraCells;
16 |
17 | /**
18 | * @see SuffixArrays#MAX_EXTRA_TRAILING_SPACE
19 | */
20 | public ExtraTrailingCellsDecorator(ISuffixArrayBuilder delegate, int extraCells) {
21 | this.delegate = delegate;
22 | this.extraCells = extraCells;
23 | }
24 |
25 | /*
26 | *
27 | */
28 | @Override
29 | public int[] buildSuffixArray(int[] input, final int start, final int length) {
30 | if (start == 0 && start + length + extraCells < input.length) {
31 | return delegate.buildSuffixArray(input, start, length);
32 | }
33 |
34 | final int[] shifted = new int[input.length + extraCells];
35 | System.arraycopy(input, start, shifted, 0, length);
36 |
37 | final int[] SA = delegate.buildSuffixArray(shifted, 0, length);
38 |
39 | return SA;
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/GenericArrayAdapter.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.suffixarray;
2 |
3 | import java.util.Comparator;
4 | import java.util.TreeMap;
5 |
6 | /**
7 | * An adapter for constructing suffix arrays on generic arrays.
8 | *
9 | * @author Anton Olsson for friprogramvarusyndikatet.se
10 | */
11 | class GenericArrayAdapter {
12 |
13 | private final ISuffixArrayBuilder delegate;
14 | int[] input;
15 | TreeMap tokIDs;
16 | private final Comparator super T> comparator;
17 |
18 | public GenericArrayAdapter(ISuffixArrayBuilder builder) {
19 | // TODO make sure T is comparable
20 | this.delegate = builder;
21 | this.comparator = null;
22 | }
23 |
24 | public GenericArrayAdapter(ISuffixArrayBuilder builder, Comparator super T> comparator) {
25 | // TODO make sure that comparator != null or T is comparable
26 | this.delegate = builder;
27 | this.comparator = comparator;
28 | }
29 |
30 | /**
31 | * Construct a suffix array for a given generic token array.
32 | */
33 | public int[] buildSuffixArray(T[] tokens) {
34 | final int length = tokens.length;
35 | /*
36 | * Allocate slightly more space, some suffix construction strategies need it and
37 | * we don't want to waste space for multiple symbol mappings.
38 | */
39 | input = new int[length + SuffixArrays.MAX_EXTRA_TRAILING_SPACE];
40 |
41 | //System.out.println("Assigning token ids ...");
42 |
43 | /*
44 | * We associate every token to an id, all `equal´ tokens to the same id.
45 | * The suffix array is built using only the the ids.
46 | */
47 | tokIDs = new TreeMap<>(comparator);
48 |
49 | for (int i = 0; i < length; i++) {
50 | tokIDs.putIfAbsent(tokens[i], i);
51 | input[i] = tokIDs.get(tokens[i]);
52 | }
53 |
54 | //System.out.println("Token ids assigned.");
55 |
56 | return delegate.buildSuffixArray(input, 0, length);
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ISuffixArrayBuilder.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.suffixarray;
2 |
/**
 * An algorithm that can produce a suffix array for a sequence of integer symbols.
 *
 * @author Michał Nowak (Carrot Search)
 * @author Dawid Weiss (Carrot Search)
 * @see #buildSuffixArray(int[], int, int)
 */
public interface ISuffixArrayBuilder {
    /**
     * Computes the suffix array for a sequence of symbols (integers). The processed sequence is
     * the subsequence of <code>input</code> determined by the <code>start</code> and
     * <code>length</code> parameters.
     * <p>
     * Concrete implementations may have additional requirements and constraints
     * concerning the input. For example, it is quite common that extra cells are required
     * after <code>start + length</code> to store special marker symbols. Also, some
     * algorithms may require non-negative symbols in the input. For such constrained
     * algorithms, use the various decorators and adapters available in this package.
     *
     * @param input  A sequence of input symbols, int-coded.
     * @param start  The starting index (inclusive) in <code>input</code>.
     * @param length Number of symbols to process.
     * @return An array of indices such that the suffix of <code>input</code> at index
     * <code>result[i]</code> is lexicographically larger than or equal to any
     * suffix that precedes it. Note that the output array may be larger than
     * <code>input.length</code>, in which case only the first
     * <code>input.length</code> elements are of relevance.
     * <p>
     * The returned array contains suffix indexes starting from 0 (so
     * <code>start</code> needs to be added manually to access a given suffix in
     * <code>input</code>).
     */
    int[] buildSuffixArray(int[] input, int start, int length);
}
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ISymbolMapper.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.suffixarray;
2 |
/**
 * Symbol mappers (reversible int-coding).
 *
 * @author Michał Nowak (Carrot Search)
 * @author Dawid Weiss (Carrot Search)
 */
interface ISymbolMapper {
    /**
     * Encodes the <code>length</code> symbols of <code>input</code> beginning at
     * <code>start</code>. NOTE(review): the in-package implementation
     * ({@link DensePositiveMapper}) rewrites the array in place; presumably that is the
     * contract for all implementations.
     */
    void map(int[] input, int start, int length);

    /**
     * Reverses {@link #map(int[], int, int)} for the <code>length</code> cells of
     * <code>input</code> beginning at <code>start</code>.
     */
    void undo(int[] input, int start, int length);
}
14 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/MinMax.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.suffixarray;
2 |
/**
 * Immutable pair of a minimum and a maximum value.
 *
 * @author Michał Nowak (Carrot Search)
 * @author Dawid Weiss (Carrot Search)
 * @see Tools#minmax(int[], int, int)
 */
final class MinMax {
    /** The smaller bound. */
    public final int min;
    /** The larger bound. */
    public final int max;

    MinMax(int min, int max) {
        this.min = min;
        this.max = max;
    }

    /**
     * @return the distance between the two bounds, i.e. {@code max - min}
     */
    public int range() {
        final int span = this.max - this.min;
        return span;
    }
}
23 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/SuffixData.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.suffixarray;
2 |
/**
 * Pairs the suffix array of a sequence with its longest common prefix (LCP) array.
 *
 * @author Michał Nowak (Carrot Search)
 * @author Dawid Weiss (Carrot Search)
 */
public final class SuffixData {
    private final int[] suffixArray;
    private final int[] lcp;

    /**
     * Stores the given arrays as they are; no copies are made.
     */
    SuffixData(int[] sa, int[] lcp) {
        this.lcp = lcp;
        this.suffixArray = sa;
    }

    /**
     * @return the suffix array passed to the constructor (the same array instance)
     */
    public int[] getSuffixArray() {
        return this.suffixArray;
    }

    /**
     * @return the LCP array passed to the constructor (the same array instance)
     */
    public int[] getLCP() {
        return this.lcp;
    }
}
27 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Tools.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.suffixarray;
2 |
3 | /**
4 | * Utility methods used throughout entire project.
5 | *
6 | * @author Michał Nowak (Carrot Search)
7 | * @author Dawid Weiss (Carrot Search)
8 | */
9 | final class Tools {
10 | private Tools() {
11 | // No instances.
12 | }
13 |
14 | /**
15 | * Check if all symbols in the given range are greater than 0, return
16 | * true if so, false otherwise.
17 | */
18 | static boolean allPositive(int[] input, int start, int length) {
19 | for (int i = length - 1, index = start; i >= 0; i--, index++) {
20 | if (input[index] <= 0) {
21 | return false;
22 | }
23 | }
24 |
25 | return true;
26 | }
27 |
28 | /**
29 | * Determine the maximum value in a slice of an array.
30 | */
31 | static int max(int[] input, int start, int length) {
32 | assert length >= 1;
33 |
34 | int max = input[start];
35 | for (int i = length - 2, index = start + 1; i >= 0; i--, index++) {
36 | final int v = input[index];
37 | if (v > max) {
38 | max = v;
39 | }
40 | }
41 |
42 | return max;
43 | }
44 |
45 | /**
46 | * Determine the minimum value in a slice of an array.
47 | */
48 | static int min(int[] input, int start, int length) {
49 | assert length >= 1;
50 |
51 | int min = input[start];
52 | for (int i = length - 2, index = start + 1; i >= 0; i--, index++) {
53 | final int v = input[index];
54 | if (v < min) {
55 | min = v;
56 | }
57 | }
58 |
59 | return min;
60 | }
61 |
62 | /**
63 | * Calculate minimum and maximum value for a slice of an array.
64 | */
65 | static MinMax minmax(int[] input, final int start, final int length) {
66 | int max = input[start];
67 | int min = max;
68 | for (int i = length - 2, index = start + 1; i >= 0; i--, index++) {
69 | final int v = input[index];
70 | if (v > max) {
71 | max = v;
72 | }
73 | if (v < min) {
74 | min = v;
75 | }
76 | }
77 |
78 | return new MinMax(min, max);
79 | }
80 |
81 | /**
82 | * Throw {@link AssertionError} if a condition is false. This should
83 | * be called when the assertion must be always verified (as in the case of verifying
84 | * the algorithm's preconditions). For other, internal assertions, one should use
85 | * assert keyword so that such assertions can be disabled at run-time (for
86 | * performance reasons).
87 | */
88 | static void assertAlways(boolean condition, String msg) {
89 | if (!condition) {
90 | throw new AssertionError(msg);
91 | }
92 | }
93 | }
94 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Cursor.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.suffixtree;
2 |
3 | import java.util.Collection;
4 | import java.util.Collections;
5 | import java.util.HashSet;
6 |
7 | /**
8 | * @param
9 | * @param
10 | * @author Max Garfinkel
11 | */
12 | public class Cursor> {
13 |
14 | private final SuffixTree tree;
15 | private Node node;
16 | private Edge edge;
17 | private int length;
18 |
19 | Cursor(SuffixTree tree) {
20 | this.tree = tree;
21 | node = tree.getRoot();
22 | edge = null;
23 | length = 0;
24 | }
25 |
26 | boolean proceedTo(T item) {
27 | if (edge == null) {
28 | Edge tmpEdge = node.getEdgeStarting(item);
29 | if (tmpEdge != null) {
30 | edge = tmpEdge;
31 | length = 1;
32 | return true;
33 | }
34 | return false;
35 | } else if (edge.getLength() > length) {
36 | T nextItem = edge.getItemAt(length);
37 | if (nextItem != null && item.equals(nextItem)) {
38 | length++;
39 | return true;
40 | }
41 | return false;
42 | } else {
43 | Node terminal = edge.getTerminal();
44 | if (terminal == null)
45 | return false;
46 | Edge tmpEdge = terminal.getEdgeStarting(item);
47 | if (tmpEdge != null) {
48 | edge = tmpEdge;
49 | length = 1;
50 | node = terminal;
51 | return true;
52 | }
53 | return false;
54 | }
55 | }
56 |
57 | Collection> getSequenceTerminals() {
58 | if (edge == null) {
59 | return node.getSuffixTerminals();
60 | }
61 |
62 | if ((edge.getLength() - 1 == length && !edge.isTerminating())//
63 | || (edge.getItemAt(length).getClass().equals(SequenceTerminal.class)) //
64 | && !edge.isTerminating()//
65 | ) {
66 | Object seqTerminal = edge.getItemAt(length);
67 | @SuppressWarnings("unchecked")
68 | SequenceTerminal term = (SequenceTerminal) seqTerminal;
69 | Collection> collection = new HashSet<>();
70 | collection.add(term);
71 | return collection;
72 | }
73 | Node terminal = edge.getTerminal();
74 | if (terminal == null)
75 | return Collections.emptySet();
76 |
77 | Collection> edges = terminal.getEdges();
78 | Collection> returnCollection = new HashSet<>();
79 | for (Edge edge : edges) {
80 | Object o = edge.getStartItem();
81 | if (o.getClass().equals(SequenceTerminal.class)) {
82 | @SuppressWarnings("unchecked")
83 | SequenceTerminal returnTerminal = (SequenceTerminal) o;
84 | returnCollection.add(returnTerminal);
85 | }
86 | }
87 | return returnCollection;
88 | }
89 |
90 | void returnToRoot() {
91 | node = tree.getRoot();
92 | edge = null;
93 | length = 0;
94 | }
95 |
96 | }
97 |
--------------------------------------------------------------------------------
/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Sequence.java:
--------------------------------------------------------------------------------
1 | package eu.interedition.collatex.suffixtree;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Iterator;
5 | import java.util.List;
6 |
7 | /**
8 | * Represents a sequence of items. This plays the part of the string in a non
9 | * generic suffix tree implementation. This object automatically appends a
10 | * terminating item to the end of the instance which is included in all
11 | * operations.
12 | *
13 | * @author Max Garfinkel
14 | */
15 | public class Sequence> implements Iterable