├── images
    ├── output.png
    └── word-graph.png
├── src
    └── main
    │   ├── resources
    │       ├── en-sent.bin
    │       ├── en-token.bin
    │       ├── en-pos-maxent.bin
    │       └── logback.xml
    │   └── java
    │       └── org
    │           └── stefano
    │               └── distributional
    │                   ├── model
    │                       ├── components
    │                       │   ├── GraphWeigher.java
    │                       │   ├── impl
    │                       │   │   ├── NaiveGraphWeigher.java
    │                       │   │   ├── DefaultPathCompressor.java
    │                       │   │   ├── AdvancedGraphWeigher.java
    │                       │   │   └── DefaultGraphEncoder.java
    │                       │   ├── GraphModel.java
    │                       │   ├── PathCompressor.java
    │                       │   └── GraphEncoder.java
    │                       └── Summarizer.java
    │                   ├── Main.java
    │                   └── utils
    │                       └── OpenNLP.java
├── settings.gradle
├── .gitignore
├── README.md
└── LICENSE


/images/output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefano-bragaglia/Multi-Sentence-Compression/HEAD/images/output.png


--------------------------------------------------------------------------------
/images/word-graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefano-bragaglia/Multi-Sentence-Compression/HEAD/images/word-graph.png


--------------------------------------------------------------------------------
/src/main/resources/en-sent.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefano-bragaglia/Multi-Sentence-Compression/HEAD/src/main/resources/en-sent.bin


--------------------------------------------------------------------------------
/src/main/resources/en-token.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefano-bragaglia/Multi-Sentence-Compression/HEAD/src/main/resources/en-token.bin


--------------------------------------------------------------------------------
/src/main/resources/en-pos-maxent.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefano-bragaglia/Multi-Sentence-Compression/HEAD/src/main/resources/en-pos-maxent.bin


--------------------------------------------------------------------------------
/settings.gradle:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This settings file was auto generated by the Gradle buildInit task
 3 |  * by 'stefano' at '23/01/17 15:46' with Gradle 3.2.1
 4 |  *
 5 |  * The settings file is used to specify which projects to include in your build.
 6 |  * In a single project build this file can be empty or even removed.
 7 |  *
 8 |  * Detailed information about configuring a multi-project build in Gradle can be found
 9 |  * in the user guide at https://docs.gradle.org/3.2.1/userguide/multi_project_builds.html
10 |  */
11 | 
12 | rootProject.name = 'Multi-Sentence-Compression'
13 | 


--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/model/components/GraphWeigher.java:
--------------------------------------------------------------------------------
 1 | package org.stefano.distributional.model.components;
 2 | 
 3 | import org.neo4j.graphdb.GraphDatabaseService;
 4 | 
 5 | /**
 6 |  * This interface provides a method to weight the {@code FOLLOWS} relationships in a {@code word graph}.
 7 |  */
 8 | public interface GraphWeigher {
 9 | 
10 |     /**
11 |      * Weights the {@code FOLLOWS} relationships in the given {@code graph}.
12 |      *
13 |      * @param graph the {@link GraphDatabaseService} whose {@code FOLLOWS} relationships have to be weighted
14 |      */
15 |     void weight(GraphDatabaseService graph);
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 3 |         <!-- encoders are assigned the type
 4 |              ch.qos.logback.classic.encoder.PatternLayoutEncoder by default -->
 5 |         <target>System.out</target>
 6 |         <withJansi>true</withJansi>
 7 |         <encoder>
 8 |             <pattern>%green(%d{HH:mm:ss.SSS}) %boldYellow([%thread]) %highlight(%-5level) %cyan(%logger{36}) - %white(%msg%n)
 9 |             </pattern>
10 |         </encoder>
11 |     </appender>
12 | 
13 |     <!--<logger name="edu.emory.clir.clearnlp" level="WARN"/>-->
14 |     <!--<logger name="com.subgraph" level="DEBUG"/>-->
15 |     <!--<logger name="org.springframework" level="WARN"/>-->
16 |     <!--<logger name="org.springframework.boot" level="WARN"/>-->
17 |     <!--<logger name="org.eclipse.jetty" level="WARN"/>-->
18 |     <!--<logger name="org.thymeleaf" level="WARN"/>-->
19 |     <!--<logger name="org.hibernate" level="WARN"/>-->
20 |     <!--<logger name="boot" level="WARN"/>-->
21 |     <!--<logger name="boot.controllers" level="INFO"/>-->
22 | 
23 |     <root level="DEBUG">
24 |         <appender-ref ref="STDOUT"/>
25 |     </root>
26 | </configuration>


--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/model/components/impl/NaiveGraphWeigher.java:
--------------------------------------------------------------------------------
 1 | package org.stefano.distributional.model.components.impl;
 2 | 
 3 | import org.neo4j.graphdb.GraphDatabaseService;
 4 | import org.neo4j.graphdb.Relationship;
 5 | import org.neo4j.graphdb.Transaction;
 6 | import org.slf4j.Logger;
 7 | import org.slf4j.LoggerFactory;
 8 | import org.stefano.distributional.model.components.GraphWeigher;
 9 | 
10 | import static java.util.Objects.requireNonNull;
11 | import static org.stefano.distributional.model.components.GraphModel.FOLLOWS;
12 | 
13 | /**
14 |  * Naive {@link GraphWeigher}: every {@code FOLLOWS} relationship in a {@code word graph} gets a
15 |  * {@code weight} equal to the reciprocal of its {@code freq} property, so the weight is
16 |  * inversely proportional to the frequency.
17 |  */
18 | public final class NaiveGraphWeigher implements GraphWeigher {
19 | 
20 |     private static final Logger logger = LoggerFactory.getLogger(NaiveGraphWeigher.class);
21 | 
22 |     /** Nanoseconds in a millisecond: the elapsed time is logged in milliseconds. */
23 |     private static final double NANOS_PER_MILLI = 1_000_000.0;
24 | 
25 |     /**
26 |      * Stores {@code 1 / freq} as the {@code weight} property of every {@code FOLLOWS} relationship.
27 |      * A relationship without a {@code freq} property is treated as having {@code freq == 1}.
28 |      * All the updates happen in a single transaction that is marked as successful at the end.
29 |      *
30 |      * @param graph the {@link GraphDatabaseService} whose {@code FOLLOWS} relationships have to be weighted
31 |      * @throws NullPointerException if {@code graph} is {@code null}
32 |      */
33 |     @Override
34 |     public void weight(GraphDatabaseService graph) {
35 |         requireNonNull(graph, "'graph' is null");
36 | 
37 |         int total = 0;
38 |         try (Transaction tx = graph.beginTx()) {
39 |             long started = System.nanoTime();
40 |             logger.debug("Computing weights between words...");
41 |             for (Relationship follows : graph.getAllRelationships()) {
42 |                 if (!follows.isType(FOLLOWS)) {
43 |                     continue;
44 |                 }
45 |                 // Accept any numeric 'freq' (not only a Double) instead of failing on the cast.
46 |                 double freq = ((Number) follows.getProperty("freq", 1.0)).doubleValue();
47 |                 follows.setProperty("weight", 1.0 / freq);
48 |                 total += 1;
49 |                 if (total % 50 == 0) {
50 |                     logger.debug("{} relationships analysed so far...", total);
51 |                 }
52 |             }
53 |             // The message names milliseconds, so convert nanoseconds to milliseconds (not seconds).
54 |             double millis = (System.nanoTime() - started) / NANOS_PER_MILLI;
55 |             logger.info("{} relationship/s analysed in {} ms.", total, String.format("%,.3f", millis));
56 |             tx.success();
57 |         }
58 |     }
59 | }
60 | 


--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/model/components/impl/DefaultPathCompressor.java:
--------------------------------------------------------------------------------
 1 | package org.stefano.distributional.model.components.impl;
 2 | 
 3 | import org.neo4j.graphalgo.GraphAlgoFactory;
 4 | import org.neo4j.graphalgo.PathFinder;
 5 | import org.neo4j.graphdb.GraphDatabaseService;
 6 | import org.neo4j.graphdb.Path;
 7 | import org.neo4j.graphdb.Relationship;
 8 | import org.neo4j.graphdb.Transaction;
 9 | import org.slf4j.Logger;
10 | import org.slf4j.LoggerFactory;
11 | import org.stefano.distributional.model.components.GraphModel;
12 | import org.stefano.distributional.model.components.PathCompressor;
13 | 
14 | import java.util.Optional;
15 | import java.util.Set;
16 | import java.util.TreeSet;
17 | 
18 | /**
19 |  * Default {@link PathCompressor}: enumerates the paths between the {@code START} and {@code END} nodes
20 |  * of a {@code word graph} and decodes the cheapest acceptable one into the compressive summary.
21 |  */
22 | public final class DefaultPathCompressor implements PathCompressor {
23 | 
24 |     private static final Logger logger = LoggerFactory.getLogger(DefaultPathCompressor.class);
25 | 
26 |     /**
27 |      * Finds every path of at most {@code maxDepth} from {@code START} to {@code END}, keeps those that are
28 |      * at least {@link PathCompressor#MIN_DEPTH} long and contain a verb, and decodes the one with the
29 |      * lowest total {@code weight} (a relationship without that property counts as {@code 1.0}).
30 |      * <p>
31 |      * NOTE(review): the transaction is never marked as successful, so anything written while it is open
32 |      * (e.g. the {@code freq} bump done by {@link GraphModel#start} and {@link GraphModel#end}) is
33 |      * presumably discarded when it closes; confirm this is intended.
34 |      *
35 |      * @param graph    the {@link GraphDatabaseService} with the {@code word graph} to be summarised
36 |      * @param maxDepth the upper bound limit on the paths' length
37 |      * @return the decoded summary, or an empty {@link Optional} if no acceptable path exists
38 |      * @throws NullPointerException if {@code graph} is {@code null}
39 |      */
40 |     @Override
41 |     public Optional<String> compress(GraphDatabaseService graph, int maxDepth) {
42 |         java.util.Objects.requireNonNull(graph, "'graph' is null");
43 | 
44 |         try (Transaction tx = graph.beginTx()) {
45 |             long started = System.nanoTime();
46 |             logger.debug("Computing all the paths between START and END nodes and their costs...");
47 |             int total = 0;
48 |             Set<CostPath> paths = new TreeSet<>();
49 |             PathFinder<Path> finder = GraphAlgoFactory.allPaths(EXPANDER, maxDepth);
50 |             for (Path path : finder.findAllPaths(GraphModel.start(graph), GraphModel.end(graph))) {
51 |                 if (path.length() >= PathCompressor.MIN_DEPTH && PathCompressor.hasVerb(path)) {
52 |                     double cost = 0.0;
53 |                     for (Relationship follows : path.relationships()) {
54 |                         cost += (double) follows.getProperty("weight", 1.0);
55 |                     }
56 |                     paths.add(new CostPath(path, cost));
57 |                 }
58 |                 total += 1;
59 |             }
60 |             // Log the time actually spent since 'started', in the milliseconds the message names.
61 |             double millis = (System.nanoTime() - started) / 1_000_000.0;
62 |             logger.info("{} valid path/s found (out of {} possible) in {} ms.",
63 |                     paths.size(), total, String.format("%,.3f", millis));
64 |             if (paths.isEmpty()) {
65 |                 return Optional.empty();
66 |             }
67 |             logger.debug("Generating the compressive summary");
68 |             return PathCompressor.decode(paths.iterator().next().getPath());
69 |         }
70 |     }
71 | 
72 | }
73 | 


--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/model/components/GraphModel.java:
--------------------------------------------------------------------------------
 1 | package org.stefano.distributional.model.components;
 2 | 
 3 | import org.neo4j.graphdb.*;
 4 | 
 5 | import static java.util.Objects.requireNonNull;
 6 | 
 7 | /**
 8 |  * Definitions for {@code word graphs}: the labels and relationship types used by the model,
 9 |  * plus helpers to reach the {@code START} and {@code END} nodes.
10 |  */
11 | public class GraphModel {
12 | 
13 |     public static final Label SENTENCE = Label.label("SENTENCE");
14 |     public static final Label START = Label.label("START");
15 |     public static final Label WORD = Label.label("WORD");
16 |     public static final Label END = Label.label("END");
17 |     public static final Label VERB = Label.label("VERB");
18 |     public static final RelationshipType FOLLOWS = RelationshipType.withName("FOLLOWS");
19 |     public static final RelationshipType CONTAINS = RelationshipType.withName("CONTAINS");
20 | 
21 |     private GraphModel() {
22 |         throw new UnsupportedOperationException("'GraphModel' class should not be instantiated");
23 |     }
24 | 
25 |     /**
26 |      * Returns the {@code START} node in the given {@code graph}.
27 |      * If such node exists, its {@code freq} property is increased by 1 and the node is returned.
28 |      * If it doesn't, the node is created with {@code freq == 1} and returned.
29 |      *
30 |      * @param graph the target {@link GraphDatabaseService}
31 |      * @return the updated {@code START} node, or a newly created instance
32 |      * @throws NullPointerException if {@code graph} is {@code null}
33 |      */
34 |     public static Node start(GraphDatabaseService graph) {
35 |         requireNonNull(graph, "'graph' is null");
36 | 
37 |         return terminal(graph, START);
38 |     }
39 | 
40 |     /**
41 |      * Returns the {@code END} node in the given {@code graph}.
42 |      * If such node exists, its {@code freq} property is increased by 1 and the node is returned.
43 |      * If it doesn't, the node is created with {@code freq == 1} and returned.
44 |      *
45 |      * @param graph the target {@link GraphDatabaseService}
46 |      * @return the updated {@code END} node, or a newly created instance
47 |      * @throws NullPointerException if {@code graph} is {@code null}
48 |      */
49 |     public static Node end(GraphDatabaseService graph) {
50 |         requireNonNull(graph, "'graph' is null");
51 | 
52 |         return terminal(graph, END);
53 |     }
54 | 
55 |     /**
56 |      * Returns the first node carrying the given {@code label} after adding 1 to its {@code freq}
57 |      * property, or creates that node with {@code freq == 1} when the graph has none.
58 |      *
59 |      * @param graph the target {@link GraphDatabaseService}
60 |      * @param label the label identifying the terminal node
61 |      * @return the updated or newly created node
62 |      */
63 |     private static Node terminal(GraphDatabaseService graph, Label label) {
64 |         requireNonNull(graph, "'graph' is null");
65 | 
66 |         // Only the first match is read, so the iterator is never exhausted:
67 |         // close it explicitly to release the resources it holds.
68 |         try (ResourceIterator<Node> nodes = graph.findNodes(label)) {
69 |             if (nodes.hasNext()) {
70 |                 Node node = nodes.next();
71 |                 double freq = (double) node.getProperty("freq", 1.0);
72 |                 node.setProperty("freq", 1.0 + freq);
73 |                 return node;
74 |             }
75 |         }
76 |         Node node = graph.createNode(label);
77 |         node.setProperty("freq", 1.0);
78 |         return node;
79 |     }
80 | 
81 | }
82 | 


--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/model/components/impl/AdvancedGraphWeigher.java:
--------------------------------------------------------------------------------
 1 | package org.stefano.distributional.model.components.impl;
 2 | 
 3 | import org.neo4j.graphdb.*;
 4 | import org.slf4j.Logger;
 5 | import org.slf4j.LoggerFactory;
 6 | import org.stefano.distributional.model.components.GraphWeigher;
 7 | 
 8 | import static java.util.Objects.requireNonNull;
 9 | import static org.stefano.distributional.model.components.GraphModel.CONTAINS;
10 | import static org.stefano.distributional.model.components.GraphModel.FOLLOWS;
11 | 
12 | /**
13 |  * This class provides an advanced method to weight the {@code FOLLOWS} relationships in a {@code word graph}.
14 |  * This method generate weights that are inversely proportional to the frequency of paths scaled down by their length.
15 |  */
16 | public final class AdvancedGraphWeigher implements GraphWeigher {
17 | 
18 |     private static final Logger logger = LoggerFactory.getLogger(AdvancedGraphWeigher.class);
19 | 
20 |     @Override
21 |     public void weight(GraphDatabaseService graph) {
22 |         requireNonNull(graph, "'graph' is null");
23 | 
24 |         int total = 0;
25 |         try (Transaction tx = graph.beginTx()) {
26 |             long elapsed = System.nanoTime();
27 |             logger.debug("Computing weights between words...");
28 |             for (Relationship follows : graph.getAllRelationships()) {
29 |                 if (follows.isType(FOLLOWS)) {
30 |                     Node tail = follows.getStartNode();
31 |                     Node head = follows.getEndNode();
32 |                     double freqTail = (double) tail.getProperty("freq", 1.0);
33 |                     double freqHead = (double) head.getProperty("freq", 1.0);
34 |                     double denom = 0.0;
35 |                     for (Relationship containsTail : tail.getRelationships(CONTAINS, Direction.INCOMING)) {
36 |                         int posTail = (int) containsTail.getProperty("pos", 0);
37 |                         Node sentence = containsTail.getStartNode();
38 |                         for (Relationship containsHead : sentence.getRelationships(CONTAINS, Direction.OUTGOING)) {
39 |                             if (containsHead.getEndNode().equals(head)) {
40 |                                 int posHead = (int) containsHead.getProperty("pos", 0);
41 |                                 denom += 1.0 / (posHead - posTail);
42 |                             }
43 |                         }
44 |                     }
45 |                     double weight = (freqTail + freqHead) / denom;
46 |                     weight = (weight) / (freqTail * freqHead);
47 |                     follows.setProperty("weight", weight);
48 |                     total += 1;
49 |                     if (total % 50 == 0) {
50 |                         logger.debug("{} relationships analysed so far...", total);
51 |                     }
52 |                 }
53 |             }
54 |             elapsed = System.nanoTime() - elapsed;
55 |             logger.info("{} relationship/s analysed in {} ms.",
56 |                     total, String.format("%,.3f", elapsed / 1_000_000_000.0));
57 |             tx.success();
58 |         }
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Created by .ignore support plugin (hsz.mobi)
  2 | ### JetBrains template
  3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
  4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
  5 | 
  6 | # User-specific stuff:
  7 | .idea
  8 | .idea/workspace.xml
  9 | .idea/tasks.xml
 10 | .idea/dictionaries
 11 | .idea/vcs.xml
 12 | .idea/jsLibraryMappings.xml
 13 | 
 14 | # Sensitive or high-churn files:
 15 | .idea/dataSources.ids
 16 | .idea/dataSources.xml
 17 | .idea/dataSources.local.xml
 18 | .idea/sqlDataSources.xml
 19 | .idea/dynamic.xml
 20 | .idea/uiDesigner.xml
 21 | 
 22 | # Gradle:
 23 | .idea/gradle.xml
 24 | .idea/libraries
 25 | 
 26 | # Mongo Explorer plugin:
 27 | .idea/mongoSettings.xml
 28 | 
 29 | ## File-based project format:
 30 | *.iws
 31 | 
 32 | ## Plugin-specific files:
 33 | 
 34 | # IntelliJ
 35 | /out/
 36 | 
 37 | # mpeltonen/sbt-idea plugin
 38 | .idea_modules/
 39 | 
 40 | # JIRA plugin
 41 | atlassian-ide-plugin.xml
 42 | 
 43 | # Crashlytics plugin (for Android Studio and IntelliJ)
 44 | com_crashlytics_export_strings.xml
 45 | crashlytics.properties
 46 | crashlytics-build.properties
 47 | fabric.properties
 48 | ### Java template
 49 | *.class
 50 | 
 51 | # Mobile Tools for Java (J2ME)
 52 | .mtj.tmp/
 53 | 
 54 | # Package Files #
 55 | *.jar
 56 | *.war
 57 | *.ear
 58 | 
 59 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
 60 | hs_err_pid*
 61 | ### Maven template
 62 | target/
 63 | pom.xml.tag
 64 | pom.xml.releaseBackup
 65 | pom.xml.versionsBackup
 66 | pom.xml.next
 67 | release.properties
 68 | dependency-reduced-pom.xml
 69 | buildNumber.properties
 70 | .mvn/timing.properties
 71 | ### OSX template
 72 | *.DS_Store
 73 | .AppleDouble
 74 | .LSOverride
 75 | 
 76 | # Icon must end with two \r
 77 | Icon
 78 | 
 79 | # Thumbnails
 80 | ._*
 81 | 
 82 | # Files that might appear in the root of a volume
 83 | .DocumentRevisions-V100
 84 | .fseventsd
 85 | .Spotlight-V100
 86 | .TemporaryItems
 87 | .Trashes
 88 | .VolumeIcon.icns
 89 | .com.apple.timemachine.donotpresent
 90 | 
 91 | # Directories potentially created on remote AFP share
 92 | .AppleDB
 93 | .AppleDesktop
 94 | Network Trash Folder
 95 | Temporary Items
 96 | .apdisk
 97 | ### Windows template
 98 | # Windows image file caches
 99 | Thumbs.db
100 | ehthumbs.db
101 | 
102 | # Folder config file
103 | Desktop.ini
104 | 
105 | # Recycle Bin used on file shares
106 | $RECYCLE.BIN/
107 | 
108 | # Windows Installer files
109 | *.cab
110 | *.msi
111 | *.msm
112 | *.msp
113 | 
114 | # Windows shortcuts
115 | *.lnk
116 | ### Linux template
117 | *~
118 | 
119 | # temporary files which can be created if a process still has a handle open of a deleted file
120 | .fuse_hidden*
121 | 
122 | # KDE directory preferences
123 | .directory
124 | 
125 | # Linux trash folder which might appear on any partition or disk
126 | .Trash-*
127 | ### Gradle template
128 | .gradle
129 | build/
130 | 
131 | # Ignore Gradle GUI config
132 | gradle-app.setting
133 | 
134 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored)
135 | !gradle-wrapper.jar
136 | 
137 | # Cache of project
138 | .gradletasknamecache
139 | 
140 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898
141 | # gradle/wrapper/gradle-wrapper.properties
142 | .idea/
143 | src/test/
144 | 


--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/Main.java:
--------------------------------------------------------------------------------
 1 | package org.stefano.distributional;
 2 | 
 3 | import org.slf4j.Logger;
 4 | import org.slf4j.LoggerFactory;
 5 | import org.stefano.distributional.model.Summarizer;
 6 | import org.stefano.distributional.model.components.impl.AdvancedGraphWeigher;
 7 | import org.stefano.distributional.model.components.impl.DefaultGraphEncoder;
 8 | import org.stefano.distributional.model.components.impl.DefaultPathCompressor;
 9 | import org.stefano.distributional.utils.OpenNLP;
10 | 
11 | import java.io.IOException;
12 | import java.nio.file.Path;
13 | import java.nio.file.Paths;
14 | import java.util.Arrays;
15 | import java.util.Collection;
16 | import java.util.List;
17 | import java.util.Optional;
18 | 
19 | /**
20 |  * Demo entry point: compresses four hard-coded, related sentences into a single summary.
21 |  * <p>
22 |  * Created by stefano on 23/01/2017.
23 |  */
24 | public class Main {
25 | 
26 |     private static final Logger logger = LoggerFactory.getLogger(Main.class);
27 | 
28 |     /**
29 |      * Builds a {@link Summarizer} with the default encoder, the advanced weigher and the default
30 |      * compressor, runs it on the sample sentences and prints the summary, if any, to standard output.
31 |      *
32 |      * @param args {@code args[0]} is the folder handed to the summarizer (presumably where the
33 |      *             word graph is stored; confirm against {@link Summarizer})
34 |      * @throws IllegalArgumentException if the folder argument is missing
35 |      * @throws IOException              declared by this method's signature; no call visible here shows its source
36 |      */
37 |     public static void main(String[] args) throws IOException {
38 |         // Fail with a usage hint rather than a bare ArrayIndexOutOfBoundsException.
39 |         if (args == null || args.length == 0) {
40 |             throw new IllegalArgumentException("Missing argument: usage is 'Main <folder>'");
41 |         }
42 |         Path folder = Paths.get(args[0]);
43 | 
44 |         List<String> sentences = Arrays.asList(
45 |                 "The wife of a former U.S. president Bill Clinton, Hillary Clinton, visited China last Monday.",
46 |                 "Hillary Clinton wanted to visit China last month but postponed her plans till Monday last week.",
47 |                 "Hillary Clinton paid a visit to the People Republic of China on Monday.",
48 |                 "Last week the Secretary State Ms. Clinton visited Chinese officials.");
49 | 
50 |         Collection<String> stopWords = Arrays.asList("a", "able", "about", "above", "after", "all", "also", "an",
51 |                 "and", "any", "as", "ask", "at", "back", "bad", "be", "because", "beneath", "big", "but", "by",
52 |                 "call", "can", "case", "child", "come", "company", "could", "day", "different", "do", "early", "even",
53 |                 "eye", "fact", "feel", "few", "find", "first", "for", "from", "get", "give", "go", "good",
54 |                 "government", "great", "group", "hand", "have", "he", "her", "high", "him", "his", "how", "i", "if",
55 |                 "important", "in", "into", "it", "its", "just", "know", "large", "last", "leave", "life", "like",
56 |                 "little", "long", "look", "make", "man", "me", "most", "my", "new", "next", "no", "not", "now",
57 |                 "number", "of", "old", "on", "one", "only", "or", "other", "our", "out", "over", "own", "part",
58 |                 "people", "person", "place", "point", "problem", "public", "right", "same", "say", "see", "seem",
59 |                 "she", "small", "so", "some", "take", "tell", "than", "that", "the", "their", "them", "then", "there",
60 |                 "these", "they", "thing", "think", "this", "time", "to", "try", "two", "under", "up", "us", "use",
61 |                 "want", "way", "we", "week", "well", "what", "when", "which", "who", "will", "with", "woman", "work",
62 |                 "world", "would", "year", "you", "young", "your");
63 | 
64 |         Summarizer summarizer = Summarizer.builder()
65 |                 .on(folder)
66 |                 .withEncoder(new DefaultGraphEncoder())
67 |                 .withWeigher(new AdvancedGraphWeigher())
68 |                 .withCompressor(new DefaultPathCompressor())
69 |                 .build();
70 |         Optional<String> summary = summarizer.process(sentences, stopWords);
71 |         if (summary.isPresent()) {
72 |             System.out.println(" >> " + summary.get());
73 |         } else {
74 |             logger.info("No summary available.");
75 |         }
76 |         logger.info("Done.");
77 |     }
78 | 
79 | }
80 | 


--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/utils/OpenNLP.java:
--------------------------------------------------------------------------------
  1 | package org.stefano.distributional.utils;
  2 | 
  3 | import opennlp.tools.postag.POSModel;
  4 | import opennlp.tools.postag.POSTagger;
  5 | import opennlp.tools.postag.POSTaggerME;
  6 | import opennlp.tools.sentdetect.SentenceDetector;
  7 | import opennlp.tools.sentdetect.SentenceDetectorME;
  8 | import opennlp.tools.sentdetect.SentenceModel;
  9 | import opennlp.tools.tokenize.Tokenizer;
 10 | import opennlp.tools.tokenize.TokenizerME;
 11 | import opennlp.tools.tokenize.TokenizerModel;
 12 | import org.slf4j.Logger;
 13 | import org.slf4j.LoggerFactory;
 14 | 
 15 | import java.io.IOException;
 16 | import java.io.InputStream;
 17 | 
 18 | /**
 19 |  * TODO Replace with proper description...
 20 |  * <p>
 21 |  * Created by stefano on 23/01/2017.
 22 |  */
 23 | public class OpenNLP {
 24 | 
 25 |     private static final Logger logger = LoggerFactory.getLogger(OpenNLP.class);
 26 |     private static SentenceDetector detector = null;
 27 |     private static Tokenizer tokenizer = null;
 28 |     private static POSTagger tagger = null;
 29 | 
 30 |     private OpenNLP() {
 31 |         throw new UnsupportedOperationException("'OpenNLP' class should not be instantiated");
 32 |     }
 33 | 
 34 |     public static SentenceDetector getSentenceDetector() {
 35 |         if (detector == null) {
 36 |             InputStream stream = OpenNLP.class.getResourceAsStream("/en-sent.bin");
 37 |             try {
 38 |                 SentenceModel model = new SentenceModel(stream);
 39 |                 detector = new SentenceDetectorME(model);
 40 |                 logger.info("OpenNLP sentence detector lazily initialised");
 41 |             } catch (IOException e) {
 42 |                 e.printStackTrace();
 43 |             } finally {
 44 |                 if (stream != null) {
 45 |                     try {
 46 |                         stream.close();
 47 |                     } catch (IOException e) {
 48 |                         e.printStackTrace();
 49 |                     }
 50 |                 }
 51 |             }
 52 |         }
 53 |         return detector;
 54 |     }
 55 | 
 56 |     public static Tokenizer getTokenizer() {
 57 |         if (tokenizer == null) {
 58 |             InputStream stream = OpenNLP.class.getResourceAsStream("/en-token.bin");
 59 |             try {
 60 |                 TokenizerModel model = new TokenizerModel(stream);
 61 |                 tokenizer = new TokenizerME(model);
 62 |                 logger.info("OpenNLP tokenizer lazily initialised");
 63 |             } catch (IOException e) {
 64 |                 e.printStackTrace();
 65 |             } finally {
 66 |                 if (stream != null) {
 67 |                     try {
 68 |                         stream.close();
 69 |                     } catch (IOException e) {
 70 |                         e.printStackTrace();
 71 |                     }
 72 |                 }
 73 |             }
 74 |         }
 75 |         return tokenizer;
 76 |     }
 77 | 
 78 |     public static POSTagger getPOSTagger() {
 79 |         if (tagger == null) {
 80 |             InputStream stream = OpenNLP.class.getResourceAsStream("/en-pos-maxent.bin");
 81 |             try {
 82 |                 POSModel model = new POSModel(stream);
 83 |                 tagger = new POSTaggerME(model);
 84 |                 logger.info("OpenNLP POS tagger lazily initialised");
 85 |             } catch (IOException e) {
 86 |                 e.printStackTrace();
 87 |             } finally {
 88 |                 if (stream != null) {
 89 |                     try {
 90 |                         stream.close();
 91 |                     } catch (IOException e) {
 92 |                         e.printStackTrace();
 93 |                     }
 94 |                 }
 95 |             }
 96 |         }
 97 |         return tagger;
 98 |     }
 99 | 
100 | }
101 | 


--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/model/components/PathCompressor.java:
--------------------------------------------------------------------------------
  1 | package org.stefano.distributional.model.components;
  2 | 
  3 | import org.neo4j.graphdb.*;
  4 | 
  5 | import java.util.Optional;
  6 | 
  7 | import static java.util.Objects.requireNonNull;
  8 | 
  9 | /**
 10 |  * This interface provides a method to generate a compressive summary from a {@code word graph},
 11 |  * together with the helpers and constants shared by its implementations.
 12 |  */
 13 | public interface PathCompressor {
 14 | 
 15 |     /** Expander that only follows outgoing {@code FOLLOWS} relationships. */
 16 |     PathExpander<Object> EXPANDER = PathExpanderBuilder.empty().add(GraphModel.FOLLOWS, Direction.OUTGOING).build();
 17 | 
 18 |     /** Minimum length a path must have to be considered for a summary. */
 19 |     int MIN_DEPTH = 8;
 20 | 
 21 |     /**
 22 |      * Checks whether the given {@code path} contains at least a {@code verb}.
 23 |      *
 24 |      * @param path the {@link Path} to be checked
 25 |      * @return {@code true} if the given {@code path} contains a {@code verb}, {@code false} otherwise
 26 |      */
 27 |     static boolean hasVerb(Path path) {
 28 |         requireNonNull(path, "'path' is null");
 29 | 
 30 |         for (Node node : path.nodes()) {
 31 |             if (node.hasLabel(GraphModel.VERB)) {
 32 |                 return true;
 33 |             }
 34 |         }
 35 |         return false;
 36 |     }
 37 | 
 38 |     /**
 39 |      * Decodes the given {@code path} into a sentence: the {@code word} properties of its nodes joined
 40 |      * by spaces and terminated by a full stop. Nodes without a {@code word} contribute nothing.
 41 |      *
 42 |      * @param path the {@link Path} to be decoded
 43 |      * @return the decoded sentence, or an empty {@link Optional} if the joined text is empty
 44 |      */
 45 |     static Optional<String> decode(Path path) {
 46 |         requireNonNull(path, "'path' is null");
 47 | 
 48 |         String sentence = "";
 49 |         for (Node node : path.nodes()) {
 50 |             // Trimming at every step drops the separator left behind by nodes without a word.
 51 |             sentence = (sentence + " " + node.getProperty("word", "")).trim();
 52 |         }
 53 |         if (!sentence.isEmpty()) {
 54 |             return Optional.of(sentence + ".");
 55 |         }
 56 |         return Optional.empty();
 57 |     }
 58 | 
 59 |     /**
 60 |      * This method finds in the given {@code graph} all the paths from {@code start} to {@code end} that only use
 61 |      * {@code FOLLOWS} relationships and are no longer than the given {@code maxDepth}.
 62 |      * The paths that have no verbs or are shorter than {@link #MIN_DEPTH} are ignored because they likely lead
 63 |      * to poor summaries.
 64 |      * All the other paths are sorted by increasing cost, which is the sum of the weights
 65 |      * on the {@code FOLLOWS} relationships of each path.
 66 |      * The minimal cost path most likely contains the most important concepts that summarise the graph
 67 |      * in a grammatically sound way.
 68 |      * If such path exists, it is decoded to generate the summary to return.
 69 |      *
 70 |      * @param graph    the {@link GraphDatabaseService} with the {@code word graph} to be summarised
 71 |      * @param maxDepth the upper bound limit on the paths' length
 72 |      * @return the string that best summarises the graph, if any
 73 |      */
 74 |     Optional<String> compress(GraphDatabaseService graph, int maxDepth);
 75 | 
 76 |     /**
 77 |      * A {@link Path} associated with its {@code cost}.
 78 |      * Instances are ordered by increasing cost, then by increasing length; remaining ties between
 79 |      * different paths are broken deterministically by the ids of their start node and relationships.
 80 |      */
 81 |     final class CostPath implements Comparable<CostPath> {
 82 |         private final Path path;
 83 |         private final double cost;
 84 | 
 85 |         public CostPath(Path path, double cost) {
 86 |             this.path = requireNonNull(path, "'path' is null");
 87 |             this.cost = cost;
 88 |         }
 89 | 
 90 |         public Path getPath() {
 91 |             return path;
 92 |         }
 93 | 
 94 |         public double getCost() {
 95 |             return cost;
 96 |         }
 97 | 
 98 |         /**
 99 |          * Compares by cost, then by path length, then by start node id and relationship ids.
100 |          * The tie-break must be stable: a time-dependent result breaks the {@link Comparable} contract
101 |          * and lets sorted collections misplace or drop elements.
102 |          *
103 |          * @param other the {@link CostPath} to compare with
104 |          * @return a negative, zero or positive value as this instance sorts before, with or after {@code other}
105 |          * @throws NullPointerException if {@code other} is {@code null}
106 |          */
107 |         @Override
108 |         public int compareTo(CostPath other) {
109 |             requireNonNull(other, "'other' is null");
110 | 
111 |             int result = Double.compare(cost, other.cost);
112 |             if (result != 0 || path.equals(other.path)) {
113 |                 return result;
114 |             }
115 |             result = Integer.compare(path.length(), other.path.length());
116 |             if (result != 0) {
117 |                 return result;
118 |             }
119 |             // Same cost and length: order by the ids along the two paths, which is repeatable.
120 |             result = Long.compare(path.startNode().getId(), other.path.startNode().getId());
121 |             java.util.Iterator<Relationship> mine = path.relationships().iterator();
122 |             java.util.Iterator<Relationship> theirs = other.path.relationships().iterator();
123 |             while (result == 0 && mine.hasNext() && theirs.hasNext()) {
124 |                 result = Long.compare(mine.next().getId(), theirs.next().getId());
125 |             }
126 |             return result;
127 |         }
128 |     }
129 | }
130 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Multi-Sentence Compression
 2 | ====
 3 | 
 4 | Compressing a cluster of related sentences into a single sentence that retains the most relevant non-redundant concepts from the original cluster and is grammatically sound is a complex task.
 5 | 
  6 | This project implements the method suggested in ["Multi-Sentence Compression: Finding Shortest Paths in Word Graphs"](http://www.aclweb.org/anthology/C10-1037) (**Katja Filippova.** Google Inc. _In Proc of 23rd Intl Conf COLING, 2010._) which is based upon shortest paths in word graphs.
 7 | 
 8 | Specifically, we use:
  9 | * [OpenNLP](https://opennlp.apache.org) for basic sentence detection, tokenisation and POS tagging
10 | * [Neo4j](https://neo4j.com) for graph generation and traversal
11 | * Wikipedia's list of [most common words in English](https://en.wikipedia.org/wiki/Most_common_words_in_English).
12 | 
13 | The procedure consists in:
14 | * generating a `word graph`
15 | * weighting the **edges** between _words_
16 | * compressing the graph into a _meaningful summary_.
17 | 
18 | Word graph
19 | ----
20 | 
21 | A `word graph` (or `adjacency text graph`) is a directed graph where:
22 | * _words_ become **nodes** (punctuation is ignored)
23 | * _adjacent words_ are connected by **edges** (type: _FOLLOWS_)
24 | * _frequencies_ of words and adjacencies are saved on both **nodes** and **edges**.
25 | 
26 | The **lower case text** and **POS tag** of each _word_ act as key, so that words with the same grammatical usage are unique in the graph.   
 27 | The only exception to this rule is for [stop-words](https://en.wikipedia.org/wiki/Most_common_words_in_English) which are always duplicated (if not involved in a _syntagmatic association_ with a relevant word) to keep their _frequencies_ (and importance in the graph) low.
28 | 
29 | Our data model also includes a **node** to represent each _sentence_ (with _id_) and as many _CONTAINS_ **edges** as _words_ in each sentence (with their relative _pos_). The chain of _words_ of each sentence is also preceded by a _START_ **node** and followed by an _END_ **node**.
30 | 
31 | Given the following cluster of related sentences:
32 | 
33 | 1. _The wife of a former U.S. president Bill Clinton, Hillary Clinton, visited China last Monday._
34 | 2. _Hillary Clinton wanted to visit China last month but postponed her plans till Monday last week._
35 | 3. _Hillary Clinton paid a visit to the People Republic of China on Monday._
36 | 4. _Last week the Secretary State Ms. Clinton visited Chinese officials._
37 | 
38 | the resulting `word graph` is presented below.
39 | 
40 | ![Word graph for the example cluster](/images/word-graph.png)
41 | 
42 | Weights
43 | ----
44 | 
45 | Both weight methods discussed in the [original paper](http://www.aclweb.org/anthology/C10-1037) have been implemented.
46 | 
47 | The **naive** method simply considers the inverse of the _frequency_ of each _FOLLOWS_ **edge**.
48 | 
 49 | The **advanced** method is more sophisticated as it takes into account **syntagmatic associations** scaled by 
50 | the relative distance of the terms in their enclosing sentences.
51 |  
52 | In particular:
53 |  
54 |                             freq(i) + freq(j)
55 |     w'(edge(i, j)) = ------------------------------
56 |                       SUM(s in S) diff(s, i, j)^-1 
57 | 
58 |                     | pos(s, j) - pos(s, i)    if pos(s, i) < pos(s, j)
59 |     diff(s, i, j) = | 
60 |                     | 0                        otherwise
61 | 
 62 |                         w'(edge(i, j))
 63 |     w"(edge(i, j)) = -------------------
 64 |                       freq(i) x freq(j)
65 | 
66 | Notice that these weights are costs: the lower, the better.
67 | 
68 | Compression
69 | ----
70 | 
71 | The goal of this step is to generalise the input sentences by generating an appropriate compression (inductive task).
72 | All the **paths** from _START_ to _END_ describe all the possible _worlds_ that can be reached upon summarisation.
73 | 
74 | In order to obtain sound summaries, we require paths to be at least **8 words** long and to contain **at least a verb**.
75 | The remaining paths are ranked by **increasing cost**, which is the sum of the weights on their **edges** normalised by **path length**.
76 | 
77 | By visiting the _words_ in the **minimal cost path** (if any), the desired compression summary is generated.
78 | 
79 | Results
80 | ----
81 | 
82 | The project is organised as a [Gradle Application](https://docs.gradle.org/current/userguide/application_plugin.html), 
83 | therefore it is sufficient to issue the following command on the terminal in the root folder of the project 
 84 | (provided that [Gradle](https://gradle.org) is installed locally):
85 | 
86 |     gradle clean run
87 | 
88 | The example introduced above, for instance, produces the following output:
89 | 
90 | ![Output for the example cluster](/images/output.png)
91 | 
92 | which includes the following summary: 
93 | 
94 |     Hillary Clinton wanted to visit China last week.
95 | 
 96 | The algorithm has been successfully applied to English and Spanish by using an _ad-hoc_ **stop-word list** of about 600 terms.
97 | The experimental results are discussed in the [original paper](http://www.aclweb.org/anthology/C10-1037).


--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/model/Summarizer.java:
--------------------------------------------------------------------------------
  1 | package org.stefano.distributional.model;
  2 | 
  3 | import org.neo4j.graphdb.GraphDatabaseService;
  4 | import org.neo4j.graphdb.factory.GraphDatabaseFactory;
  5 | import org.slf4j.Logger;
  6 | import org.slf4j.LoggerFactory;
  7 | import org.stefano.distributional.model.components.GraphEncoder;
  8 | import org.stefano.distributional.model.components.GraphWeigher;
  9 | import org.stefano.distributional.model.components.PathCompressor;
 10 | 
 11 | import java.io.IOException;
 12 | import java.nio.file.FileVisitResult;
 13 | import java.nio.file.Files;
 14 | import java.nio.file.Path;
 15 | import java.nio.file.SimpleFileVisitor;
 16 | import java.nio.file.attribute.BasicFileAttributes;
 17 | import java.util.Collection;
 18 | import java.util.List;
 19 | import java.util.Optional;
 20 | 
 21 | import static java.util.Objects.requireNonNull;
 22 | 
 23 | /**
 24 |  * A facade for {@link GraphEncoder}, {@link GraphWeigher} and {@link PathCompressor} to summarise {@code sentences}.
 25 |  */
 26 | public final class Summarizer {
 27 | 
 28 |     private static final Logger logger = LoggerFactory.getLogger(Summarizer.class);
 29 | 
 30 |     private final Path folder;
 31 |     private final GraphEncoder encoder;
 32 |     private final GraphWeigher weigher;
 33 |     private final PathCompressor compressor;
 34 | 
 35 |     private Summarizer(SummarizerBuilder builder) {
 36 |         requireNonNull(builder, "'builder' is null");
 37 |         this.folder = builder.currentFolder;
 38 |         this.encoder = builder.currentEncoder;
 39 |         this.weigher = builder.currentWeigher;
 40 |         this.compressor = builder.currentCompressor;
 41 |     }
 42 | 
 43 |     /**
 44 |      * Returns a {@code builder} for {@link Summarizer}.
 45 |      *
 46 |      * @return a {@code builder} for {@link Summarizer}
 47 |      */
 48 |     public static RequiresFolder builder() {
 49 |         return new SummarizerBuilder();
 50 |     }
 51 | 
 52 |     /**
 53 |      * Process the given {@code sentences} with respect to the given {@code stopWords} and returns
 54 |      * the equivalent {@code multi-sentence compression}, if any.
 55 |      *
 56 |      * @param sentences the {@link List<String>} to compress
 57 |      * @param stopWords the {@link Collection<String>} of common words
 58 |      * @return the equivalent {@code multi-sentence compression}, if any
 59 |      */
 60 |     public Optional<String> process(List<String> sentences, Collection<String> stopWords) {
 61 |         requireNonNull(sentences, "'sentences' is null");
 62 |         requireNonNull(stopWords, "'stopWords' is null");
 63 | 
 64 |         if (sentences.isEmpty()) {
 65 |             return Optional.empty();
 66 |         }
 67 |         long elapsed = System.nanoTime();
 68 |         logger.debug("Compressing the following sentences:\n\t{}", String.join("\n\t", sentences));
 69 |         cleanup();
 70 |         GraphDatabaseService graph = new GraphDatabaseFactory().newEmbeddedDatabase(folder.toFile());
 71 |         int maxLength = encoder.encode(graph, sentences, stopWords);
 72 |         weigher.weight(graph);
 73 |         Optional<String> summary = compressor.compress(graph, maxLength);
 74 |         graph.shutdown();
 75 |         elapsed = System.nanoTime() - elapsed;
 76 |         logger.info("Compression completed in {} ms.", String.format("%,.3f", elapsed / 1_000_000_000.0));
 77 |         return summary;
 78 |     }
 79 | 
 80 |     private void cleanup() {
 81 |         long elapsed = System.nanoTime();
 82 |         logger.debug("Preparing database folder...");
 83 |         if (Files.notExists(folder)) {
 84 |             try {
 85 |                 Files.createDirectories(folder);
 86 |             } catch (IOException e) {
 87 |                 throw new IllegalArgumentException("'folder' can't be created: " + folder, e);
 88 |             }
 89 |         } else {
 90 |             try {
 91 |                 Files.walkFileTree(folder, new SimpleFileVisitor<Path>() {
 92 |                     @Override
 93 |                     public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
 94 |                         Files.delete(file);
 95 |                         return FileVisitResult.CONTINUE;
 96 |                     }
 97 | 
 98 |                     @Override
 99 |                     public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
100 |                         Files.delete(dir);
101 |                         return FileVisitResult.CONTINUE;
102 |                     }
103 |                 });
104 |             } catch (IOException e) {
105 |                 throw new IllegalArgumentException("'folder' can't be deleted: " + folder, e);
106 |             }
107 |         }
108 |         elapsed = System.nanoTime() - elapsed;
109 |         logger.debug("Database ready in {} ms.", String.format("%,.3f", elapsed / 1_000_000_000.0));
110 |     }
111 | 
112 |     /**
113 |      * An helper class to build a {@link Summarizer}.
114 |      */
115 |     public interface RequiresFolder {
116 |         RequiresEncoder on(Path folder);
117 |     }
118 | 
119 |     /**
120 |      * An helper class to build a {@link Summarizer}.
121 |      */
122 |     public interface RequiresEncoder extends RequiresFolder {
123 |         RequiresWeigher withEncoder(GraphEncoder encoder);
124 |     }
125 | 
126 |     /**
127 |      * An helper class to build a {@link Summarizer}.
128 |      */
129 |     public interface RequiresWeigher extends RequiresEncoder {
130 |         RequiresCompressor withWeigher(GraphWeigher weigher);
131 |     }
132 | 
133 |     /**
134 |      * An helper class to build a {@link Summarizer}.
135 |      */
136 |     public interface RequiresCompressor extends RequiresWeigher {
137 |         SummarizerBuilder withCompressor(PathCompressor compressor);
138 |     }
139 | 
140 |     /**
141 |      * An helper class to build a {@link Summarizer}.
142 |      */
143 |     public static class SummarizerBuilder implements RequiresCompressor {
144 | 
145 |         private Path currentFolder;
146 |         private GraphEncoder currentEncoder;
147 |         private GraphWeigher currentWeigher;
148 |         private PathCompressor currentCompressor;
149 | 
150 |         private SummarizerBuilder() {
151 |         }
152 | 
153 |         @Override
154 |         public RequiresEncoder on(Path folder) {
155 |             requireNonNull(folder, "'graph' is null");
156 |             folder = folder.toAbsolutePath().normalize();
157 |             if (Files.exists(folder) && !Files.isDirectory(folder)) {
158 |                 throw new IllegalArgumentException("'graph' is not a folder: " + folder);
159 |             }
160 |             currentFolder = folder;
161 |             return this;
162 |         }
163 | 
164 |         @Override
165 |         public RequiresWeigher withEncoder(GraphEncoder encoder) {
166 |             requireNonNull(encoder, "'encoder' is null");
167 |             currentEncoder = encoder;
168 |             return this;
169 |         }
170 | 
171 |         @Override
172 |         public RequiresCompressor withWeigher(GraphWeigher weigher) {
173 |             requireNonNull(weigher, "'weigher' is null");
174 |             currentWeigher = weigher;
175 |             return this;
176 |         }
177 | 
178 |         @Override
179 |         public SummarizerBuilder withCompressor(PathCompressor compressor) {
180 |             requireNonNull(compressor, "'compressor' is null");
181 |             currentCompressor = compressor;
182 |             return this;
183 |         }
184 | 
185 |         public Summarizer build() {
186 |             return new Summarizer(this);
187 |         }
188 |     }
189 | }
190 | 


--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/model/components/impl/DefaultGraphEncoder.java:
--------------------------------------------------------------------------------
  1 | package org.stefano.distributional.model.components.impl;
  2 | 
  3 | import opennlp.tools.sentdetect.SentenceDetector;
  4 | import org.neo4j.graphdb.*;
  5 | import org.slf4j.Logger;
  6 | import org.slf4j.LoggerFactory;
  7 | import org.stefano.distributional.model.components.GraphEncoder;
  8 | import org.stefano.distributional.model.components.GraphModel;
  9 | import org.stefano.distributional.utils.OpenNLP;
 10 | 
 11 | import java.util.*;
 12 | 
 13 | import static java.util.Objects.requireNonNull;
 14 | import static org.stefano.distributional.model.components.GraphModel.*;
 15 | 
 16 | /**
 17 |  * This class provides the default method to encode some {@code sentences} into a {@code word graph}.
 18 |  */
 19 | public final class DefaultGraphEncoder implements GraphEncoder {
 20 | 
 21 |     private static final Logger logger = LoggerFactory.getLogger(DefaultGraphEncoder.class);
 22 | 
 23 |     private static final SentenceDetector DETECTOR = OpenNLP.getSentenceDetector();
 24 | 
 25 |     @Override
 26 |     public int encode(GraphDatabaseService graph, List<String> sentences, Collection<String> stopWords) {
 27 |         requireNonNull(graph, "'graph' is null");
 28 |         requireNonNull(sentences, "'sentences' is null");
 29 |         requireNonNull(stopWords, "'stopWords' is null");
 30 | 
 31 |         int maxLength = 0;
 32 |         try (Transaction tx = graph.beginTx()) {
 33 |             long elapsed = System.nanoTime();
 34 |             logger.debug("Starting encoding...");
 35 |             int id = 0;
 36 |             for (String content : sentences) {
 37 |                 for (String sentence : DETECTOR.sentDetect(content)) {
 38 |                     Token[] tokens = Token.parse(sentence);
 39 |                     logger.debug("Encoding sentence #{} ({} word/s; punctuation is ignored)...", id, tokens.length);
 40 |                     Node parent = graph.createNode(SENTENCE);
 41 |                     parent.setProperty("id", id++);
 42 |                     parent.setProperty("length", tokens.length);
 43 |                     int pos;
 44 |                     Node previous = GraphModel.start(graph);
 45 |                     for (pos = 0; pos < tokens.length; pos++) {
 46 |                         Node current = tokens[pos].isStopWord(stopWords) ?
 47 |                                 getStopWord(graph, tokens, pos) :
 48 |                                 getWord(graph, tokens, pos);
 49 |                         parent.createRelationshipTo(current, CONTAINS).setProperty("pos", pos);
 50 |                         GraphEncoder.link(previous, current);
 51 |                         previous = current;
 52 |                     }
 53 |                     GraphEncoder.link(previous, GraphModel.end(graph));
 54 |                     maxLength = Integer.max(pos, maxLength);
 55 |                 }
 56 |             }
 57 |             elapsed = System.nanoTime() - elapsed;
 58 |             logger.info("Word graph generated in {} ms.",
 59 |                     String.format("%,.3f", elapsed / 1_000_000_000.0));
 60 |             tx.success();
 61 |         }
 62 |         return maxLength;
 63 |     }
 64 | 
 65 |     private Node getStopWord(GraphDatabaseService graph, Token[] tokens, int pos) {
 66 |         Label label = tokens[pos].getLabel();
 67 |         String text = tokens[pos].getText();
 68 |         ResourceIterator<Node> nodes = graph.findNodes(label, "text", text);
 69 |         if (nodes.hasNext()) {
 70 |             List<Context> contexts = new ArrayList<>();
 71 |             while (nodes.hasNext()) {
 72 |                 Node node = nodes.next();
 73 |                 Context context = getContext(graph, tokens, pos, node);
 74 |                 if (!context.isEmpty()) {
 75 |                     contexts.add(context);
 76 |                 }
 77 |             }
 78 |             if (!contexts.isEmpty()) {
 79 |                 Collections.sort(contexts);
 80 |                 Node node = contexts.get(0).getNode();
 81 |                 node.setProperty("freq", (double) node.getProperty("freq", 1.0) + 1.0);
 82 |                 return node;
 83 |             }
 84 |             return GraphEncoder.word(graph, tokens[pos], true);
 85 |         }
 86 |         return GraphEncoder.word(graph, tokens[pos], true);
 87 |     }
 88 | 
 89 |     private Node getWord(GraphDatabaseService graph, Token[] tokens, int pos) {
 90 |         Label label = tokens[pos].getLabel();
 91 |         String text = tokens[pos].getText();
 92 |         ResourceIterator<Node> nodes = graph.findNodes(label, "text", text);
 93 |         if (nodes.hasNext()) {
 94 |             List<Context> contexts = new ArrayList<>();
 95 |             while (nodes.hasNext()) {
 96 |                 Node node = nodes.next();
 97 |                 Context context = getContext(graph, tokens, pos, node);
 98 |                 contexts.add(context);
 99 |             }
100 |             Collections.sort(contexts);
101 |             Node node = contexts.get(0).getNode();
102 |             node.setProperty("freq", (double) node.getProperty("freq", 1.0) + 1.0);
103 |             return node;
104 |         }
105 |         return GraphEncoder.word(graph, tokens[pos], false);
106 |     }
107 | 
108 |     private Context getContext(GraphDatabaseService graph, Token[] tokens, int pos, Node node) {
109 |         int count = 0;
110 |         double freq = 0.0;
111 |         for (Direction direction : new Direction[]{Direction.INCOMING, Direction.OUTGOING}) {
112 |             Collection<String> texts = getTextsFromToken(tokens, pos, direction, 3);
113 |             if (!texts.isEmpty()) {
114 |                 Map<String, Double> freqTexts = getTextsFromNode(node, direction, 3);
115 |                 texts.retainAll(freqTexts.keySet());
116 |                 count += texts.size();
117 |                 for (String t : texts) {
118 |                     freq += freqTexts.getOrDefault(t, 1.0);
119 |                 }
120 |             }
121 |         }
122 |         return new Context(node, count, freq);
123 |     }
124 | 
125 |     private Collection<String> getTextsFromToken(Token[] tokens, int pos, Direction direction, int distance) {
126 |         if (direction == Direction.BOTH) {
127 |             return Collections.emptySet();
128 |         }
129 |         Set<String> result = new HashSet<>();
130 |         int min = direction == Direction.INCOMING ?
131 |                 Integer.max(0, pos - distance) :
132 |                 pos + 1;
133 |         int max = direction == Direction.INCOMING ?
134 |                 pos : Integer.min(tokens.length, pos + distance + 1);
135 |         for (int i = min; i < max; i++) {
136 |             result.add(tokens[i].getText());
137 |         }
138 |         return result;
139 |     }
140 | 
141 |     private Map<String, Double> getTextsFromNode(Node node, Direction direction, int distance) {
142 |         if (distance < 0) {
143 |             return Collections.emptyMap();
144 |         }
145 |         Map<String, Double> result = new HashMap<>();
146 |         for (Relationship relationship : node.getRelationships(FOLLOWS, direction)) {
147 |             Node other = relationship.getOtherNode(node);
148 |             String text = (String) other.getProperty("text", "");
149 |             double freq = (double) other.getProperty("freq", 1.0);
150 |             if (!text.isEmpty()) {
151 |                 result.put(text, result.getOrDefault(text, 1.0) + freq);
152 |                 if (distance > 1) {
153 |                     Map<String, Double> map = getTextsFromNode(other, direction, distance - 1);
154 |                     for (String mapText : map.keySet()) {
155 |                         double mapFreq = map.get(mapText);
156 |                         result.put(mapText, result.getOrDefault(mapText, 1.0) + mapFreq);
157 |                     }
158 |                 }
159 |             }
160 |         }
161 |         return result;
162 |     }
163 | 
164 | 
165 | }
166 | 


--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/model/components/GraphEncoder.java:
--------------------------------------------------------------------------------
  1 | package org.stefano.distributional.model.components;
  2 | 
  3 | import opennlp.tools.postag.POSTagger;
  4 | import opennlp.tools.tokenize.Tokenizer;
  5 | import org.neo4j.graphdb.*;
  6 | import org.stefano.distributional.utils.OpenNLP;
  7 | 
  8 | import java.util.*;
  9 | 
 10 | import static java.util.Objects.requireNonNull;
 11 | 
 12 | /**
 13 |  * This interface provides a method to encode some {@code sentences} into a {@code word graph}.
 14 |  */
 15 | public interface GraphEncoder {
 16 | 
 17 |     /**
 18 |      * Creates a {@code FOLLOWS} relationship between the given {@code tail} and {@code head} nodes
 19 |      * with {@code frequency} {@code 1.0} if no such relationship already exists, or updates
 20 |      * the {@code frequency} of the existing and eventually returns it.
 21 |      *
 22 |      * @param tail the start {@link Node} of the link to handle
 23 |      * @param head the end {@link Node} of the link to handle
 24 |      * @return the relationship between {@code tail} and {@code head} with updated {@code frequency}
 25 |      * if exists, a newly created relationship with {@code frequency} {@code 1.0} otherwise
 26 |      */
 27 |     static Relationship link(Node tail, Node head) {
 28 |         for (Relationship relationship : tail.getRelationships(GraphModel.FOLLOWS, Direction.OUTGOING)) {
 29 |             if (relationship.getOtherNode(tail).equals(head)) {
 30 |                 double weight = (double) relationship.getProperty("freq", 1.0);
 31 |                 relationship.setProperty("freq", 1.0 + weight);
 32 |                 return relationship;
 33 |             }
 34 |         }
 35 |         Relationship relationship = tail.createRelationshipTo(head, GraphModel.FOLLOWS);
 36 |         relationship.setProperty("freq", 1.0);
 37 |         return relationship;
 38 |     }
 39 | 
 40 |     /**
 41 |      * Creates a {@link Node} in the given {@code graph} using the given {@code token} and {@code stopWord} flag.
 42 |      *
 43 |      * @param graph    the {@link GraphDatabaseService} where to create a node
 44 |      * @param token    the {@link Token} to convert into a node
 45 |      * @param stopWord a flag which tells if the node refers to a common word or not
 46 |      * @return the resulting {@link Node}
 47 |      */
 48 |     static Node word(GraphDatabaseService graph, Token token, boolean stopWord) {
 49 |         Label label = token.getLabel();
 50 |         Node node = graph.createNode(GraphModel.WORD, label);
 51 |         if (token.getTag().startsWith("VB")) {
 52 |             node.addLabel(GraphModel.VERB);
 53 |         }
 54 |         node.setProperty("text", token.getText());
 55 |         node.setProperty("word", token.getWord());
 56 |         node.setProperty("freq", 1.0);
 57 |         node.setProperty("stop", stopWord);
 58 |         return node;
 59 |     }
 60 | 
 61 | 
 62 |     /**
 63 |      * Encodes the given {@code sentences} as a {@code word graph} using the given {@code stopWords}
 64 |      * into the given {@code graph}, returning the length of the longest sentence.
 65 |      * Notice that punctuation is ignored and common words tend to build secondary paths.
 66 |      *
 67 |      * @param graph     the {@link GraphDatabaseService} where the given {@code sentences} are going to be saved
 68 |      * @param sentences the {@link List<String>} to be encoded into the given {@code graph}
 69 |      * @param stopWords the {@link Collection<String>} to identify common words
 70 |      * @return the number of words of the longest sentence among the given {@code sentences}
 71 |      */
 72 |     int encode(GraphDatabaseService graph, List<String> sentences, Collection<String> stopWords);
 73 | 
 74 |     /**
 75 |      * A {@code token} with (lower) text, word and POS tag.
 76 |      */
 77 |     final class Token {
 78 | 
 79 |         private static final Tokenizer TOKENIZER = OpenNLP.getTokenizer();
 80 |         private static final POSTagger TAGGER = OpenNLP.getPOSTagger();
 81 |         private static final Map<String, Label> LABELS = new HashMap<>();
 82 |         private final String text;
 83 |         private final String word;
 84 |         private final String tag;
 85 | 
 86 |         private Token(String token, String tag) {
 87 |             token = requireNonNull(token, "'token' is null").trim();
 88 |             if (token.isEmpty()) {
 89 |                 throw new IllegalArgumentException("'token' is empty");
 90 |             }
 91 |             this.tag = requireNonNull(tag, "'tag' is null").trim();
 92 |             if (this.tag.isEmpty()) {
 93 |                 throw new IllegalArgumentException("'tag' is empty");
 94 |             }
 95 |             this.text = token.toLowerCase();
 96 |             this.word = token;
 97 |         }
 98 | 
 99 |         public static Token[] parse(String sentence) {
100 |             sentence = requireNonNull(sentence, "'sentence' is null").trim();
101 |             if (sentence.isEmpty()) {
102 |                 throw new IllegalArgumentException("'sentence' is empty");
103 |             }
104 | 
105 |             String[] tokens = TOKENIZER.tokenize(sentence);
106 |             String[] tags = TAGGER.tag(tokens);
107 |             List<Token> result = new ArrayList<>();
108 |             for (int i = 0; i < tokens.length; i++) {
109 |                 if (isWord(tokens[i])) {
110 |                     Token token = new Token(tokens[i], tags[i]);
111 |                     result.add(token);
112 |                 }
113 |             }
114 |             return result.toArray(new Token[result.size()]);
115 |         }
116 | 
117 |         private static boolean isWord(String symbol) {
118 |             symbol = requireNonNull(symbol, "'symbol' is null").trim();
119 |             if (symbol.isEmpty()) {
120 |                 throw new IllegalArgumentException("'symbol' is empty");
121 |             }
122 | 
123 |             return symbol.matches("^(?=.*[\\p{L}\\p{N}'-]).+$");
124 |         }
125 | 
126 |         public Label getLabel() {
127 |             return LABELS.computeIfAbsent(tag, k -> Label.label(tag));
128 |         }
129 | 
130 |         public String getText() {
131 |             return text;
132 |         }
133 | 
134 |         public String getWord() {
135 |             return word;
136 |         }
137 | 
138 |         public String getTag() {
139 |             return tag;
140 |         }
141 | 
142 |         public boolean isStopWord(Collection<String> stopWords) {
143 |             requireNonNull(stopWords, "'stopWords' is null");
144 | 
145 |             return stopWords.contains(text);
146 |         }
147 | 
148 |     }
149 | 
150 |     /**
151 |      * A {@code context} for a {@code word} with {@code matches} and {@code occurrences}.
152 |      */
153 |     final class Context implements Comparable<Context> {
154 | 
155 |         private final Node node;
156 | 
157 |         private final int matches;
158 | 
159 |         private final double occurrences;
160 | 
161 |         public Context(Node node, int matches, double occurrences) {
162 |             this.node = requireNonNull(node, "'node' is null");
163 |             this.matches = matches;
164 |             this.occurrences = occurrences;
165 |         }
166 | 
167 |         public Node getNode() {
168 |             return node;
169 |         }
170 | 
171 |         public boolean isEmpty() {
172 |             return matches <= 0;
173 |         }
174 | 
175 |         @Override
176 |         public int compareTo(Context other) {
177 |             requireNonNull(other, "'other' is null");
178 | 
179 |             int result = Integer.compare(other.matches, this.matches);
180 |             if (result == 0) {
181 |                 result = Double.compare(other.occurrences, this.occurrences);
182 |                 if (result == 0) {
183 |                     if (node.equals(other.node)) {
184 |                         result = 0;
185 |                     } else {
186 |                         result = (int) System.currentTimeMillis() % 2;
187 |                     }
188 |                 }
189 |             }
190 |             return result;
191 |         }
192 |     }
193 | }
194 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------