├── images ├── output.png └── word-graph.png ├── src └── main │ ├── resources │ ├── en-sent.bin │ ├── en-token.bin │ ├── en-pos-maxent.bin │ └── logback.xml │ └── java │ └── org │ └── stefano │ └── distributional │ ├── model │ ├── components │ │ ├── GraphWeigher.java │ │ ├── impl │ │ │ ├── NaiveGraphWeigher.java │ │ │ ├── DefaultPathCompressor.java │ │ │ ├── AdvancedGraphWeigher.java │ │ │ └── DefaultGraphEncoder.java │ │ ├── GraphModel.java │ │ ├── PathCompressor.java │ │ └── GraphEncoder.java │ └── Summarizer.java │ ├── Main.java │ └── utils │ └── OpenNLP.java ├── settings.gradle ├── .gitignore ├── README.md └── LICENSE /images/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefano-bragaglia/Multi-Sentence-Compression/HEAD/images/output.png -------------------------------------------------------------------------------- /images/word-graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefano-bragaglia/Multi-Sentence-Compression/HEAD/images/word-graph.png -------------------------------------------------------------------------------- /src/main/resources/en-sent.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefano-bragaglia/Multi-Sentence-Compression/HEAD/src/main/resources/en-sent.bin -------------------------------------------------------------------------------- /src/main/resources/en-token.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefano-bragaglia/Multi-Sentence-Compression/HEAD/src/main/resources/en-token.bin -------------------------------------------------------------------------------- /src/main/resources/en-pos-maxent.bin: -------------------------------------------------------------------------------- 
package org.stefano.distributional.model.components;

import org.neo4j.graphdb.GraphDatabaseService;

/**
 * This interface provides a method to weight the {@code FOLLOWS} relationships in a {@code word graph}.
 * <p>
 * How the weight is computed is left to the implementation.
 */
public interface GraphWeigher {

    /**
     * Weights the {@code FOLLOWS} relationships in the given {@code graph}.
     *
     * @param graph the {@link GraphDatabaseService} whose {@code FOLLOWS} relationships have to be weighted
     */
    void weight(GraphDatabaseService graph);
}
16 | */ 17 | public final class NaiveGraphWeigher implements GraphWeigher { 18 | 19 | private static final Logger logger = LoggerFactory.getLogger(NaiveGraphWeigher.class); 20 | 21 | @Override 22 | public void weight(GraphDatabaseService graph) { 23 | requireNonNull(graph, "'graph' is null"); 24 | 25 | int total = 0; 26 | try (Transaction tx = graph.beginTx()) { 27 | long elapsed = System.nanoTime(); 28 | logger.debug("Computing weights between words..."); 29 | for (Relationship follows : graph.getAllRelationships()) { 30 | if (follows.isType(FOLLOWS)) { 31 | double weight = 1.0 / (double) follows.getProperty("freq", 1.0); 32 | follows.setProperty("weight", weight); 33 | total += 1; 34 | if (total % 50 == 0) { 35 | logger.debug("{} relationships analysed so far...", total); 36 | } 37 | } 38 | } 39 | elapsed = System.nanoTime() - elapsed; 40 | logger.info("{} relationship/s analysed in {} ms.", 41 | total, String.format("%,.3f", elapsed / 1_000_000_000.0)); 42 | tx.success(); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/org/stefano/distributional/model/components/impl/DefaultPathCompressor.java: -------------------------------------------------------------------------------- 1 | package org.stefano.distributional.model.components.impl; 2 | 3 | import org.neo4j.graphalgo.GraphAlgoFactory; 4 | import org.neo4j.graphalgo.PathFinder; 5 | import org.neo4j.graphdb.GraphDatabaseService; 6 | import org.neo4j.graphdb.Path; 7 | import org.neo4j.graphdb.Relationship; 8 | import org.neo4j.graphdb.Transaction; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | import org.stefano.distributional.model.components.GraphModel; 12 | import org.stefano.distributional.model.components.PathCompressor; 13 | 14 | import java.util.Optional; 15 | import java.util.Set; 16 | import java.util.TreeSet; 17 | 18 | /** 19 | * This class provides the default method to generate a compressive summary from 
a {@code word graph}. 20 | */ 21 | public final class DefaultPathCompressor implements PathCompressor { 22 | 23 | private static final Logger logger = LoggerFactory.getLogger(DefaultPathCompressor.class); 24 | 25 | @Override 26 | public Optional compress(GraphDatabaseService graph, int maxDepth) { 27 | try (Transaction tx = graph.beginTx()) { 28 | long elapsed = System.nanoTime(); 29 | logger.debug("Computing all the paths between START and END nodes and their costs..."); 30 | int total = 0; 31 | Set paths = new TreeSet<>(); 32 | PathFinder finder = GraphAlgoFactory.allPaths(EXPANDER, maxDepth); 33 | for (Path path : finder.findAllPaths(GraphModel.start(graph), GraphModel.end(graph))) { 34 | if (path.length() >= PathCompressor.MIN_DEPTH && PathCompressor.hasVerb(path)) { 35 | double cost = 0.0; 36 | for (Relationship follows : path.relationships()) { 37 | cost += (double) follows.getProperty("weight", 1.0); 38 | } 39 | paths.add(new CostPath(path, cost)); 40 | } 41 | total += 1; 42 | } 43 | logger.info("{} valid path/s found (out of {} possible) in {} ms.", 44 | paths.size(), total, String.format("%,.3f", elapsed / 1_000_000_000.0)); 45 | if (paths.isEmpty()) { 46 | return Optional.empty(); 47 | } 48 | logger.debug("Generating the compressive summary"); 49 | return PathCompressor.decode(paths.iterator().next().getPath()); 50 | } 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/org/stefano/distributional/model/components/GraphModel.java: -------------------------------------------------------------------------------- 1 | package org.stefano.distributional.model.components; 2 | 3 | import org.neo4j.graphdb.*; 4 | 5 | import static java.util.Objects.requireNonNull; 6 | 7 | /** 8 | * Definitions for {@code word graphs}. 
9 | */ 10 | public class GraphModel { 11 | 12 | public static final Label SENTENCE = Label.label("SENTENCE"); 13 | public static final Label START = Label.label("START"); 14 | public static final Label WORD = Label.label("WORD"); 15 | public static final Label END = Label.label("END"); 16 | public static final Label VERB = Label.label("VERB"); 17 | public static final RelationshipType FOLLOWS = RelationshipType.withName("FOLLOWS"); 18 | public static final RelationshipType CONTAINS = RelationshipType.withName("CONTAINS"); 19 | 20 | private GraphModel() { 21 | throw new UnsupportedOperationException("'GraphModel' class should not be instantiated"); 22 | } 23 | 24 | /** 25 | * Returns the {@code START} node in the given {@code graph}. 26 | * If such node exists, its {@code frequency} is increased by 1 and eventually returned. 27 | * If it doesn't, the node is created and initialised to {@code frequency} == 1 and returned. 28 | * 29 | * @param graph the target {@link GraphDatabaseService} 30 | * @return the updated {@code START} node, or a newly created instance 31 | */ 32 | public static Node start(GraphDatabaseService graph) { 33 | requireNonNull(graph, "'graph' is null"); 34 | 35 | return terminal(graph, START); 36 | } 37 | 38 | /** 39 | * Returns the {@code END} node in the given {@code graph}. 40 | * If such node exists, its {@code frequency} is increased by 1 and eventually returned. 41 | * If it doesn't, the node is created and initialised to {@code frequency} == 1 and returned. 
42 | * 43 | * @param graph the target {@link GraphDatabaseService} 44 | * @return the updated {@code END} node, or a newly created instance 45 | */ 46 | public static Node end(GraphDatabaseService graph) { 47 | requireNonNull(graph, "'graph' is null"); 48 | 49 | return terminal(graph, END); 50 | } 51 | 52 | private static Node terminal(GraphDatabaseService graph, Label label) { 53 | requireNonNull(graph, "'graph' is null"); 54 | 55 | ResourceIterator nodes = graph.findNodes(label); 56 | if (nodes.hasNext()) { 57 | Node node = nodes.next(); 58 | double freq = (double) node.getProperty("freq", 1.0); 59 | node.setProperty("freq", 1.0 + freq); 60 | return node; 61 | } 62 | Node node = graph.createNode(label); 63 | node.setProperty("freq", 1.0); 64 | return node; 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/org/stefano/distributional/model/components/impl/AdvancedGraphWeigher.java: -------------------------------------------------------------------------------- 1 | package org.stefano.distributional.model.components.impl; 2 | 3 | import org.neo4j.graphdb.*; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | import org.stefano.distributional.model.components.GraphWeigher; 7 | 8 | import static java.util.Objects.requireNonNull; 9 | import static org.stefano.distributional.model.components.GraphModel.CONTAINS; 10 | import static org.stefano.distributional.model.components.GraphModel.FOLLOWS; 11 | 12 | /** 13 | * This class provides an advanced method to weight the {@code FOLLOWS} relationships in a {@code word graph}. 14 | * This method generate weights that are inversely proportional to the frequency of paths scaled down by their length. 
15 | */ 16 | public final class AdvancedGraphWeigher implements GraphWeigher { 17 | 18 | private static final Logger logger = LoggerFactory.getLogger(AdvancedGraphWeigher.class); 19 | 20 | @Override 21 | public void weight(GraphDatabaseService graph) { 22 | requireNonNull(graph, "'graph' is null"); 23 | 24 | int total = 0; 25 | try (Transaction tx = graph.beginTx()) { 26 | long elapsed = System.nanoTime(); 27 | logger.debug("Computing weights between words..."); 28 | for (Relationship follows : graph.getAllRelationships()) { 29 | if (follows.isType(FOLLOWS)) { 30 | Node tail = follows.getStartNode(); 31 | Node head = follows.getEndNode(); 32 | double freqTail = (double) tail.getProperty("freq", 1.0); 33 | double freqHead = (double) head.getProperty("freq", 1.0); 34 | double denom = 0.0; 35 | for (Relationship containsTail : tail.getRelationships(CONTAINS, Direction.INCOMING)) { 36 | int posTail = (int) containsTail.getProperty("pos", 0); 37 | Node sentence = containsTail.getStartNode(); 38 | for (Relationship containsHead : sentence.getRelationships(CONTAINS, Direction.OUTGOING)) { 39 | if (containsHead.getEndNode().equals(head)) { 40 | int posHead = (int) containsHead.getProperty("pos", 0); 41 | denom += 1.0 / (posHead - posTail); 42 | } 43 | } 44 | } 45 | double weight = (freqTail + freqHead) / denom; 46 | weight = (weight) / (freqTail * freqHead); 47 | follows.setProperty("weight", weight); 48 | total += 1; 49 | if (total % 50 == 0) { 50 | logger.debug("{} relationships analysed so far...", total); 51 | } 52 | } 53 | } 54 | elapsed = System.nanoTime() - elapsed; 55 | logger.info("{} relationship/s analysed in {} ms.", 56 | total, String.format("%,.3f", elapsed / 1_000_000_000.0)); 57 | tx.success(); 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### 
JetBrains template 3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 5 | 6 | # User-specific stuff: 7 | .idea 8 | .idea/workspace.xml 9 | .idea/tasks.xml 10 | .idea/dictionaries 11 | .idea/vcs.xml 12 | .idea/jsLibraryMappings.xml 13 | 14 | # Sensitive or high-churn files: 15 | .idea/dataSources.ids 16 | .idea/dataSources.xml 17 | .idea/dataSources.local.xml 18 | .idea/sqlDataSources.xml 19 | .idea/dynamic.xml 20 | .idea/uiDesigner.xml 21 | 22 | # Gradle: 23 | .idea/gradle.xml 24 | .idea/libraries 25 | 26 | # Mongo Explorer plugin: 27 | .idea/mongoSettings.xml 28 | 29 | ## File-based project format: 30 | *.iws 31 | 32 | ## Plugin-specific files: 33 | 34 | # IntelliJ 35 | /out/ 36 | 37 | # mpeltonen/sbt-idea plugin 38 | .idea_modules/ 39 | 40 | # JIRA plugin 41 | atlassian-ide-plugin.xml 42 | 43 | # Crashlytics plugin (for Android Studio and IntelliJ) 44 | com_crashlytics_export_strings.xml 45 | crashlytics.properties 46 | crashlytics-build.properties 47 | fabric.properties 48 | ### Java template 49 | *.class 50 | 51 | # Mobile Tools for Java (J2ME) 52 | .mtj.tmp/ 53 | 54 | # Package Files # 55 | *.jar 56 | *.war 57 | *.ear 58 | 59 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 60 | hs_err_pid* 61 | ### Maven template 62 | target/ 63 | pom.xml.tag 64 | pom.xml.releaseBackup 65 | pom.xml.versionsBackup 66 | pom.xml.next 67 | release.properties 68 | dependency-reduced-pom.xml 69 | buildNumber.properties 70 | .mvn/timing.properties 71 | ### OSX template 72 | *.DS_Store 73 | .AppleDouble 74 | .LSOverride 75 | 76 | # Icon must end with two \r 77 | Icon 78 | 79 | # Thumbnails 80 | ._* 81 | 82 | # Files that might appear in the root of a volume 83 | .DocumentRevisions-V100 84 | .fseventsd 85 | .Spotlight-V100 86 | .TemporaryItems 87 | .Trashes 88 | .VolumeIcon.icns 89 | 
.com.apple.timemachine.donotpresent 90 | 91 | # Directories potentially created on remote AFP share 92 | .AppleDB 93 | .AppleDesktop 94 | Network Trash Folder 95 | Temporary Items 96 | .apdisk 97 | ### Windows template 98 | # Windows image file caches 99 | Thumbs.db 100 | ehthumbs.db 101 | 102 | # Folder config file 103 | Desktop.ini 104 | 105 | # Recycle Bin used on file shares 106 | $RECYCLE.BIN/ 107 | 108 | # Windows Installer files 109 | *.cab 110 | *.msi 111 | *.msm 112 | *.msp 113 | 114 | # Windows shortcuts 115 | *.lnk 116 | ### Linux template 117 | *~ 118 | 119 | # temporary files which can be created if a process still has a handle open of a deleted file 120 | .fuse_hidden* 121 | 122 | # KDE directory preferences 123 | .directory 124 | 125 | # Linux trash folder which might appear on any partition or disk 126 | .Trash-* 127 | ### Gradle template 128 | .gradle 129 | build/ 130 | 131 | # Ignore Gradle GUI config 132 | gradle-app.setting 133 | 134 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 135 | !gradle-wrapper.jar 136 | 137 | # Cache of project 138 | .gradletasknamecache 139 | 140 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 141 | # gradle/wrapper/gradle-wrapper.properties 142 | .idea/ 143 | src/test/ 144 | -------------------------------------------------------------------------------- /src/main/java/org/stefano/distributional/Main.java: -------------------------------------------------------------------------------- 1 | package org.stefano.distributional; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | import org.stefano.distributional.model.Summarizer; 6 | import org.stefano.distributional.model.components.impl.AdvancedGraphWeigher; 7 | import org.stefano.distributional.model.components.impl.DefaultGraphEncoder; 8 | import org.stefano.distributional.model.components.impl.DefaultPathCompressor; 9 | import org.stefano.distributional.utils.OpenNLP; 10 | 11 | import 
java.io.IOException; 12 | import java.nio.file.Path; 13 | import java.nio.file.Paths; 14 | import java.util.Arrays; 15 | import java.util.Collection; 16 | import java.util.List; 17 | import java.util.Optional; 18 | 19 | /** 20 | * TODO Replace with proper description... 21 | *

22 | * Created by stefano on 23/01/2017. 23 | */ 24 | public class Main { 25 | 26 | private static final Logger logger = LoggerFactory.getLogger(Main.class); 27 | 28 | public static void main(String[] args) throws IOException { 29 | Path folder = Paths.get(args[0]); 30 | 31 | List sentences = Arrays.asList( 32 | "The wife of a former U.S. president Bill Clinton, Hillary Clinton, visited China last Monday.", 33 | "Hillary Clinton wanted to visit China last month but postponed her plans till Monday last week.", 34 | "Hillary Clinton paid a visit to the People Republic of China on Monday.", 35 | "Last week the Secretary State Ms. Clinton visited Chinese officials."); 36 | 37 | Collection stopWords = Arrays.asList("a", "able", "about", "above", "after", "all", "also", "an", 38 | "and", "any", "as", "ask", "at", "back", "bad", "be", "because", "beneath", "big", "but", "by", 39 | "call", "can", "case", "child", "come", "company", "could", "day", "different", "do", "early", "even", 40 | "eye", "fact", "feel", "few", "find", "first", "for", "from", "get", "give", "go", "good", 41 | "government", "great", "group", "hand", "have", "he", "her", "high", "him", "his", "how", "i", "if", 42 | "important", "in", "into", "it", "its", "just", "know", "large", "last", "leave", "life", "like", 43 | "little", "long", "look", "make", "man", "me", "most", "my", "new", "next", "no", "not", "now", 44 | "number", "of", "old", "on", "one", "only", "or", "other", "our", "out", "over", "own", "part", 45 | "people", "person", "place", "point", "problem", "public", "right", "same", "say", "see", "seem", 46 | "she", "small", "so", "some", "take", "tell", "than", "that", "the", "their", "them", "then", "there", 47 | "these", "they", "thing", "think", "this", "time", "to", "try", "two", "under", "up", "us", "use", 48 | "want", "way", "we", "week", "well", "what", "when", "which", "who", "will", "with", "woman", "work", 49 | "world", "would", "year", "you", "young", "your"); 50 | 51 | Summarizer 
summarizer = Summarizer.builder() 52 | .on(folder) 53 | .withEncoder(new DefaultGraphEncoder()) 54 | .withWeigher(new AdvancedGraphWeigher()) 55 | .withCompressor(new DefaultPathCompressor()) 56 | .build(); 57 | Optional summary = summarizer.process(sentences, stopWords); 58 | if (summary.isPresent()) { 59 | System.out.println(" >> " +summary.get()); 60 | } else { 61 | logger.info("No summary available."); 62 | } 63 | logger.info("Done."); 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/org/stefano/distributional/utils/OpenNLP.java: -------------------------------------------------------------------------------- 1 | package org.stefano.distributional.utils; 2 | 3 | import opennlp.tools.postag.POSModel; 4 | import opennlp.tools.postag.POSTagger; 5 | import opennlp.tools.postag.POSTaggerME; 6 | import opennlp.tools.sentdetect.SentenceDetector; 7 | import opennlp.tools.sentdetect.SentenceDetectorME; 8 | import opennlp.tools.sentdetect.SentenceModel; 9 | import opennlp.tools.tokenize.Tokenizer; 10 | import opennlp.tools.tokenize.TokenizerME; 11 | import opennlp.tools.tokenize.TokenizerModel; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | import java.io.IOException; 16 | import java.io.InputStream; 17 | 18 | /** 19 | * TODO Replace with proper description... 20 | *

21 | * Created by stefano on 23/01/2017. 22 | */ 23 | public class OpenNLP { 24 | 25 | private static final Logger logger = LoggerFactory.getLogger(OpenNLP.class); 26 | private static SentenceDetector detector = null; 27 | private static Tokenizer tokenizer = null; 28 | private static POSTagger tagger = null; 29 | 30 | private OpenNLP() { 31 | throw new UnsupportedOperationException("'OpenNLP' class should not be instantiated"); 32 | } 33 | 34 | public static SentenceDetector getSentenceDetector() { 35 | if (detector == null) { 36 | InputStream stream = OpenNLP.class.getResourceAsStream("/en-sent.bin"); 37 | try { 38 | SentenceModel model = new SentenceModel(stream); 39 | detector = new SentenceDetectorME(model); 40 | logger.info("OpenNLP sentence detector lazily initialised"); 41 | } catch (IOException e) { 42 | e.printStackTrace(); 43 | } finally { 44 | if (stream != null) { 45 | try { 46 | stream.close(); 47 | } catch (IOException e) { 48 | e.printStackTrace(); 49 | } 50 | } 51 | } 52 | } 53 | return detector; 54 | } 55 | 56 | public static Tokenizer getTokenizer() { 57 | if (tokenizer == null) { 58 | InputStream stream = OpenNLP.class.getResourceAsStream("/en-token.bin"); 59 | try { 60 | TokenizerModel model = new TokenizerModel(stream); 61 | tokenizer = new TokenizerME(model); 62 | logger.info("OpenNLP tokenizer lazily initialised"); 63 | } catch (IOException e) { 64 | e.printStackTrace(); 65 | } finally { 66 | if (stream != null) { 67 | try { 68 | stream.close(); 69 | } catch (IOException e) { 70 | e.printStackTrace(); 71 | } 72 | } 73 | } 74 | } 75 | return tokenizer; 76 | } 77 | 78 | public static POSTagger getPOSTagger() { 79 | if (tagger == null) { 80 | InputStream stream = OpenNLP.class.getResourceAsStream("/en-pos-maxent.bin"); 81 | try { 82 | POSModel model = new POSModel(stream); 83 | tagger = new POSTaggerME(model); 84 | logger.info("OpenNLP POS tagger lazily initialised"); 85 | } catch (IOException e) { 86 | e.printStackTrace(); 87 | } finally { 88 
| if (stream != null) { 89 | try { 90 | stream.close(); 91 | } catch (IOException e) { 92 | e.printStackTrace(); 93 | } 94 | } 95 | } 96 | } 97 | return tagger; 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /src/main/java/org/stefano/distributional/model/components/PathCompressor.java: -------------------------------------------------------------------------------- 1 | package org.stefano.distributional.model.components; 2 | 3 | import org.neo4j.graphdb.*; 4 | 5 | import java.util.Optional; 6 | 7 | import static java.util.Objects.requireNonNull; 8 | 9 | /** 10 | * This interface provides a method to generate a compressive summary from a {@code word graph}. 11 | */ 12 | public interface PathCompressor { 13 | 14 | PathExpander EXPANDER = PathExpanderBuilder.empty().add(GraphModel.FOLLOWS, Direction.OUTGOING).build(); 15 | 16 | int MIN_DEPTH = 8; 17 | 18 | /** 19 | * Checks whether the given {@code path} contains at least a {@code verb}. 20 | * 21 | * @param path the {@link Path} to be checked 22 | * @return {@code true} if the given {@code path} contains a {@code verb}, {@code false} otherwise 23 | */ 24 | static boolean hasVerb(Path path) { 25 | requireNonNull(path, "'path' is null"); 26 | 27 | for (Node node : path.nodes()) { 28 | if (node.hasLabel(GraphModel.VERB)) { 29 | return true; 30 | } 31 | } 32 | return false; 33 | } 34 | 35 | /** 36 | * Decodes the given {@code path} into a string and returns it. 
37 | * 38 | * @param path the {@link Path} to be decoded 39 | * @return the (possibly empty) string from the given {@code path} 40 | */ 41 | static Optional decode(Path path) { 42 | requireNonNull(path, "'path' is null"); 43 | 44 | String sentence = ""; 45 | for (Node node : path.nodes()) { 46 | sentence = (sentence + " " + node.getProperty("word", "")).trim(); 47 | } 48 | if (!sentence.isEmpty()) { 49 | return Optional.of(sentence + "."); 50 | } 51 | return Optional.empty(); 52 | } 53 | 54 | /** 55 | * This method finds in the given {@code graph} all the paths from {@code start} to {@code end} that only use 56 | * {@code FOLLOWS} relationships and are no longer than the given {@code maxDepth}. 57 | * The paths that have no verbs or shorter than {@link this.MIN_DEPTH} are ignored because they likely lead 58 | * to poor summaries. 59 | * All the other paths are sorted by increasing cost, which is the sum of the (logarithmic) weights 60 | * on the {@code FOLLOWS} relationships of each path. 61 | * The minimal cost path most likely contains the most important concepts that summarises the graph 62 | * in a grammatically sounded way. 63 | * If such path exists, it is compressed to generate the summary to return. 64 | * 65 | * @param graph the {@link GraphDatabaseService} with the {@code word graph} to be summarised 66 | * @param maxDepth the upper bound limit on the paths' length 67 | * @return the string that best summarises the graph, if any 68 | */ 69 | Optional compress(GraphDatabaseService graph, int maxDepth); 70 | 71 | /** 72 | * A {@link Path} associated with its {@code cost}. 
73 | */ 74 | final class CostPath implements Comparable { 75 | private final Path path; 76 | private final double cost; 77 | 78 | public CostPath(Path path, double cost) { 79 | this.path = requireNonNull(path, "'path' is null"); 80 | this.cost = cost; 81 | } 82 | 83 | public Path getPath() { 84 | return path; 85 | } 86 | 87 | public double getCost() { 88 | return cost; 89 | } 90 | 91 | @Override 92 | public int compareTo(CostPath other) { 93 | requireNonNull(other, "'other' is null"); 94 | 95 | int result = Double.compare(cost, other.cost); 96 | if (result == 0) { 97 | if (!path.equals(other.path)) { 98 | result = Integer.compare(path.length(), other.path.length()); 99 | if (result == 0) { 100 | result = (int) System.nanoTime() % 2; 101 | } 102 | } 103 | } 104 | return result; 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Multi-Sentence Compression 2 | ==== 3 | 4 | Compressing a cluster of related sentences into a single sentence that retains the most relevant non-redundant concepts from the original cluster and is grammatically sound is a complex task. 5 | 6 | This project implements the method suggested in ["Multi-Sentence Compressing: Finding Shortest Paths in Word Graphs"](http://www.aclweb.org/anthology/C10-1037) (**Katja Filippova.** Google Inc. _In Proc of 23rd Intl Conf COLING, 2010._) which is based upon shortest paths in word graphs. 7 | 8 | Specifically, we use: 9 | * [OpenNLP](https://opennlp.apache.org) for basic sentence detection, tokenisation and POD tagging 10 | * [Neo4j](https://neo4j.com) for graph generation and traversal 11 | * Wikipedia's list of [most common words in English](https://en.wikipedia.org/wiki/Most_common_words_in_English). 
12 | 13 | The procedure consists of: 14 | * generating a `word graph` 15 | * weighting the **edges** between _words_ 16 | * compressing the graph into a _meaningful summary_. 17 | 18 | Word graph 19 | ---- 20 | 21 | A `word graph` (or `adjacency text graph`) is a directed graph where: 22 | * _words_ become **nodes** (punctuation is ignored) 23 | * _adjacent words_ are connected by **edges** (type: _FOLLOWS_) 24 | * _frequencies_ of words and adjacencies are saved on both **nodes** and **edges**. 25 | 26 | The **lower case text** and **POS tag** of each _word_ act as key, so that words with the same grammatical usage are unique in the graph. 27 | The only exception to this rule is for [stop-words](https://en.wikipedia.org/wiki/Most_common_words_in_English) which are always duplicated (if not involved in a _syntagmatic association_ with a relevant word) to keep their _frequencies_ (and importance in the graph) low. 28 | 29 | Our data model also includes a **node** to represent each _sentence_ (with _id_) and as many _CONTAINS_ **edges** as _words_ in each sentence (with their relative _pos_). The chain of _words_ of each sentence is also preceded by a _START_ **node** and followed by an _END_ **node**. 30 | 31 | Given the following cluster of related sentences: 32 | 33 | 1. _The wife of a former U.S. president Bill Clinton, Hillary Clinton, visited China last Monday._ 34 | 2. _Hillary Clinton wanted to visit China last month but postponed her plans till Monday last week._ 35 | 3. _Hillary Clinton paid a visit to the People Republic of China on Monday._ 36 | 4. _Last week the Secretary State Ms. Clinton visited Chinese officials._ 37 | 38 | the resulting `word graph` is presented below. 39 | 40 | ![Word graph for the example cluster](/images/word-graph.png) 41 | 42 | Weights 43 | ---- 44 | 45 | Both weight methods discussed in the [original paper](http://www.aclweb.org/anthology/C10-1037) have been implemented. 
46 | 47 | The **naive** method simply considers the inverse of the _frequency_ of each _FOLLOWS_ **edge**. 48 | 49 | The **advanced** method is more sophisticated as it takes into account **syntagmatic associations** scaled by 50 | the relative distance of the terms in their enclosing sentences. 51 | 52 | In particular: 53 | 54 | $$w'(\mathrm{edge}(i, j)) = \frac{\mathrm{freq}(i) + \mathrm{freq}(j)}{\sum_{s \in S} \mathrm{diff}(s, i, j)^{-1}}$$ 55 | 56 | 57 | 58 | $$\mathrm{diff}(s, i, j) = \begin{cases} \mathrm{pos}(s, j) - \mathrm{pos}(s, i) & \text{if } \mathrm{pos}(s, i) < \mathrm{pos}(s, j) \\ 0 & \text{otherwise} \end{cases}$$ 59 | 60 | 61 | 62 | $$w''(\mathrm{edge}(i, j)) = \frac{w'(\mathrm{edge}(i, j))}{\mathrm{freq}(i) \times \mathrm{freq}(j)}$$ 63 | 64 | 65 | 66 | Notice that these weights are costs: the lower, the better. 67 | 68 | Compression 69 | ---- 70 | 71 | The goal of this step is to generalise the input sentences by generating an appropriate compression (inductive task). 72 | All the **paths** from _START_ to _END_ describe all the possible _worlds_ that can be reached upon summarisation. 73 | 74 | In order to obtain sound summaries, we require paths to be at least **8 words** long and to contain **at least one verb**. 75 | The remaining paths are ranked by **increasing cost**, which is the sum of the weights on their **edges** normalised by **path length**. 76 | 77 | By visiting the _words_ in the **minimal cost path** (if any), the desired compression summary is generated. 78 | 79 | Results 80 | ---- 81 | 82 | The project is organised as a [Gradle Application](https://docs.gradle.org/current/userguide/application_plugin.html), 83 | therefore it is sufficient to issue the following command on the terminal in the root folder of the project 84 | (provided that [Gradle](https://gradle.org) is installed locally): 85 | 86 | gradle clean run 87 | 88 | The example introduced above, for instance, produces the following output: 89 | 90 | ![Output for the example cluster](/images/output.png) 91 | 92 | which includes the following summary: 93 | 94 | Hillary Clinton wanted to visit China last week. 
95 | 96 | The algorithm has been successfully applied to English and Spanish by using an _ad-hoc_ **stop-word list** of about 600 terms. 97 | The experimental results are discussed in the [original paper](http://www.aclweb.org/anthology/C10-1037). -------------------------------------------------------------------------------- /src/main/java/org/stefano/distributional/model/Summarizer.java: -------------------------------------------------------------------------------- 1 | package org.stefano.distributional.model; 2 | 3 | import org.neo4j.graphdb.GraphDatabaseService; 4 | import org.neo4j.graphdb.factory.GraphDatabaseFactory; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.stefano.distributional.model.components.GraphEncoder; 8 | import org.stefano.distributional.model.components.GraphWeigher; 9 | import org.stefano.distributional.model.components.PathCompressor; 10 | 11 | import java.io.IOException; 12 | import java.nio.file.FileVisitResult; 13 | import java.nio.file.Files; 14 | import java.nio.file.Path; 15 | import java.nio.file.SimpleFileVisitor; 16 | import java.nio.file.attribute.BasicFileAttributes; 17 | import java.util.Collection; 18 | import java.util.List; 19 | import java.util.Optional; 20 | 21 | import static java.util.Objects.requireNonNull; 22 | 23 | /** 24 | * A facade for {@link GraphEncoder}, {@link GraphWeigher} and {@link PathCompressor} to summarise {@code sentences}.
25 | */ 26 | public final class Summarizer { 27 | 28 | private static final Logger logger = LoggerFactory.getLogger(Summarizer.class); 29 | 30 | private final Path folder; /* database folder, taken from the builder */ 31 | private final GraphEncoder encoder; /* turns sentences into the word graph */ 32 | private final GraphWeigher weigher; /* weights the graph produced by the encoder */ 33 | private final PathCompressor compressor; /* extracts the summary from the weighted graph */ 34 | 35 | private Summarizer(SummarizerBuilder builder) { /* only reachable through SummarizerBuilder.build() */ 36 | requireNonNull(builder, "'builder' is null"); 37 | this.folder = builder.currentFolder; 38 | this.encoder = builder.currentEncoder; 39 | this.weigher = builder.currentWeigher; 40 | this.compressor = builder.currentCompressor; 41 | } 42 | 43 | /** 44 | * Returns a staged {@code builder} for {@link Summarizer}; the first step asks for the database folder. 45 | * 46 | * @return a {@code builder} for {@link Summarizer}, exposed through its first step {@link RequiresFolder} 47 | */ 48 | public static RequiresFolder builder() { 49 | return new SummarizerBuilder(); 50 | } 51 | 52 | /** 53 | * Processes the given {@code sentences} with respect to the given {@code stopWords} and returns 54 | * the equivalent {@code multi-sentence compression}, if any.
55 | * 56 | * @param sentences the {@link List} to compress; an empty list yields an empty result 57 | * @param stopWords the {@link Collection} of common words 58 | * @return the equivalent {@code multi-sentence compression}, if any 59 | */ 60 | public Optional process(List sentences, Collection stopWords) { 61 | requireNonNull(sentences, "'sentences' is null"); 62 | requireNonNull(stopWords, "'stopWords' is null"); 63 | if (sentences.isEmpty()) { 64 | return Optional.empty(); 65 | } 66 | long started = System.nanoTime(); 67 | logger.debug("Compressing the following sentences:\n\t{}", String.join("\n\t", sentences)); 68 | cleanup(); 69 | GraphDatabaseService graph = new GraphDatabaseFactory().newEmbeddedDatabase(folder.toFile()); 70 | try { 71 | int maxLength = encoder.encode(graph, sentences, stopWords); 72 | weigher.weight(graph); 73 | Optional summary = compressor.compress(graph, maxLength); 74 | logger.info("Compression completed in {} ms.", String.format("%,.3f", (System.nanoTime() - started) / 1_000_000.0)); /* nanoseconds to milliseconds */ 75 | return summary; 76 | } finally { 77 | graph.shutdown(); /* always release the embedded database, even when a step above throws */ 78 | } 79 | } 80 | private void cleanup() { /* leaves 'folder' either freshly created or removed together with its previous content */ 81 | long elapsed = System.nanoTime(); 82 | logger.debug("Preparing database folder..."); 83 | if (Files.notExists(folder)) { 84 | try { 85 | Files.createDirectories(folder); 86 | } catch (IOException e) { 87 | throw new IllegalArgumentException("'folder' can't be created: " + folder, e); 88 | } 89 | } else { 90 | try { 91 | Files.walkFileTree(folder, new SimpleFileVisitor<Path>() { 92 | @Override 93 | public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { 94 | Files.delete(file); 95 | return FileVisitResult.CONTINUE; 96 | } 97 | 98 | @Override 99 | public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { 100 | Files.delete(dir); /* directories are removed after their content, the root 'folder' last */ 101 | return FileVisitResult.CONTINUE; 102 | } 103 | }); 104 | } catch (IOException e) { 105 | throw new IllegalArgumentException("'folder' can't be deleted: " + folder, e); 106 | } 107 | } 108 |
elapsed = System.nanoTime() - elapsed; 109 | logger.debug("Database ready in {} ms.", String.format("%,.3f", elapsed / 1_000_000.0)); /* nanoseconds to milliseconds, matching the "ms" label */ 110 | } 111 | 112 | /** 113 | * First step of the staged builder of a {@link Summarizer}: selects the database folder. 114 | */ 115 | public interface RequiresFolder { 116 | RequiresEncoder on(Path folder); 117 | } 118 | 119 | /** 120 | * Second step of the staged builder of a {@link Summarizer}: selects the {@link GraphEncoder}. 121 | */ 122 | public interface RequiresEncoder extends RequiresFolder { 123 | RequiresWeigher withEncoder(GraphEncoder encoder); 124 | } 125 | 126 | /** 127 | * Third step of the staged builder of a {@link Summarizer}: selects the {@link GraphWeigher}. 128 | */ 129 | public interface RequiresWeigher extends RequiresEncoder { 130 | RequiresCompressor withWeigher(GraphWeigher weigher); 131 | } 132 | 133 | /** 134 | * Fourth step of the staged builder of a {@link Summarizer}: selects the {@link PathCompressor}. 135 | */ 136 | public interface RequiresCompressor extends RequiresWeigher { 137 | SummarizerBuilder withCompressor(PathCompressor compressor); 138 | } 139 | 140 | /** 141 | * A helper class to build a {@link Summarizer}.
142 | */ 143 | public static class SummarizerBuilder implements RequiresCompressor { 144 | 145 | private Path currentFolder; 146 | private GraphEncoder currentEncoder; 147 | private GraphWeigher currentWeigher; 148 | private PathCompressor currentCompressor; 149 | 150 | private SummarizerBuilder() { /* instances are handed out by Summarizer.builder() only */ 151 | } 152 | 153 | @Override 154 | public RequiresEncoder on(Path folder) { 155 | requireNonNull(folder, "'folder' is null"); 156 | folder = folder.toAbsolutePath().normalize(); /* stored in absolute, normalised form */ 157 | if (Files.exists(folder) && !Files.isDirectory(folder)) { 158 | throw new IllegalArgumentException("'folder' is not a folder: " + folder); 159 | } 160 | currentFolder = folder; 161 | return this; 162 | } 163 | 164 | @Override 165 | public RequiresWeigher withEncoder(GraphEncoder encoder) { 166 | requireNonNull(encoder, "'encoder' is null"); 167 | currentEncoder = encoder; 168 | return this; 169 | } 170 | 171 | @Override 172 | public RequiresCompressor withWeigher(GraphWeigher weigher) { 173 | requireNonNull(weigher, "'weigher' is null"); 174 | currentWeigher = weigher; 175 | return this; 176 | } 177 | 178 | @Override 179 | public SummarizerBuilder withCompressor(PathCompressor compressor) { 180 | requireNonNull(compressor, "'compressor' is null"); 181 | currentCompressor = compressor; 182 | return this; 183 | } 184 | 185 | public Summarizer build() { /* hands the collected values to the private Summarizer constructor */ 186 | return new Summarizer(this); 187 | } 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /src/main/java/org/stefano/distributional/model/components/impl/DefaultGraphEncoder.java: -------------------------------------------------------------------------------- 1 | package org.stefano.distributional.model.components.impl; 2 | 3 | import opennlp.tools.sentdetect.SentenceDetector; 4 | import org.neo4j.graphdb.*; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.stefano.distributional.model.components.GraphEncoder; 8 | import
org.stefano.distributional.model.components.GraphModel; 9 | import org.stefano.distributional.utils.OpenNLP; 10 | 11 | import java.util.*; 12 | 13 | import static java.util.Objects.requireNonNull; 14 | import static org.stefano.distributional.model.components.GraphModel.*; 15 | 16 | /** 17 | * Default {@link GraphEncoder}: splits every input text into sentences and stores each sentence as a chain of word nodes in the {@code word graph}. 18 | */ 19 | public final class DefaultGraphEncoder implements GraphEncoder { 20 | 21 | private static final Logger logger = LoggerFactory.getLogger(DefaultGraphEncoder.class); 22 | 23 | private static final SentenceDetector DETECTOR = OpenNLP.getSentenceDetector(); 24 | /* Encodes all sentences inside a single transaction and returns the token count of the longest one. */ 25 | @Override 26 | public int encode(GraphDatabaseService graph, List sentences, Collection stopWords) { 27 | requireNonNull(graph, "'graph' is null"); 28 | requireNonNull(sentences, "'sentences' is null"); 29 | requireNonNull(stopWords, "'stopWords' is null"); 30 | 31 | int maxLength = 0; 32 | try (Transaction tx = graph.beginTx()) { 33 | long elapsed = System.nanoTime(); 34 | logger.debug("Starting encoding..."); 35 | int id = 0; /* running sentence id, shared by all the input texts */ 36 | for (String content : sentences) { /* NOTE(review): iterating Strings over a raw List presumes List<String>; the type arguments look stripped in this listing, confirm against the repository */ 37 | for (String sentence : DETECTOR.sentDetect(content)) { 38 | Token[] tokens = Token.parse(sentence); 39 | logger.debug("Encoding sentence #{} ({} word/s; punctuation is ignored)...", id, tokens.length); 40 | Node parent = graph.createNode(SENTENCE); 41 | parent.setProperty("id", id++); 42 | parent.setProperty("length", tokens.length); 43 | int pos; /* declared outside the loop because its final value is read after the loop */ 44 | Node previous = GraphModel.start(graph); /* every chain begins at the node returned by GraphModel.start */ 45 | for (pos = 0; pos < tokens.length; pos++) { 46 | Node current = tokens[pos].isStopWord(stopWords) ?
47 | getStopWord(graph, tokens, pos) : 48 | getWord(graph, tokens, pos); 49 | parent.createRelationshipTo(current, CONTAINS).setProperty("pos", pos); 50 | GraphEncoder.link(previous, current); 51 | previous = current; 52 | } 53 | GraphEncoder.link(previous, GraphModel.end(graph)); 54 | maxLength = Integer.max(pos, maxLength); 55 | } 56 | } 57 | elapsed = System.nanoTime() - elapsed; 58 | logger.info("Word graph generated in {} ms.", 59 | String.format("%,.3f", elapsed / 1_000_000.0)); /* nanoseconds to milliseconds, matching the "ms" label */ 60 | tx.success(); 61 | } 62 | return maxLength; 63 | } 64 | 65 | /** 66 | * Returns the existing stop-word node whose context best matches the token at {@code pos} (increasing its {@code freq}), or a new stop-word node when no candidate shares any context. 67 | */ 68 | private Node getStopWord(GraphDatabaseService graph, Token[] tokens, int pos) { 69 | Label label = tokens[pos].getLabel(); 70 | String text = tokens[pos].getText(); 71 | List<Context> contexts = new ArrayList<>(); 72 | try (ResourceIterator<Node> nodes = graph.findNodes(label, "text", text)) { /* closed as soon as the candidates are collected */ 73 | while (nodes.hasNext()) { 74 | Context context = getContext(graph, tokens, pos, nodes.next()); 75 | if (!context.isEmpty()) { 76 | contexts.add(context); 77 | } 78 | } 79 | } 80 | if (contexts.isEmpty()) { 81 | return GraphEncoder.word(graph, tokens[pos], true); 82 | } 83 | Collections.sort(contexts); /* best-matching context first */ 84 | Node node = contexts.get(0).getNode(); 85 | node.setProperty("freq", (double) node.getProperty("freq", 1.0) + 1.0); 86 | return node; 87 | } 88 | /* Returns the best-matching existing node for the token at pos (increasing its freq), or a new non-stop-word node when none exists. */ 89 | private Node getWord(GraphDatabaseService graph, Token[] tokens, int pos) { 90 | Label label = tokens[pos].getLabel(); 91 | String text = tokens[pos].getText(); 92 | List<Context> contexts = new ArrayList<>(); 93 | try (ResourceIterator<Node> nodes = graph.findNodes(label, "text", text)) { 94 | while (nodes.hasNext()) { 95 | contexts.add(getContext(graph, tokens, pos, nodes.next())); 96 | } 97 | } 98 | if (!contexts.isEmpty()) { 99 | /* unlike stop-words, any existing node is reused, even without shared context */ 100 | Collections.sort(contexts); 101 | Node node = contexts.get(0).getNode(); 102 | node.setProperty("freq", (double)
node.getProperty("freq", 1.0) + 1.0); 103 | return node; 104 | } 105 | return GraphEncoder.word(graph, tokens[pos], false); 106 | } 107 | /** Scores how well the neighbourhood of {@code node} matches the words around {@code tokens[pos]}: counts the shared texts and sums their accumulated frequencies, for both directions. */ 108 | private Context getContext(GraphDatabaseService graph, Token[] tokens, int pos, Node node) { 109 | int count = 0; 110 | double freq = 0.0; 111 | for (Direction direction : new Direction[]{Direction.INCOMING, Direction.OUTGOING}) { 112 | Collection<String> texts = getTextsFromToken(tokens, pos, direction, 3); 113 | if (!texts.isEmpty()) { 114 | Map<String, Double> freqTexts = getTextsFromNode(node, direction, 3); 115 | texts.retainAll(freqTexts.keySet()); /* keep only the texts found on both sides */ 116 | count += texts.size(); 117 | for (String t : texts) { 118 | freq += freqTexts.getOrDefault(t, 1.0); 119 | } 120 | } 121 | } 122 | return new Context(node, count, freq); 123 | } 124 | /** Collects the texts of up to {@code distance} tokens before ({@code INCOMING}) or after ({@code OUTGOING}) position {@code pos}; {@code BOTH} yields an empty set. */ 125 | private Collection<String> getTextsFromToken(Token[] tokens, int pos, Direction direction, int distance) { 126 | if (direction == Direction.BOTH) { 127 | return Collections.emptySet(); 128 | } 129 | Set<String> result = new HashSet<>(); 130 | int min = direction == Direction.INCOMING ? 131 | Integer.max(0, pos - distance) : 132 | pos + 1; 133 | int max = direction == Direction.INCOMING ?
134 | pos : Integer.min(tokens.length, pos + distance + 1); 135 | for (int i = min; i < max; i++) { 136 | result.add(tokens[i].getText()); 137 | } 138 | return result; 139 | } 140 | /** Walks the {@code FOLLOWS} relationships of {@code node} in the given {@code direction}, recursing with a decreasing {@code distance}, and accumulates per text the {@code freq} of the nodes met. */ 141 | private Map<String, Double> getTextsFromNode(Node node, Direction direction, int distance) { 142 | if (distance < 0) { 143 | return Collections.emptyMap(); 144 | } 145 | Map<String, Double> result = new HashMap<>(); 146 | for (Relationship relationship : node.getRelationships(FOLLOWS, direction)) { 147 | Node other = relationship.getOtherNode(node); 148 | String text = (String) other.getProperty("text", ""); /* nodes without a "text" property are skipped below */ 149 | double freq = (double) other.getProperty("freq", 1.0); 150 | if (!text.isEmpty()) { 151 | result.put(text, result.getOrDefault(text, 1.0) + freq); 152 | if (distance > 1) { 153 | Map<String, Double> map = getTextsFromNode(other, direction, distance - 1); 154 | for (String mapText : map.keySet()) { 155 | double mapFreq = map.get(mapText); 156 | result.put(mapText, result.getOrDefault(mapText, 1.0) + mapFreq); 157 | } 158 | } 159 | } 160 | } 161 | return result; 162 | } 163 | 164 | 165 | } 166 | -------------------------------------------------------------------------------- /src/main/java/org/stefano/distributional/model/components/GraphEncoder.java: -------------------------------------------------------------------------------- 1 | package org.stefano.distributional.model.components; 2 | 3 | import opennlp.tools.postag.POSTagger; 4 | import opennlp.tools.tokenize.Tokenizer; 5 | import org.neo4j.graphdb.*; 6 | import org.stefano.distributional.utils.OpenNLP; 7 | 8 | import java.util.*; 9 | 10 | import static java.util.Objects.requireNonNull; 11 | 12 | /** 13 | * This interface provides a method to encode some {@code sentences} into a {@code word graph}.
14 | */ 15 | public interface GraphEncoder { 16 | 17 | /** 18 | * Records one more {@code FOLLOWS} adjacency from {@code tail} to {@code head}: an existing 19 | * relationship gets its {@code freq} property increased by {@code 1.0}, otherwise a new 20 | * relationship is created with {@code freq} {@code 1.0}. 21 | * 22 | * @param tail the start {@link Node} of the link to handle 23 | * @param head the end {@link Node} of the link to handle 24 | * @return the existing relationship from {@code tail} to {@code head} with its {@code freq} 25 | * increased, or the newly created relationship with {@code freq} {@code 1.0} 26 | */ 27 | static Relationship link(Node tail, Node head) { 28 | for (Relationship edge : tail.getRelationships(GraphModel.FOLLOWS, Direction.OUTGOING)) { 29 | if (!edge.getOtherNode(tail).equals(head)) { 30 | continue; 31 | } 32 | edge.setProperty("freq", 1.0 + (double) edge.getProperty("freq", 1.0)); 33 | return edge; 34 | } 35 | Relationship created = tail.createRelationshipTo(head, GraphModel.FOLLOWS); 36 | created.setProperty("freq", 1.0); 37 | return created; 38 | } 39 | 40 | /** 41 | * Creates a {@link Node} in the given {@code graph} using the given {@code token} and {@code stopWord} flag.
42 | * 43 | * @param graph the {@link GraphDatabaseService} in which the node is created 44 | * @param token the {@link Token} providing label, tag, text and word of the node 45 | * @param stopWord the value stored in the {@code stop} property of the node 46 | * @return the newly created {@link Node}, with {@code freq} {@code 1.0} 47 | */ 48 | static Node word(GraphDatabaseService graph, Token token, boolean stopWord) { 49 | Node created = graph.createNode(GraphModel.WORD, token.getLabel()); 50 | boolean verb = token.getTag().startsWith("VB"); 51 | if (verb) { 52 | created.addLabel(GraphModel.VERB); 53 | } 54 | created.setProperty("text", token.getText()); 55 | created.setProperty("word", token.getWord()); 56 | created.setProperty("freq", 1.0); 57 | created.setProperty("stop", stopWord); 58 | return created; 59 | } 60 | 61 | 62 | /** 63 | * Encodes the given {@code sentences} as a {@code word graph} using the given {@code stopWords} 64 | * into the given {@code graph}, returning the length of the longest sentence. 65 | * Notice that punctuation is ignored and common words tend to build secondary paths. 66 | * 67 | * @param graph the {@link GraphDatabaseService} where the given {@code sentences} are going to be saved 68 | * @param sentences the {@link List} to be encoded into the given {@code graph} 69 | * @param stopWords the {@link Collection} to identify common words 70 | * @return the number of words of the longest sentence among the given {@code sentences} 71 | */ 72 | int encode(GraphDatabaseService graph, List sentences, Collection stopWords); 73 | 74 | /** 75 | * A {@code token} with (lower) text, word and POS tag.
76 | */ 77 | final class Token { 78 | 79 | private static final Tokenizer TOKENIZER = OpenNLP.getTokenizer(); 80 | private static final POSTagger TAGGER = OpenNLP.getPOSTagger(); 81 | private static final Map<String, Label> LABELS = new HashMap<>(); /* one shared Label per POS tag; a plain HashMap, so not safe for concurrent use */ 82 | private final String text; 83 | private final String word; 84 | private final String tag; 85 | 86 | private Token(String token, String tag) { 87 | token = requireNonNull(token, "'token' is null").trim(); 88 | if (token.isEmpty()) { 89 | throw new IllegalArgumentException("'token' is empty"); 90 | } 91 | this.tag = requireNonNull(tag, "'tag' is null").trim(); 92 | if (this.tag.isEmpty()) { 93 | throw new IllegalArgumentException("'tag' is empty"); 94 | } 95 | this.text = token.toLowerCase(Locale.ROOT); /* locale-independent lower-casing: the text is a lookup key and must not change with the default locale */ 96 | this.word = token; 97 | } 98 | /** Tokenizes and POS-tags the trimmed {@code sentence}, keeping only the tokens accepted by {@code isWord}. */ 99 | public static Token[] parse(String sentence) { 100 | sentence = requireNonNull(sentence, "'sentence' is null").trim(); 101 | if (sentence.isEmpty()) { 102 | throw new IllegalArgumentException("'sentence' is empty"); 103 | } 104 | 105 | String[] tokens = TOKENIZER.tokenize(sentence); 106 | String[] tags = TAGGER.tag(tokens); 107 | List<Token> result = new ArrayList<>(); 108 | for (int i = 0; i < tokens.length; i++) { 109 | if (isWord(tokens[i])) { 110 | Token token = new Token(tokens[i], tags[i]); 111 | result.add(token); 112 | } 113 | } 114 | return result.toArray(new Token[result.size()]); 115 | } 116 | /** Tells whether the trimmed {@code symbol} contains at least one letter, digit, apostrophe or hyphen. */ 117 | private static boolean isWord(String symbol) { 118 | symbol = requireNonNull(symbol, "'symbol' is null").trim(); 119 | if (symbol.isEmpty()) { 120 | throw new IllegalArgumentException("'symbol' is empty"); 121 | } 122 | 123 | return symbol.matches("^(?=.*[\\p{L}\\p{N}'-]).+$"); 124 | } 125 | 126 | public Label getLabel() { 127 | return LABELS.computeIfAbsent(tag, Label::label); 128 | } 129 | 130 | public String getText() { 131 | return text; 132 | } 133 | 134 | public String getWord() { 135 | return word; 136 | } 137 | 138 | public String getTag() { 139 | return tag; 140 | } 141 | 142 | public
boolean isStopWord(Collection stopWords) { 143 | requireNonNull(stopWords, "'stopWords' is null"); 144 | 145 | return stopWords.contains(text); 146 | } 147 | 148 | } 149 | 150 | /** 151 | * A {@code context} for a {@code word} with {@code matches} and {@code occurrences}; contexts sort best match first. 152 | */ 153 | final class Context implements Comparable<Context> { 154 | 155 | private final Node node; 156 | 157 | private final int matches; 158 | 159 | private final double occurrences; 160 | 161 | public Context(Node node, int matches, double occurrences) { 162 | this.node = requireNonNull(node, "'node' is null"); 163 | this.matches = matches; 164 | this.occurrences = occurrences; 165 | } 166 | 167 | public Node getNode() { 168 | return node; 169 | } 170 | 171 | public boolean isEmpty() { 172 | return matches <= 0; 173 | } 174 | /** Orders contexts by decreasing {@code matches}, then by decreasing {@code occurrences}, then by increasing node id. */ 175 | @Override 176 | public int compareTo(Context other) { 177 | requireNonNull(other, "'other' is null"); 178 | 179 | int result = Integer.compare(other.matches, this.matches); 180 | if (result == 0) { 181 | result = Double.compare(other.occurrences, this.occurrences); 182 | if (result == 0) { 183 | /* Deterministic tie-break on the node id: a clock-based result breaks the Comparable 184 | * contract (a.compareTo(b) and b.compareTo(a) must have opposite signs), so sorting could 185 | * fail or pick a different node on every run. */ 186 | result = Long.compare(node.getId(), other.node.getId()); 187 | } 188 | } 189 | 190 | return result; 191 | } 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License.
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------