nodes = graph.findNodes(label);
56 | if (nodes.hasNext()) {
57 | Node node = nodes.next();
58 | double freq = (double) node.getProperty("freq", 1.0);
59 | node.setProperty("freq", 1.0 + freq);
60 | return node;
61 | }
62 | Node node = graph.createNode(label);
63 | node.setProperty("freq", 1.0);
64 | return node;
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/model/components/impl/AdvancedGraphWeigher.java:
--------------------------------------------------------------------------------
1 | package org.stefano.distributional.model.components.impl;
2 |
3 | import org.neo4j.graphdb.*;
4 | import org.slf4j.Logger;
5 | import org.slf4j.LoggerFactory;
6 | import org.stefano.distributional.model.components.GraphWeigher;
7 |
8 | import static java.util.Objects.requireNonNull;
9 | import static org.stefano.distributional.model.components.GraphModel.CONTAINS;
10 | import static org.stefano.distributional.model.components.GraphModel.FOLLOWS;
11 |
12 | /**
13 | * This class provides an advanced method to weight the {@code FOLLOWS} relationships in a {@code word graph}.
14 | * This method generate weights that are inversely proportional to the frequency of paths scaled down by their length.
15 | */
16 | public final class AdvancedGraphWeigher implements GraphWeigher {
17 |
18 | private static final Logger logger = LoggerFactory.getLogger(AdvancedGraphWeigher.class);
19 |
20 | @Override
21 | public void weight(GraphDatabaseService graph) {
22 | requireNonNull(graph, "'graph' is null");
23 |
24 | int total = 0;
25 | try (Transaction tx = graph.beginTx()) {
26 | long elapsed = System.nanoTime();
27 | logger.debug("Computing weights between words...");
28 | for (Relationship follows : graph.getAllRelationships()) {
29 | if (follows.isType(FOLLOWS)) {
30 | Node tail = follows.getStartNode();
31 | Node head = follows.getEndNode();
32 | double freqTail = (double) tail.getProperty("freq", 1.0);
33 | double freqHead = (double) head.getProperty("freq", 1.0);
34 | double denom = 0.0;
35 | for (Relationship containsTail : tail.getRelationships(CONTAINS, Direction.INCOMING)) {
36 | int posTail = (int) containsTail.getProperty("pos", 0);
37 | Node sentence = containsTail.getStartNode();
38 | for (Relationship containsHead : sentence.getRelationships(CONTAINS, Direction.OUTGOING)) {
39 | if (containsHead.getEndNode().equals(head)) {
40 | int posHead = (int) containsHead.getProperty("pos", 0);
41 | denom += 1.0 / (posHead - posTail);
42 | }
43 | }
44 | }
45 | double weight = (freqTail + freqHead) / denom;
46 | weight = (weight) / (freqTail * freqHead);
47 | follows.setProperty("weight", weight);
48 | total += 1;
49 | if (total % 50 == 0) {
50 | logger.debug("{} relationships analysed so far...", total);
51 | }
52 | }
53 | }
54 | elapsed = System.nanoTime() - elapsed;
55 | logger.info("{} relationship/s analysed in {} ms.",
56 | total, String.format("%,.3f", elapsed / 1_000_000_000.0));
57 | tx.success();
58 | }
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### JetBrains template
3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
5 |
6 | # User-specific stuff:
7 | .idea
8 | .idea/workspace.xml
9 | .idea/tasks.xml
10 | .idea/dictionaries
11 | .idea/vcs.xml
12 | .idea/jsLibraryMappings.xml
13 |
14 | # Sensitive or high-churn files:
15 | .idea/dataSources.ids
16 | .idea/dataSources.xml
17 | .idea/dataSources.local.xml
18 | .idea/sqlDataSources.xml
19 | .idea/dynamic.xml
20 | .idea/uiDesigner.xml
21 |
22 | # Gradle:
23 | .idea/gradle.xml
24 | .idea/libraries
25 |
26 | # Mongo Explorer plugin:
27 | .idea/mongoSettings.xml
28 |
29 | ## File-based project format:
30 | *.iws
31 |
32 | ## Plugin-specific files:
33 |
34 | # IntelliJ
35 | /out/
36 |
37 | # mpeltonen/sbt-idea plugin
38 | .idea_modules/
39 |
40 | # JIRA plugin
41 | atlassian-ide-plugin.xml
42 |
43 | # Crashlytics plugin (for Android Studio and IntelliJ)
44 | com_crashlytics_export_strings.xml
45 | crashlytics.properties
46 | crashlytics-build.properties
47 | fabric.properties
48 | ### Java template
49 | *.class
50 |
51 | # Mobile Tools for Java (J2ME)
52 | .mtj.tmp/
53 |
54 | # Package Files #
55 | *.jar
56 | *.war
57 | *.ear
58 |
59 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
60 | hs_err_pid*
61 | ### Maven template
62 | target/
63 | pom.xml.tag
64 | pom.xml.releaseBackup
65 | pom.xml.versionsBackup
66 | pom.xml.next
67 | release.properties
68 | dependency-reduced-pom.xml
69 | buildNumber.properties
70 | .mvn/timing.properties
71 | ### OSX template
72 | *.DS_Store
73 | .AppleDouble
74 | .LSOverride
75 |
76 | # Icon must end with two \r
77 | Icon
78 |
79 | # Thumbnails
80 | ._*
81 |
82 | # Files that might appear in the root of a volume
83 | .DocumentRevisions-V100
84 | .fseventsd
85 | .Spotlight-V100
86 | .TemporaryItems
87 | .Trashes
88 | .VolumeIcon.icns
89 | .com.apple.timemachine.donotpresent
90 |
91 | # Directories potentially created on remote AFP share
92 | .AppleDB
93 | .AppleDesktop
94 | Network Trash Folder
95 | Temporary Items
96 | .apdisk
97 | ### Windows template
98 | # Windows image file caches
99 | Thumbs.db
100 | ehthumbs.db
101 |
102 | # Folder config file
103 | Desktop.ini
104 |
105 | # Recycle Bin used on file shares
106 | $RECYCLE.BIN/
107 |
108 | # Windows Installer files
109 | *.cab
110 | *.msi
111 | *.msm
112 | *.msp
113 |
114 | # Windows shortcuts
115 | *.lnk
116 | ### Linux template
117 | *~
118 |
119 | # temporary files which can be created if a process still has a handle open of a deleted file
120 | .fuse_hidden*
121 |
122 | # KDE directory preferences
123 | .directory
124 |
125 | # Linux trash folder which might appear on any partition or disk
126 | .Trash-*
127 | ### Gradle template
128 | .gradle
129 | build/
130 |
131 | # Ignore Gradle GUI config
132 | gradle-app.setting
133 |
134 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored)
135 | !gradle-wrapper.jar
136 |
137 | # Cache of project
138 | .gradletasknamecache
139 |
140 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898
141 | # gradle/wrapper/gradle-wrapper.properties
142 | .idea/
143 | src/test/
144 |
--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/Main.java:
--------------------------------------------------------------------------------
1 | package org.stefano.distributional;
2 |
3 | import org.slf4j.Logger;
4 | import org.slf4j.LoggerFactory;
5 | import org.stefano.distributional.model.Summarizer;
6 | import org.stefano.distributional.model.components.impl.AdvancedGraphWeigher;
7 | import org.stefano.distributional.model.components.impl.DefaultGraphEncoder;
8 | import org.stefano.distributional.model.components.impl.DefaultPathCompressor;
9 | import org.stefano.distributional.utils.OpenNLP;
10 |
11 | import java.io.IOException;
12 | import java.nio.file.Path;
13 | import java.nio.file.Paths;
14 | import java.util.Arrays;
15 | import java.util.Collection;
16 | import java.util.List;
17 | import java.util.Optional;
18 |
19 | /**
20 | * TODO Replace with proper description...
21 | *
22 | * Created by stefano on 23/01/2017.
23 | */
24 | public class Main {
25 |
26 | private static final Logger logger = LoggerFactory.getLogger(Main.class);
27 |
28 | public static void main(String[] args) throws IOException {
29 | Path folder = Paths.get(args[0]);
30 |
31 | List sentences = Arrays.asList(
32 | "The wife of a former U.S. president Bill Clinton, Hillary Clinton, visited China last Monday.",
33 | "Hillary Clinton wanted to visit China last month but postponed her plans till Monday last week.",
34 | "Hillary Clinton paid a visit to the People Republic of China on Monday.",
35 | "Last week the Secretary State Ms. Clinton visited Chinese officials.");
36 |
37 | Collection stopWords = Arrays.asList("a", "able", "about", "above", "after", "all", "also", "an",
38 | "and", "any", "as", "ask", "at", "back", "bad", "be", "because", "beneath", "big", "but", "by",
39 | "call", "can", "case", "child", "come", "company", "could", "day", "different", "do", "early", "even",
40 | "eye", "fact", "feel", "few", "find", "first", "for", "from", "get", "give", "go", "good",
41 | "government", "great", "group", "hand", "have", "he", "her", "high", "him", "his", "how", "i", "if",
42 | "important", "in", "into", "it", "its", "just", "know", "large", "last", "leave", "life", "like",
43 | "little", "long", "look", "make", "man", "me", "most", "my", "new", "next", "no", "not", "now",
44 | "number", "of", "old", "on", "one", "only", "or", "other", "our", "out", "over", "own", "part",
45 | "people", "person", "place", "point", "problem", "public", "right", "same", "say", "see", "seem",
46 | "she", "small", "so", "some", "take", "tell", "than", "that", "the", "their", "them", "then", "there",
47 | "these", "they", "thing", "think", "this", "time", "to", "try", "two", "under", "up", "us", "use",
48 | "want", "way", "we", "week", "well", "what", "when", "which", "who", "will", "with", "woman", "work",
49 | "world", "would", "year", "you", "young", "your");
50 |
51 | Summarizer summarizer = Summarizer.builder()
52 | .on(folder)
53 | .withEncoder(new DefaultGraphEncoder())
54 | .withWeigher(new AdvancedGraphWeigher())
55 | .withCompressor(new DefaultPathCompressor())
56 | .build();
57 | Optional summary = summarizer.process(sentences, stopWords);
58 | if (summary.isPresent()) {
59 | System.out.println(" >> " +summary.get());
60 | } else {
61 | logger.info("No summary available.");
62 | }
63 | logger.info("Done.");
64 | }
65 |
66 | }
67 |
--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/utils/OpenNLP.java:
--------------------------------------------------------------------------------
1 | package org.stefano.distributional.utils;
2 |
3 | import opennlp.tools.postag.POSModel;
4 | import opennlp.tools.postag.POSTagger;
5 | import opennlp.tools.postag.POSTaggerME;
6 | import opennlp.tools.sentdetect.SentenceDetector;
7 | import opennlp.tools.sentdetect.SentenceDetectorME;
8 | import opennlp.tools.sentdetect.SentenceModel;
9 | import opennlp.tools.tokenize.Tokenizer;
10 | import opennlp.tools.tokenize.TokenizerME;
11 | import opennlp.tools.tokenize.TokenizerModel;
12 | import org.slf4j.Logger;
13 | import org.slf4j.LoggerFactory;
14 |
15 | import java.io.IOException;
16 | import java.io.InputStream;
17 |
18 | /**
19 | * TODO Replace with proper description...
20 | *
21 | * Created by stefano on 23/01/2017.
22 | */
23 | public class OpenNLP {
24 |
25 | private static final Logger logger = LoggerFactory.getLogger(OpenNLP.class);
26 | private static SentenceDetector detector = null;
27 | private static Tokenizer tokenizer = null;
28 | private static POSTagger tagger = null;
29 |
30 | private OpenNLP() {
31 | throw new UnsupportedOperationException("'OpenNLP' class should not be instantiated");
32 | }
33 |
34 | public static SentenceDetector getSentenceDetector() {
35 | if (detector == null) {
36 | InputStream stream = OpenNLP.class.getResourceAsStream("/en-sent.bin");
37 | try {
38 | SentenceModel model = new SentenceModel(stream);
39 | detector = new SentenceDetectorME(model);
40 | logger.info("OpenNLP sentence detector lazily initialised");
41 | } catch (IOException e) {
42 | e.printStackTrace();
43 | } finally {
44 | if (stream != null) {
45 | try {
46 | stream.close();
47 | } catch (IOException e) {
48 | e.printStackTrace();
49 | }
50 | }
51 | }
52 | }
53 | return detector;
54 | }
55 |
56 | public static Tokenizer getTokenizer() {
57 | if (tokenizer == null) {
58 | InputStream stream = OpenNLP.class.getResourceAsStream("/en-token.bin");
59 | try {
60 | TokenizerModel model = new TokenizerModel(stream);
61 | tokenizer = new TokenizerME(model);
62 | logger.info("OpenNLP tokenizer lazily initialised");
63 | } catch (IOException e) {
64 | e.printStackTrace();
65 | } finally {
66 | if (stream != null) {
67 | try {
68 | stream.close();
69 | } catch (IOException e) {
70 | e.printStackTrace();
71 | }
72 | }
73 | }
74 | }
75 | return tokenizer;
76 | }
77 |
78 | public static POSTagger getPOSTagger() {
79 | if (tagger == null) {
80 | InputStream stream = OpenNLP.class.getResourceAsStream("/en-pos-maxent.bin");
81 | try {
82 | POSModel model = new POSModel(stream);
83 | tagger = new POSTaggerME(model);
84 | logger.info("OpenNLP POS tagger lazily initialised");
85 | } catch (IOException e) {
86 | e.printStackTrace();
87 | } finally {
88 | if (stream != null) {
89 | try {
90 | stream.close();
91 | } catch (IOException e) {
92 | e.printStackTrace();
93 | }
94 | }
95 | }
96 | }
97 | return tagger;
98 | }
99 |
100 | }
101 |
--------------------------------------------------------------------------------
/src/main/java/org/stefano/distributional/model/components/PathCompressor.java:
--------------------------------------------------------------------------------
1 | package org.stefano.distributional.model.components;
2 |
3 | import org.neo4j.graphdb.*;
4 |
5 | import java.util.Optional;
6 |
7 | import static java.util.Objects.requireNonNull;
8 |
9 | /**
10 | * This interface provides a method to generate a compressive summary from a {@code word graph}.
11 | */
12 | public interface PathCompressor {
13 |
14 | PathExpander