├── .gitignore
├── README.md
├── input
└── content.txt
├── output
└── .gitignore
├── pom.xml
└── src
└── main
└── java
└── com
└── technobium
└── word2vec
└── Word2VecDemo.java
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
3 | # Mobile Tools for Java (J2ME)
4 | .mtj.tmp/
5 |
6 | # Package Files #
7 | *.jar
8 | *.war
9 | *.ear
10 |
11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
12 | hs_err_pid*
13 |
14 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
15 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
16 |
17 | # User-specific stuff:
18 | .idea/workspace.xml
19 | .idea/tasks.xml
20 |
21 | # Sensitive or high-churn files:
22 | .idea/dataSources/
23 | .idea/dataSources.ids
24 | .idea/dataSources.xml
25 | .idea/dataSources.local.xml
26 | .idea/sqlDataSources.xml
27 | .idea/dynamic.xml
28 | .idea/uiDesigner.xml
29 |
30 | # Gradle:
31 | .idea/gradle.xml
32 | .idea/libraries
33 |
34 | # Mongo Explorer plugin:
35 | .idea/mongoSettings.xml
36 |
37 | ## File-based project format:
38 | *.iws
39 |
40 | ## Plugin-specific files:
41 |
42 | # IntelliJ
43 | /out/
44 |
45 | # mpeltonen/sbt-idea plugin
46 | .idea_modules/
47 |
48 | # JIRA plugin
49 | atlassian-ide-plugin.xml
50 |
51 | # Crashlytics plugin (for Android Studio and IntelliJ)
52 | com_crashlytics_export_strings.xml
53 | crashlytics.properties
54 | crashlytics-build.properties
55 | fabric.properties
56 | .idea/
57 | target/
58 | word2vec-demo.iml
59 | output/word2vec.bin
60 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # dl4j-word2vec
2 | Deeplearning4j Word2Vec Demo
3 |
4 | Download the text of War and Peace from http://www.textfiles.com/etext/FICTION/warpeace.txt and paste it into the input/content.txt file before running the demo.
5 |
6 |
--------------------------------------------------------------------------------
/input/content.txt:
--------------------------------------------------------------------------------
1 | # TODO - copy content from http://www.textfiles.com/etext/FICTION/warpeace.txt
--------------------------------------------------------------------------------
/output/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 |
6 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | com.technobium
8 | dl4j-word2vec
9 | 1.0-SNAPSHOT
10 |
11 |
12 | 0.7.1
13 | 0.7.1
14 |
15 |
16 |
17 |
18 | org.deeplearning4j
19 | deeplearning4j-ui-model
20 | ${dl4j.version}
21 |
22 |
23 | org.deeplearning4j
24 | deeplearning4j-nlp
25 | ${dl4j.version}
26 |
27 |
28 | org.nd4j
29 | nd4j-native
30 | ${nd4j.version}
31 |
32 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/src/main/java/com/technobium/word2vec/Word2VecDemo.java:
--------------------------------------------------------------------------------
1 | package com.technobium.word2vec;
2 |
3 | import org.deeplearning4j.models.embeddings.learning.impl.elements.SkipGram;
4 | import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
5 | import org.deeplearning4j.models.word2vec.VocabWord;
6 | import org.deeplearning4j.models.word2vec.Word2Vec;
7 | import org.deeplearning4j.text.sentenceiterator.FileSentenceIterator;
8 | import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
9 | import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
10 | import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
11 | import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
12 |
13 | import java.io.File;
14 | import java.io.IOException;
15 | import java.util.Collection;
16 |
17 | public class Word2VecDemo {
18 |
19 | private String inputFilePath = "input/content.txt";
20 | private String modelFilePath = "output/word2vec.bin";
21 |
22 | public static void main(String[] args) throws IOException {
23 |
24 | Word2VecDemo word2VecDemo = new Word2VecDemo();
25 | // Before starting the training, don't forget to add the text into input/content.txt
26 | word2VecDemo.train();
27 |
28 | Word2Vec word2VecModel = WordVectorSerializer.readWord2VecModel(new File(word2VecDemo.modelFilePath));
29 |
30 | Collection list = word2VecModel.wordsNearest("boy" , 10);
31 | System.out.println(" boy: "+ list);
32 |
33 | list = word2VecModel.wordsNearest("girl" , 10);
34 | System.out.println("girl: " + list);
35 |
36 | Collection stringList = word2VecModel.wordsNearest("day", 10);
37 | System.out.println(" day: " + stringList);
38 | }
39 |
40 | public void train() throws IOException {
41 | SentenceIterator sentenceIterator = new FileSentenceIterator(new File(inputFilePath));
42 | TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
43 | tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
44 |
45 | Word2Vec vec = new Word2Vec.Builder()
46 | .minWordFrequency(2)
47 | .layerSize(300)
48 | .windowSize(5)
49 | .seed(42)
50 | .epochs(3)
51 | .elementsLearningAlgorithm(new SkipGram())
52 | .iterate(sentenceIterator)
53 | .tokenizerFactory(tokenizerFactory)
54 | .build();
55 | vec.fit();
56 |
57 | WordVectorSerializer.writeWordVectors(vec, "output/word2vec.bin");
58 | }
59 | }
60 |
--------------------------------------------------------------------------------