├── .gitignore ├── README.md ├── input └── content.txt ├── output └── .gitignore ├── pom.xml └── src └── main └── java └── com └── technobium └── word2vec └── Word2VecDemo.java /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Mobile Tools for Java (J2ME) 4 | .mtj.tmp/ 5 | 6 | # Package Files # 7 | *.jar 8 | *.war 9 | *.ear 10 | 11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 12 | hs_err_pid* 13 | 14 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 15 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 16 | 17 | # User-specific stuff: 18 | .idea/workspace.xml 19 | .idea/tasks.xml 20 | 21 | # Sensitive or high-churn files: 22 | .idea/dataSources/ 23 | .idea/dataSources.ids 24 | .idea/dataSources.xml 25 | .idea/dataSources.local.xml 26 | .idea/sqlDataSources.xml 27 | .idea/dynamic.xml 28 | .idea/uiDesigner.xml 29 | 30 | # Gradle: 31 | .idea/gradle.xml 32 | .idea/libraries 33 | 34 | # Mongo Explorer plugin: 35 | .idea/mongoSettings.xml 36 | 37 | ## File-based project format: 38 | *.iws 39 | 40 | ## Plugin-specific files: 41 | 42 | # IntelliJ 43 | /out/ 44 | 45 | # mpeltonen/sbt-idea plugin 46 | .idea_modules/ 47 | 48 | # JIRA plugin 49 | atlassian-ide-plugin.xml 50 | 51 | # Crashlytics plugin (for Android Studio and IntelliJ) 52 | com_crashlytics_export_strings.xml 53 | crashlytics.properties 54 | crashlytics-build.properties 55 | fabric.properties 56 | .idea/ 57 | target/ 58 | word2vec-demo.iml 59 | output/word2vec.bin 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dl4j-word2vec 2 | Deeplearning4j Word2Vec Demo 3 | 4 | Get the content of the War and Peace book from the following location 
http://www.textfiles.com/etext/FICTION/warpeace.txt. Copy and paste it into input/content.txt file. 5 | 6 | -------------------------------------------------------------------------------- /input/content.txt: -------------------------------------------------------------------------------- 1 | # TODO - copy content from http://www.textfiles.com/etext/FICTION/warpeace.txt -------------------------------------------------------------------------------- /output/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | 6 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.technobium 8 | dl4j-word2vec 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 0.7.1 13 | 0.7.1 14 | 15 | 16 | 17 | 18 | org.deeplearning4j 19 | deeplearning4j-ui-model 20 | ${dl4j.version} 21 | 22 | 23 | org.deeplearning4j 24 | deeplearning4j-nlp 25 | ${dl4j.version} 26 | 27 | 28 | org.nd4j 29 | nd4j-native 30 | ${nd4j.version} 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /src/main/java/com/technobium/word2vec/Word2VecDemo.java: -------------------------------------------------------------------------------- 1 | package com.technobium.word2vec; 2 | 3 | import org.deeplearning4j.models.embeddings.learning.impl.elements.SkipGram; 4 | import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; 5 | import org.deeplearning4j.models.word2vec.VocabWord; 6 | import org.deeplearning4j.models.word2vec.Word2Vec; 7 | import org.deeplearning4j.text.sentenceiterator.FileSentenceIterator; 8 | import org.deeplearning4j.text.sentenceiterator.SentenceIterator; 9 | import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; 10 | import 
org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

import java.io.File;
import java.io.IOException;
import java.util.Collection;

/**
 * Small Deeplearning4j Word2Vec demo: trains a model on the text in
 * {@code input/content.txt}, stores the word vectors in
 * {@code output/word2vec.bin}, reloads them and prints the ten nearest words
 * for a few sample terms.
 */
public class Word2VecDemo {

    /** Text corpus the model is trained on. */
    private final String inputFilePath = "input/content.txt";

    /** Location the trained vectors are written to and read back from. */
    private final String modelFilePath = "output/word2vec.bin";

    /**
     * Trains the model, reloads it from disk and prints the nearest words for
     * "boy", "girl" and "day".
     *
     * @param args ignored
     * @throws IOException if the corpus is missing or the model cannot be written
     */
    public static void main(String[] args) throws IOException {

        Word2VecDemo word2VecDemo = new Word2VecDemo();
        // Before starting the training, don't forget to add the text into input/content.txt
        word2VecDemo.train();

        Word2Vec word2VecModel =
                WordVectorSerializer.readWord2VecModel(new File(word2VecDemo.modelFilePath));

        Collection<String> list = word2VecModel.wordsNearest("boy", 10);
        System.out.println(" boy: " + list);

        list = word2VecModel.wordsNearest("girl", 10);
        System.out.println("girl: " + list);

        Collection<String> stringList = word2VecModel.wordsNearest("day", 10);
        System.out.println(" day: " + stringList);
    }

    /**
     * Fits a skip-gram Word2Vec model on {@link #inputFilePath} and writes the
     * resulting word vectors to {@link #modelFilePath}.
     *
     * @throws IOException if the input file does not exist or the vectors
     *                     cannot be written
     */
    public void train() throws IOException {
        File inputFile = new File(inputFilePath);
        if (!inputFile.exists()) {
            // Fail early with a clear message instead of deep inside the iterator.
            throw new IOException("Input file not found: " + inputFile.getAbsolutePath());
        }

        SentenceIterator sentenceIterator = new FileSentenceIterator(inputFile);
        try {
            TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
            tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

            Word2Vec vec = new Word2Vec.Builder()
                    .minWordFrequency(2)
                    .layerSize(300)
                    .windowSize(5)
                    .seed(42)
                    .epochs(3)
                    .elementsLearningAlgorithm(new SkipGram<VocabWord>())
                    .iterate(sentenceIterator)
                    .tokenizerFactory(tokenizerFactory)
                    .build();
            vec.fit();

            // Write to the same path main() reads from, not a second hard-coded copy.
            WordVectorSerializer.writeWordVectors(vec, modelFilePath);
        } finally {
            // Let the iterator release whatever it holds on the input file.
            sentenceIterator.finish();
        }
    }
}