├── Chapter01 └── Chapter1.java ├── Chapter02 ├── Chapter2.java ├── MyTokenizerFactory.java ├── PorterStemmer.java ├── StemmingLemaEx.java ├── StopWords.java └── Test.java ├── Chapter03 ├── Chapter3.java ├── SBDDemo.java ├── XMLProcessingDemo.java └── XMLTest.xml ├── Chapter04 ├── NERDemo.java ├── build.xml ├── build │ └── classes │ │ ├── .netbeans_automatic_build │ │ ├── .netbeans_update_resources │ │ └── packt │ │ ├── Chapter4.class │ │ ├── DictionaryChunker.class │ │ ├── EmailRegexChunker.class │ │ ├── RunChunker.class │ │ ├── TimeRegexChunker.class │ │ └── TrainEntities.class ├── en-ner-all.train ├── en-ner-person.eval ├── en-ner-person.train ├── manifest.mf ├── modelFile ├── nbproject │ ├── build-impl.xml │ ├── genfiles.properties │ ├── private │ │ ├── config.properties │ │ ├── private.properties │ │ └── private.xml │ ├── project.properties │ └── project.xml ├── old │ ├── Chapter4.java │ └── TimeRegexChunker.java └── src │ └── packt │ ├── Chapter4.java │ ├── DictionaryChunker.java │ ├── EmailRegexChunker.java │ ├── RunChunker.java │ ├── TimeRegexChunker.java │ └── TrainEntities.java ├── Chapter05 └── Chapter5.java ├── Chapter06 ├── Chapter6.java ├── GloveExample.java ├── NGramTest.java ├── box.prop ├── box.test ├── box.train ├── en-animal.model └── en-animal.train ├── Chapter07 ├── Chapter7.java └── President.java ├── Chapter08 ├── Chapter8.java ├── Positions.java ├── StopWords.java └── Word.java ├── Chapter09 └── TestMallet.java ├── Chapter10 ├── CoreferenceDemo.java ├── DemoParsing.java ├── President.java ├── StanfordLexicalDemo.java └── WordDependencyDemo.java ├── Chapter11 ├── HTMLExtractorDemo.java ├── PDFExtractor.java ├── PipelineDemo.java ├── SearchText.java ├── TikaDemo.java ├── Word.java └── WordDocExtractor.java ├── Chapter12 ├── GenerateAIML.java ├── Mychatbotdemo.java ├── Test.java ├── TestClass.java └── mybot.zip ├── LICENSE └── README.md /Chapter01/Chapter1.java: -------------------------------------------------------------------------------- 1 
// ===== Chapter01/Chapter1.java =====
package packt;

import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import opennlp.tools.cmdline.postag.POSModelLoader;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.Span;

/**
 * Chapter 1 demos: tokenization, sentence detection, named-entity finding,
 * POS tagging and parsing with OpenNLP, Stanford CoreNLP and LingPipe.
 * Each private method is a self-contained example driven from main().
 */
public class Chapter1 {

    public static void main(String[] args) {
        // Uncomment one demo at a time; several need model files whose
        // paths are hard-coded inside the individual methods.
        // apacheOpenNLPExample();
        // stanfordNLPExample();
        lingpipeExamples();
        // findingPartsOfText();
        // findingSentences();
        // findingPeopleAndThings();
        // nameFinderExample();
        // detectingPartsOfSpeechExample();
        // extractingRelationshipsExample();
    }

    /**
     * Tokenizes a sample sentence with OpenNLP's maximum-entropy tokenizer.
     * Requires en-token.bin under C:\OpenNLP Models.
     */
    private static void apacheOpenNLPExample() {
        try (InputStream is = new FileInputStream(
                new File("C:\\OpenNLP Models", "en-token.bin"))) {
            TokenizerModel model = new TokenizerModel(is);
            Tokenizer tokenizer = new TokenizerME(model);
            String[] tokens = tokenizer.tokenize("He lives at 1511 W. Randolph.");
            for (String a : tokens) {
                System.out.print("[" + a + "] ");
            }
            System.out.println();
        } catch (IOException ex) {
            // FileNotFoundException is an IOException, so this single catch
            // replaces the original pair of duplicate handlers.
            ex.printStackTrace();
        }
    }

    /** Tokenizes the same sample sentence with Stanford's PTBTokenizer. */
    private static void stanfordNLPExample() {
        PTBTokenizer ptb = new PTBTokenizer(
                new StringReader("He lives at 1511 W. Randolph."),
                new CoreLabelTokenFactory(), null);
        while (ptb.hasNext()) {
            System.out.println(ptb.next());
        }
    }

    /** Tokenizes sample text with LingPipe's Indo-European tokenizer. */
    private static void lingpipeExamples() {
        // Typed lists replace the original raw List declarations.
        List<String> tokenList = new ArrayList<>();
        // Receives the whitespace runs between tokens.
        List<String> whiteList = new ArrayList<>();
        String text = "A sample sentence processed \nby \tthe "
                + "LingPipe tokenizer.";
        com.aliasi.tokenizer.Tokenizer tokenizer = IndoEuropeanTokenizerFactory.INSTANCE.
                tokenizer(text.toCharArray(), 0, text.length());
        tokenizer.tokenize(tokenList, whiteList);
        for (String element : tokenList) {
            System.out.print(element + " ");
        }
        System.out.println();
    }

    /** Naive whitespace tokenization via String.split. */
    private static void splitMethodDemonstration() {
        // NOTE(review): identical to findingPartsOfText(); both kept so the
        // file continues to match the book text.
        String text = "Mr. Smith went to 123 Washington avenue.";
        String[] tokens = text.split("\\s+");
        for (String token : tokens) {
            System.out.println(token);
        }
    }

    /** Naive whitespace tokenization via String.split. */
    private static void findingPartsOfText() {
        String text = "Mr. Smith went to 123 Washington avenue.";
        String[] tokens = text.split("\\s+");
        for (String token : tokens) {
            System.out.println(token);
        }
    }

    /** Splits a paragraph into sentences with Stanford's DocumentPreprocessor. */
    private static void findingSentences() {
        String paragraph = "The first sentence. The second sentence.";
        Reader reader = new StringReader(paragraph);
        DocumentPreprocessor documentPreprocessor
                = new DocumentPreprocessor(reader);
        // Typed collections replace the original raw LinkedList/List usage.
        List<String> sentenceList = new LinkedList<>();
        for (List<HasWord> element : documentPreprocessor) {
            StringBuilder sentence = new StringBuilder();
            for (HasWord token : element) {
                sentence.append(token).append(" ");
            }
            sentenceList.add(sentence.toString());
        }
        for (String sentence : sentenceList) {
            System.out.println(sentence);
        }
    }

    /** "Entity" lookup by plain substring search; prints the match index. */
    private static void findingPeopleAndThings() {
        String text = "Mr. Smith went to 123 Washington avenue.";
        String target = "Washington";
        int index = text.indexOf(target);
        System.out.println(index);
    }

    /**
     * Finds person names with OpenNLP's NameFinderME.
     * Requires en-ner-person.bin under C:\OpenNLP Models.
     */
    private static void nameFinderExample() {
        try {
            String[] sentences = {
                "Tim was a good neighbor. Perhaps not as good a Bob "
                + "Haywood, but still pretty good. Of course Mr. Adam "
                + "took the cake!"};
            Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
            TokenNameFinderModel model = new TokenNameFinderModel(new File(
                    "C:\\OpenNLP Models", "en-ner-person.bin"));
            NameFinderME finder = new NameFinderME(model);

            for (String sentence : sentences) {
                // Split the sentence into tokens
                String[] tokens = tokenizer.tokenize(sentence);

                // Find the names in the tokens and return Span objects
                Span[] nameSpans = finder.find(tokens);

                // Print the names extracted from the tokens using the Span data
                System.out.println(Arrays.toString(
                        Span.spansToStrings(nameSpans, tokens)));
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Tags a sentence with OpenNLP's POS tagger and prints token/tag pairs.
     * Requires en-pos-maxent.bin at the hard-coded path.
     */
    private static void detectingPartsOfSpeechExample() {
        String sentence = "POS processing is useful for enhancing the "
                + "quality of data sent to other elements of a pipeline.";

        POSModel model = new POSModelLoader()
                .load(new File("C:/Current Books/NLP and Java/Models/", "en-pos-maxent.bin"));
        POSTaggerME tagger = new POSTaggerME(model);

        String[] tokens = WhitespaceTokenizer.INSTANCE
                .tokenize(sentence);
        String[] tags = tagger.tag(tokens);

        POSSample sample = new POSSample(tokens, tags);
        String[] posTokens = sample.getSentence();
        String[] posTags = sample.getTags();
        for (int i = 0; i < posTokens.length; i++) {
            // FIX: trailing space added; the printed pairs previously ran
            // together with no separator between iterations.
            System.out.print(posTokens[i] + " - " + posTags[i] + " ");
        }
        System.out.println();

        for (int i = 0; i < tokens.length; i++) {
            System.out.print(tokens[i] + "[" + tags[i] + "] ");
        }
    }

    /** Parses a sentence through a Stanford CoreNLP pipeline and pretty-prints it. */
    private static void extractingRelationshipsExample() {
        Properties properties = new Properties();
        properties.put("annotators", "tokenize, ssplit, parse");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(properties);
        Annotation annotation = new Annotation(
                "The meaning and purpose of life is plain to see.");
        pipeline.annotate(annotation);
        pipeline.prettyPrint(annotation, System.out);
    }
}

// ===== Chapter02/MyTokenizerFactory.java =====
package packt;

import opennlp.tools.tokenize.TokenizerFactory;

/**
 * Empty placeholder subclass of OpenNLP's TokenizerFactory; inherits all
 * default behavior unchanged.
 */
public class MyTokenizerFactory extends TokenizerFactory {

}
// ===== Chapter02/PorterStemmer.java =====
package packt;

//package opennlp.tools.stemmer;

import java.util.Arrays;

import opennlp.tools.stemmer.Stemmer;


/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*

   Porter stemmer in Java. The original paper is in

       Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
       no. 3, pp 130-137,

   See also http://www.tartarus.org/~martin/PorterStemmer/index.html

   Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
   The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
   is then outside the bounds of b.

   Similarly,

   Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
   'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
   b[j] is then outside the bounds of b.

   Release 3.

   [ This version is derived from Release 3, modified by Brian Goetz to
   optimize for fewer object creations. ]

*/

/**
 *
 * Stemmer, implementing the Porter Stemming Algorithm
 *
 * The Stemmer class transforms a word into its root form.  The input
 * word can be provided a character at time (by calling add()), or at once
 * by calling one of the various stem(something) methods.
 */

class PorterStemmer implements Stemmer {
  private char[] b;                     // working buffer holding the word
  private int i,    /* offset into b (number of characters in the word) */
          j, k, k0; // algorithm cursors: k0..k is the word; j marks a suffix split
  private boolean dirty = false;        // true once the buffer differs from the input
  private static final int INC = 50;    // buffer growth increment

  public PorterStemmer() {
    b = new char[INC];
    i = 0;
  }

  /**
   * reset() resets the stemmer so it can stem another word.  If you invoke
   * the stemmer by calling add(char) and then stem(), you must call reset()
   * before starting another word.
   */
  public void reset() { i = 0; dirty = false; }

  /**
   * Add a character to the word being stemmed.  When you are finished
   * adding characters, you can call stem(void) to process the word.
   */
  public void add(char ch) {
    if (b.length == i) {
      // Grow the buffer. Arrays.copyOf replaces the original manual copy
      // loop (which carried a stray, misleading brace block).
      b = Arrays.copyOf(b, i + INC);
    }
    b[i++] = ch;
  }

  /**
   * After a word has been stemmed, it can be retrieved by toString(),
   * or a reference to the internal buffer can be retrieved by getResultBuffer
   * and getResultLength (which is generally more efficient.)
   */
  @Override
  public String toString() { return new String(b, 0, i); }

  /**
   * Returns the length of the word resulting from the stemming process.
   */
  public int getResultLength() { return i; }

  /**
   * Returns a reference to a character buffer containing the results of
   * the stemming process.  You also need to consult getResultLength()
   * to determine the length of the result.
   */
  public char[] getResultBuffer() { return b; }

  /* cons(i) is true <=> b[i] is a consonant. */

  private final boolean cons(int i) {
    switch (b[i]) {
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return false;
    case 'y':
      // 'y' at the word start counts as a consonant; elsewhere it is a
      // consonant exactly when the preceding letter is not.
      return (i == k0) ? true : !cons(i - 1);
    default:
      return true;
    }
  }

  /* m() measures the number of consonant sequences between k0 and j. if c is
     a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
     presence,

          <c><v>       gives 0
          <c>vc<v>     gives 1
          <c>vcvc<v>   gives 2
          <c>vcvcvc<v> gives 3
          ....
  */

  private final int m() {
    int n = 0;
    int i = k0;
    while (true) {
      if (i > j)
        return n;
      if (! cons(i))
        break;
      i++;
    }
    i++;
    while (true) {
      while (true) {
        if (i > j)
          return n;
        if (cons(i))
          break;
        i++;
      }
      i++;
      n++;
      while (true) {
        if (i > j)
          return n;
        if (! cons(i))
          break;
        i++;
      }
      i++;
    }
  }

  /* vowelinstem() is true <=> k0,...j contains a vowel */

  private final boolean vowelinstem() {
    int i;
    for (i = k0; i <= j; i++)
      if (! cons(i))
        return true;
    return false;
  }

  /* doublec(j) is true <=> j,(j-1) contain a double consonant. */

  private final boolean doublec(int j) {
    if (j < k0 + 1)
      return false;
    if (b[j] != b[j - 1])
      return false;
    return cons(j);
  }

  /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
     and also if the second c is not w,x or y. this is used when trying to
     restore an e at the end of a short word. e.g.

          cav(e), lov(e), hop(e), crim(e), but
          snow, box, tray.

  */

  private final boolean cvc(int i) {
    if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2))
      return false;
    else {
      int ch = b[i];
      if (ch == 'w' || ch == 'x' || ch == 'y') return false;
    }
    return true;
  }

  /* ends(s) is true <=> k0,...k ends with the string s; as a side effect it
     sets j to the character just before the suffix. */

  private final boolean ends(String s) {
    int l = s.length();
    int o = k - l + 1;
    if (o < k0)
      return false;
    for (int i = 0; i < l; i++)
      if (b[o + i] != s.charAt(i))
        return false;
    j = k - l;
    return true;
  }

  /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
     k. */

  void setto(String s) {
    int l = s.length();
    int o = j + 1;
    for (int i = 0; i < l; i++)
      b[o + i] = s.charAt(i);
    k = j + l;
    dirty = true;
  }

  /* r(s) is used further down. */

  void r(String s) { if (m() > 0) setto(s); }

  /* step1() gets rid of plurals and -ed or -ing. e.g.

         caresses  ->  caress
         ponies    ->  poni
         ties      ->  ti
         caress    ->  caress
         cats      ->  cat

         feed      ->  feed
         agreed    ->  agree
         disabled  ->  disable

         matting   ->  mat
         mating    ->  mate
         meeting   ->  meet
         milling   ->  mill
         messing   ->  mess

         meetings  ->  meet

  */

  private final void step1() {
    if (b[k] == 's') {
      if (ends("sses")) k -= 2;
      else if (ends("ies")) setto("i");
      else if (b[k - 1] != 's') k--;
    }
    if (ends("eed")) {
      if (m() > 0)
        k--;
    }
    else if ((ends("ed") || ends("ing")) && vowelinstem()) {
      k = j;
      if (ends("at")) setto("ate");
      else if (ends("bl")) setto("ble");
      else if (ends("iz")) setto("ize");
      else if (doublec(k)) {
        int ch = b[k--];
        if (ch == 'l' || ch == 's' || ch == 'z')
          k++;
      }
      else if (m() == 1 && cvc(k))
        setto("e");
    }
  }

  /* step2() turns terminal y to i when there is another vowel in the stem. */

  private final void step2() {
    if (ends("y") && vowelinstem()) {
      b[k] = 'i';
      dirty = true;
    }
  }

  /* step3() maps double suffices to single ones. so -ization ( = -ize plus
     -ation) maps to -ize etc. note that the string before the suffix must give
     m() > 0. */

  private final void step3() {
    if (k == k0) return; /* For Bug 1 */
    switch (b[k - 1]) {
    case 'a':
      if (ends("ational")) { r("ate"); break; }
      if (ends("tional")) { r("tion"); break; }
      break;
    case 'c':
      if (ends("enci")) { r("ence"); break; }
      if (ends("anci")) { r("ance"); break; }
      break;
    case 'e':
      if (ends("izer")) { r("ize"); break; }
      break;
    case 'l':
      if (ends("bli")) { r("ble"); break; }
      if (ends("alli")) { r("al"); break; }
      if (ends("entli")) { r("ent"); break; }
      if (ends("eli")) { r("e"); break; }
      if (ends("ousli")) { r("ous"); break; }
      break;
    case 'o':
      if (ends("ization")) { r("ize"); break; }
      if (ends("ation")) { r("ate"); break; }
      if (ends("ator")) { r("ate"); break; }
      break;
    case 's':
      if (ends("alism")) { r("al"); break; }
      if (ends("iveness")) { r("ive"); break; }
      if (ends("fulness")) { r("ful"); break; }
      if (ends("ousness")) { r("ous"); break; }
      break;
    case 't':
      if (ends("aliti")) { r("al"); break; }
      if (ends("iviti")) { r("ive"); break; }
      if (ends("biliti")) { r("ble"); break; }
      break;
    case 'g':
      if (ends("logi")) { r("log"); break; }
    }
  }

  /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */

  private final void step4() {
    switch (b[k]) {
    case 'e':
      if (ends("icate")) { r("ic"); break; }
      if (ends("ative")) { r(""); break; }
      if (ends("alize")) { r("al"); break; }
      break;
    case 'i':
      if (ends("iciti")) { r("ic"); break; }
      break;
    case 'l':
      if (ends("ical")) { r("ic"); break; }
      if (ends("ful")) { r(""); break; }
      break;
    case 's':
      if (ends("ness")) { r(""); break; }
      break;
    }
  }

  /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */

  private final void step5() {
    if (k == k0) return; /* for Bug 1 */
    switch (b[k - 1]) {
    case 'a':
      if (ends("al")) break;
      return;
    case 'c':
      if (ends("ance")) break;
      if (ends("ence")) break;
      return;
    case 'e':
      if (ends("er")) break; return;
    case 'i':
      if (ends("ic")) break; return;
    case 'l':
      if (ends("able")) break;
      if (ends("ible")) break; return;
    case 'n':
      if (ends("ant")) break;
      if (ends("ement")) break;
      if (ends("ment")) break;
      /* element etc. not stripped before the m */
      if (ends("ent")) break;
      return;
    case 'o':
      if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
      /* j >= 0 fixes Bug 2 */
      if (ends("ou")) break;
      return;
      /* takes care of -ous */
    case 's':
      if (ends("ism")) break;
      return;
    case 't':
      if (ends("ate")) break;
      if (ends("iti")) break;
      return;
    case 'u':
      if (ends("ous")) break;
      return;
    case 'v':
      if (ends("ive")) break;
      return;
    case 'z':
      if (ends("ize")) break;
      return;
    default:
      return;
    }
    if (m() > 1)
      k = j;
  }

  /* step6() removes a final -e if m() > 1. */

  private final void step6() {
    j = k;
    if (b[k] == 'e') {
      int a = m();
      if (a > 1 || a == 1 && !cvc(k - 1))
        k--;
    }
    if (b[k] == 'l' && doublec(k) && m() > 1)
      k--;
  }


  /**
   * Stem a word provided as a String.  Returns the result as a String.
   */
  public String stem(String s) {
    if (stem(s.toCharArray(), s.length()))
      return toString();
    else
      return s;
  }

  /**
   * Stem a word provided as a CharSequence.
   * Returns the result as a CharSequence.
   */
  public CharSequence stem(CharSequence word) {
    return stem(word.toString());
  }

  /** Stem a word contained in a char[].  Returns true if the stemming process
   * resulted in a word different from the input.  You can retrieve the
   * result with getResultLength()/getResultBuffer() or toString().
   */
  public boolean stem(char[] word) {
    return stem(word, word.length);
  }

  /** Stem a word contained in a portion of a char[] array.  Returns
   * true if the stemming process resulted in a word different from
   * the input.  You can retrieve the result with
   * getResultLength()/getResultBuffer() or toString().
   */
  public boolean stem(char[] wordBuffer, int offset, int wordLen) {
    reset();
    if (b.length < wordLen) {
      // FIX: was "new char[wordLen - offset]", which under-allocates whenever
      // offset > 0 and makes the arraycopy below throw
      // ArrayIndexOutOfBoundsException. wordLen characters are copied, so the
      // buffer must hold wordLen characters.
      b = new char[wordLen];
    }
    System.arraycopy(wordBuffer, offset, b, 0, wordLen);
    i = wordLen;
    return stem(0);
  }

  /** Stem a word contained in a leading portion of a char[] array.
   * Returns true if the stemming process resulted in a word different
   * from the input.  You can retrieve the result with
   * getResultLength()/getResultBuffer() or toString().
   */
  public boolean stem(char[] word, int wordLen) {
    return stem(word, 0, wordLen);
  }

  /** Stem the word placed into the Stemmer buffer through calls to add().
   * Returns true if the stemming process resulted in a word different
   * from the input.  You can retrieve the result with
   * getResultLength()/getResultBuffer() or toString().
   */
  public boolean stem() {
    return stem(0);
  }

  public boolean stem(int i0) {
    k = i - 1;
    k0 = i0;
    // Words of length <= 2 (relative to k0) are left untouched.
    if (k > k0 + 1) {
      step1(); step2(); step3(); step4(); step5(); step6();
    }
    // Also, a word is considered dirty if we lopped off letters.
    // Thanks to Ifigenia Vairelles for pointing this out.
    if (i != k + 1)
      dirty = true;
    i = k + 1;
    return dirty;
  }
}
// ===== Chapter02/StemmingLemaEx.java =====
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package chapter2;


import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.BreakIterator;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;

/**
 * Demonstrates Porter stemming and four sentence-boundary-detection
 * approaches: String.split, a hand-built regex, BreakIterator, and OpenNLP's
 * SentenceDetectorME (which needs en-sent.bin at the hard-coded path).
 *
 * @author ashish
 */
public class StemmingLemaEx {
    public static void main(String args[]){
        // --- Stemming demo -------------------------------------------------
        String words[] = {"bank", "banking", "banks", "banker", "banked",
            "bankart"};
        PorterStemmer ps = new PorterStemmer();
        for(String w : words){
            String stem = ps.stem(w);
            System.out.println("Word : " + w + " Stem : " + stem);
        }

        // --- Sentence detection: naive split on punctuation ----------------
        String paragraph = "When determining the end of sentences "
            + "we need to consider several factors. Sentences may end with "
            + "exclamation marks! Or possibly questions marks? Within "
            + "sentences we may find numbers like 3.14159, abbreviations "
            + "such as found in Mr. Smith, and possibly ellipses either "
            + "within a sentence …, or at the end of a sentence…";
        String simple = "[.?!]";
        String[] splitString = (paragraph.split(simple));
        for (String string : splitString) {
            System.out.println(string);
        }

        // --- Sentence detection: regex with look-ahead ---------------------
        System.out.println("-------------Using Pattern and Matcher-------------");
        Pattern sentencePattern = Pattern.compile(
            "# Match a sentence ending in punctuation or EOS.\n"
            + "[^.!?\\s]    # First char is non-punct, non-ws\n"
            + "[^.!?]*      # Greedily consume up to punctuation.\n"
            + "(?:          # Group for unrolling the loop.\n"
            + "  [.!?]      # (special) inner punctuation ok if\n"
            + "  (?!['\"]?\\s|$)  # not followed by ws or EOS.\n"
            + "  [^.!?]*    # Greedily consume up to punctuation.\n"
            + ")*           # Zero or more (special normal*)\n"
            + "[.!?]?       # Optional ending punctuation.\n"
            + "['\"]?       # Optional closing quote.\n"
            + "(?=\\s|$)",
            Pattern.MULTILINE | Pattern.COMMENTS);
        Matcher matcher = sentencePattern.matcher(paragraph);
        while (matcher.find()) {
            System.out.println(matcher.group());
        }

        // --- Sentence detection: java.text.BreakIterator --------------------
        System.out.println("-------------Using BreakIterator-------------");
        BreakIterator si = BreakIterator.getSentenceInstance();
        Locale cl = new Locale("en", "US");
        si.setText(paragraph);
        int boundary = si.first();
        while(boundary!=BreakIterator.DONE){
            int begin = boundary;
            System.out.println(boundary + " - ");
            boundary = si.next();
            int end = boundary;
            if(end == BreakIterator.DONE){
                break;
            }
            System.out.println(boundary + " [ " + paragraph.substring(begin,end) + " ] ");
        }

        // --- Sentence detection: OpenNLP maximum-entropy model --------------
        System.out.println("-------------Using SentenceDetectorME-------------");
        // try-with-resources closes the model stream (previously leaked).
        try(InputStream is = new FileInputStream(
                new File("/home/ashish/Downloads/" + "en-sent.bin"))){
            SentenceModel sm = new SentenceModel(is);
            SentenceDetectorME detector = new SentenceDetectorME(sm);
            String sentences [] = detector.sentDetect(paragraph);
            for(String s : sentences){
                System.out.println(s);
            }
        }
        catch(IOException e){
            System.out.println("Error Detected" + e);
            e.printStackTrace();
        }
    }

}

// ===== Chapter02/StopWords.java =====
package packt;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;

/**
 * Maintains a set of stop words (from a built-in default list or a file, one
 * word per line) and filters them out of token arrays.
 */
public class StopWords {

    // Built-in list used by the no-arg constructor.
    private String[] defaultStopWords = {"i", "a", "about", "an", "are", "as", "at",
        "be", "by", "com", "for", "from", "how", "in", "is", "it", "of", "on",
        "or", "that", "the", "this", "to", "was", "what", "when", "where",
        "who", "will", "with"};

    // FIX: previously a raw static HashSet, so every constructed instance
    // mutated one shared set; now a typed per-instance field.
    private final HashSet<String> stopWords = new HashSet<>();

    /** Creates a filter seeded with the default stop-word list. */
    public StopWords() {
        stopWords.addAll(Arrays.asList(defaultStopWords));
    }

    /**
     * Creates a filter whose stop words are read from the given file,
     * one word per line. I/O errors are reported and leave the set empty.
     */
    public StopWords(String fileName) {
        // try-with-resources closes the reader (previously leaked).
        try (BufferedReader bufferedreader =
                new BufferedReader(new FileReader(fileName))) {
            while (bufferedreader.ready()) {
                stopWords.add(bufferedreader.readLine());
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /** Adds a single stop word to the set. */
    public void addStopWord(String word) {
        stopWords.add(word);
    }

    /**
     * Returns a copy of {@code words} with all stop words removed,
     * preserving the order of the remaining tokens.
     */
    public String[] removeStopWords(String[] words) {
        ArrayList<String> tokens = new ArrayList<>(Arrays.asList(words));
        // FIX: the old index loop called tokens.remove(i) and then advanced i,
        // skipping the element after each removal (adjacent stop words
        // survived). removeIf removes every match.
        tokens.removeIf(stopWords::contains);
        return tokens.toArray(new String[tokens.size()]);
    }

    /** Prints each stop word bracketed, e.g. "[the] [a] ...". */
    public void displayStopWords() {
        for (String word : stopWords) {
            System.out.print("[" + word + "] ");
        }
    }
}
stopWords.iterator(); 53 | while(iterator.hasNext()) { 54 | System.out.print("[" + iterator.next() + "] "); 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /Chapter02/Test.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package chapter2; 7 | 8 | import com.aliasi.sentences.IndoEuropeanSentenceModel; 9 | import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; 10 | import com.aliasi.tokenizer.TokenizerFactory; 11 | import edu.stanford.nlp.ling.CoreLabel; 12 | import edu.stanford.nlp.ling.HasWord; 13 | import edu.stanford.nlp.pipeline.Annotation; 14 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 15 | import edu.stanford.nlp.process.CoreLabelTokenFactory; 16 | import edu.stanford.nlp.process.DocumentPreprocessor; 17 | import edu.stanford.nlp.process.DocumentProcessor; 18 | import edu.stanford.nlp.process.PTBTokenizer; 19 | import edu.stanford.nlp.process.WordTokenFactory; 20 | import java.io.BufferedOutputStream; 21 | import java.io.File; 22 | import java.io.FileInputStream; 23 | import java.io.IOException; 24 | import java.io.InputStream; 25 | import java.io.Reader; 26 | import java.io.StreamTokenizer; 27 | import java.io.StringReader; 28 | import java.io.UnsupportedEncodingException; 29 | import java.text.BreakIterator; 30 | import java.util.ArrayList; 31 | import java.util.Iterator; 32 | import java.util.List; 33 | import java.util.Properties; 34 | import java.util.Scanner; 35 | import java.util.StringTokenizer; 36 | import opennlp.tools.tokenize.SimpleTokenizer; 37 | import opennlp.tools.tokenize.TokenSample; 38 | import opennlp.tools.tokenize.TokenSampleStream; 39 | import opennlp.tools.tokenize.Tokenizer; 40 | //import opennlp.tools.tokenize.TokenizerFactory; 
41 | import opennlp.tools.tokenize.TokenizerME; 42 | import opennlp.tools.tokenize.TokenizerModel; 43 | import opennlp.tools.tokenize.WhitespaceTokenizer; 44 | import opennlp.tools.util.InputStreamFactory; 45 | import opennlp.tools.util.ObjectStream; 46 | import opennlp.tools.util.PlainTextByLineStream; 47 | 48 | /** 49 | * 50 | * @author ashish 51 | */ 52 | public class Test { 53 | private static String getResourcePath(){ 54 | File currDir = new File("."); 55 | String path = currDir .getAbsolutePath(); 56 | path = path.substring(0, path.length()-2); 57 | System.out.println(path); 58 | String resourcePath = path + File.separator + "src/chapter2/"; 59 | return resourcePath; 60 | } 61 | public static void main(String args[]){ 62 | Scanner s = new Scanner("Let's pause, and then reflect"); 63 | s.useDelimiter("[,.]"); 64 | List l = new ArrayList<>(); 65 | while(s.hasNext()){ 66 | String token = s.next(); 67 | l.add(token); 68 | } 69 | for(String token : l){ 70 | System.out.println(token); 71 | } 72 | String text = "Mr. 
Smith went to 123 Washington avenue"; 73 | String tokens[] = text.split("\\s+"); 74 | for(String token: tokens){ 75 | System.out.println(token); 76 | } 77 | BreakIterator b = BreakIterator.getWordInstance(); 78 | text = "Let's pause, and then reflect"; 79 | b.setText(text); 80 | int boundary = b.first(); 81 | while(boundary!=BreakIterator.DONE){ 82 | int begin = boundary; 83 | System.out.println(boundary); 84 | boundary = b.next(); 85 | int end = boundary; 86 | if(end==BreakIterator.DONE){ 87 | break; 88 | } 89 | System.out.println(boundary + "[" + text.substring(begin,end) + "]"); 90 | } 91 | 92 | try{ 93 | StreamTokenizer t = new StreamTokenizer( 94 | new StringReader("Let's pause, and then reflect.")); 95 | boolean isEOF = false; 96 | while(!isEOF){ 97 | int token = t.nextToken(); 98 | switch(token){ 99 | case StreamTokenizer.TT_EOF: 100 | isEOF = true; 101 | break; 102 | case StreamTokenizer.TT_EOL: 103 | break; 104 | case StreamTokenizer.TT_WORD: 105 | System.out.println(t.sval); 106 | break; 107 | case StreamTokenizer.TT_NUMBER: 108 | System.out.println(t.nval); 109 | break; 110 | default: 111 | System.out.println((char)token); 112 | } 113 | } 114 | 115 | } 116 | catch(IOException e){ 117 | e.printStackTrace(); 118 | } 119 | catch(Exception e){ 120 | e.printStackTrace(); 121 | } 122 | 123 | // Using OpenNLP 124 | 125 | String paragraph = "Let's pause, \nand then reflect."; 126 | SimpleTokenizer simpletokenizer = SimpleTokenizer.INSTANCE; 127 | String simpletokens[] = simpletokenizer.tokenize(paragraph); 128 | for(String token : simpletokens){ 129 | System.out.println(token); 130 | } 131 | 132 | tokens = WhitespaceTokenizer.INSTANCE.tokenize(paragraph); 133 | for (String token : tokens) { 134 | System.out.println(token); 135 | } 136 | 137 | try 138 | { 139 | InputStream modelis = new FileInputStream(new File(getResourcePath() + "en-token.bin")); 140 | TokenizerModel model = new TokenizerModel(modelis); 141 | Tokenizer tokenizer = new TokenizerME(model); 142 | 
tokens= tokenizer.tokenize(paragraph); 143 | for (String token : tokens){ 144 | System.out.println(token); 145 | } 146 | } 147 | catch(IOException e){ 148 | e.printStackTrace(); 149 | } 150 | 151 | 152 | 153 | PTBTokenizer ptb = new PTBTokenizer(new StringReader(paragraph), new WordTokenFactory(),null); 154 | while(ptb.hasNext()){ 155 | System.out.println(ptb.next()); 156 | } 157 | 158 | CoreLabelTokenFactory ctf = new CoreLabelTokenFactory(); 159 | ptb = new PTBTokenizer(new StringReader(paragraph), ctf, "invertible=true"); 160 | while(ptb.hasNext()){ 161 | CoreLabel cl = (CoreLabel)ptb.next(); 162 | System.out.println(cl.originalText() + "(" + cl.beginPosition() + "-" + cl.endPosition() + ")" ); 163 | } 164 | 165 | Reader reader = new StringReader(paragraph); 166 | DocumentPreprocessor dp = new DocumentPreprocessor(reader); 167 | Iterator> it = dp.iterator(); 168 | while(it.hasNext()){ 169 | List sentence = it.next(); 170 | for(HasWord token : sentence){ 171 | System.out.println(token); 172 | } 173 | } 174 | Properties prop = new Properties(); 175 | prop.put("annonators", "tokenize, ssplit"); 176 | StanfordCoreNLP pipeline = new StanfordCoreNLP(prop); 177 | // Annotation ann = new Annotation(paragraph); 178 | Annotation ann = new Annotation(paragraph); 179 | pipeline.annotate(ann); 180 | pipeline.prettyPrint(ann, System.out); 181 | 182 | // LingPipe Tokenizers 183 | char texts[] = paragraph.toCharArray(); 184 | 185 | TokenizerFactory tfac = IndoEuropeanTokenizerFactory.INSTANCE; 186 | com.aliasi.tokenizer.Tokenizer tokens1 = tfac.tokenizer(texts, 0, texts.length); 187 | for(String t : tokens1){ 188 | System.out.println(t); 189 | } 190 | 191 | BufferedOutputStream bos = null; 192 | try{ 193 | ObjectStream linestream = new PlainTextByLineStream((InputStreamFactory) new FileInputStream("training.train"),"UTF-8"); 194 | ObjectStream samplestream = new TokenSampleStream(linestream); 195 | // TokenizerModel model = TokenizerME.train(samplestream, factory, mlParams) 196 
| 197 | } 198 | catch(UnsupportedEncodingException e){ 199 | e.printStackTrace(); 200 | } 201 | catch(IOException e){ 202 | e.printStackTrace(); 203 | } 204 | 205 | 206 | } 207 | 208 | } 209 | -------------------------------------------------------------------------------- /Chapter03/SBDDemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package chapter3; 7 | 8 | import java.util.regex.Matcher; 9 | import java.util.regex.Pattern; 10 | 11 | /** 12 | * 13 | * @author ashish 14 | */ 15 | public class SBDDemo { 16 | private static String paragraph = "When determining the end of sentences " 17 | + "we need to consider several factors. Sentences may end with " 18 | + "exclamation marks! Or possibly questions marks? Within " 19 | + "sentences we may find numbers like 3.14159, abbreviations " 20 | + "such as found in Mr. Smith, and possibly ellipses either " 21 | + "within a sentence …, or at the end of a sentence…"; 22 | 23 | public static void main(String args[]){ 24 | System.out.println("--------- Simple regex ---------"); 25 | String simple = "[.?!]"; 26 | String[] splitString = (paragraph.split(simple)); 27 | for (String string : splitString) { 28 | System.out.println(string); 29 | } 30 | System.out.println(">>>> Using Pattern and Matcher --------"); 31 | Pattern sentencePattern = Pattern.compile( 32 | "# Match a sentence ending in punctuation or EOS.\n" 33 | + "[^.!?\\s] # First char is non-punct, non-ws\n" 34 | + "[^.!?]* # Greedily consume up to punctuation.\n" 35 | + "(?: # Group for unrolling the loop.\n" 36 | + " [.!?] 
# (special) inner punctuation ok if\n" 37 | + " (?!['\"]?\\s|$) # not followed by ws or EOS.\n" 38 | + " [^.!?]* # Greedily consume up to punctuation.\n" 39 | + ")* # Zero or more (special normal*)\n" 40 | + "[.!?]? # Optional ending punctuation.\n" 41 | + "['\"]? # Optional closing quote.\n" 42 | + "(?=\\s|$)", 43 | Pattern.MULTILINE | Pattern.COMMENTS); 44 | Matcher matcher = sentencePattern.matcher(paragraph); 45 | while (matcher.find()) { 46 | System.out.println(matcher.group()); 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /Chapter03/XMLProcessingDemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package chapter3; 7 | 8 | import edu.stanford.nlp.process.DocumentPreprocessor; 9 | import edu.stanford.nlp.process.DocumentProcessor; 10 | import java.io.File; 11 | import java.io.FileNotFoundException; 12 | import java.io.FileReader; 13 | import java.io.Reader; 14 | import java.util.List; 15 | import java.util.ListIterator; 16 | import java.util.logging.Level; 17 | import java.util.logging.Logger; 18 | 19 | /** 20 | * 21 | * @author ashish 22 | */ 23 | public class XMLProcessingDemo { 24 | private static String getResourcePath(){ 25 | File currDir = new File("."); 26 | String path = currDir .getAbsolutePath(); 27 | path = path.substring(0, path.length()-2); 28 | System.out.println(path); 29 | String resourcePath = path + File.separator + "src/chapter3/XMLTest.xml"; 30 | return resourcePath; 31 | } 32 | 33 | public static void main(String args[]){ 34 | try { 35 | Reader reader = new FileReader(getResourcePath()); 36 | DocumentPreprocessor dp = new DocumentPreprocessor(reader, DocumentPreprocessor.DocType.XML); 37 | dp.setElementDelimiter("sentence"); 38 | for(List 
sentence : dp){ 39 | ListIterator list = sentence.listIterator(); 40 | while (list.hasNext()) { 41 | System.out.print(list.next() + " "); 42 | } 43 | System.out.println(); 44 | 45 | } 46 | } catch (FileNotFoundException ex) { 47 | Logger.getLogger(XMLProcessingDemo.class.getName()).log(Level.SEVERE, null, ex); 48 | } 49 | 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /Chapter03/XMLTest.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | When 7 | the 8 | day 9 | is 10 | done 11 | we 12 | can 13 | sleep 14 | . 15 | 16 | 17 | When 18 | the 19 | morning 20 | comes 21 | we 22 | can 23 | wake 24 | . 25 | 26 | 27 | After 28 | that 29 | who -------------------------------------------------------------------------------- /Chapter04/NERDemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 
5 | */ 6 | package chapter4; 7 | 8 | 9 | import java.io.File; 10 | import java.io.FileInputStream; 11 | import java.io.InputStream; 12 | import opennlp.tools.namefind.NameFinderME; 13 | import opennlp.tools.namefind.TokenNameFinder; 14 | import opennlp.tools.namefind.TokenNameFinderModel; 15 | import opennlp.tools.tokenize.Tokenizer; 16 | import opennlp.tools.tokenize.TokenizerME; 17 | import opennlp.tools.tokenize.TokenizerModel; 18 | import opennlp.tools.util.Span; 19 | 20 | /** 21 | * 22 | * @author ashish 23 | */ 24 | public class NERDemo { 25 | private static String getResourcePath(){ 26 | File currDir = new File("."); 27 | String path = currDir .getAbsolutePath(); 28 | path = path.substring(0, path.length()-2); 29 | System.out.println(path); 30 | String resourcePath = path + File.separator + "src/chapter4/"; 31 | return resourcePath; 32 | } 33 | public static void main(String args[]){ 34 | String sentences[] = {"Joe was the last person to see Fred. ", 35 | "He saw him in Boston at McKenzie's pub at 3:00 where he " 36 | + " paid $2.45 for an ale. 
", 37 | "Joe wanted to go to Vermont for the day to visit a cousin who " 38 | + "works at IBM, but Sally and he had to look for Fred"}; 39 | String sentence = "He was the last person to see Fred."; 40 | try 41 | { 42 | InputStream tokenStream = new FileInputStream(new File(getResourcePath()+ "en-token.bin")); 43 | InputStream modelStream = new FileInputStream(new File(getResourcePath() + "en-ner-person.bin")); 44 | TokenizerModel tokenModel = new TokenizerModel(tokenStream); 45 | Tokenizer tokenizer = new TokenizerME(tokenModel); 46 | TokenNameFinderModel entityModel = new TokenNameFinderModel(modelStream); 47 | NameFinderME nameFinder = new NameFinderME(entityModel); 48 | String tokens1[] = tokenizer.tokenize(sentence); 49 | Span nameSpans1[] = nameFinder.find(tokens1); 50 | for (int i = 0; i < nameSpans1.length; i++) { 51 | System.out.println("Span: " + nameSpans1[i].toString()); 52 | System.out.println("Entity: " 53 | + tokens1[nameSpans1[i].getStart()]); 54 | } 55 | 56 | System.out.println("---------- Multiple Sentences -----------"); 57 | for (String sentence1 : sentences) { 58 | String tokens[] = tokenizer.tokenize(sentence1); 59 | Span nameSpans[] = nameFinder.find(tokens); 60 | for (int i = 0; i < nameSpans.length; i++) { 61 | System.out.println("Span: " + nameSpans[i].toString()); 62 | System.out.println("Entity: " 63 | + tokens[nameSpans[i].getStart()]); 64 | } 65 | System.out.println(); 66 | } 67 | 68 | } 69 | catch(Exception e){ 70 | System.out.println(e); 71 | } 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /Chapter04/build.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Builds, tests, and runs the project Chapter 4. 
12 | 13 | 73 | 74 | -------------------------------------------------------------------------------- /Chapter04/build/classes/.netbeans_automatic_build: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/.netbeans_automatic_build -------------------------------------------------------------------------------- /Chapter04/build/classes/.netbeans_update_resources: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/.netbeans_update_resources -------------------------------------------------------------------------------- /Chapter04/build/classes/packt/Chapter4.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/packt/Chapter4.class -------------------------------------------------------------------------------- /Chapter04/build/classes/packt/DictionaryChunker.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/packt/DictionaryChunker.class -------------------------------------------------------------------------------- /Chapter04/build/classes/packt/EmailRegexChunker.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/packt/EmailRegexChunker.class -------------------------------------------------------------------------------- /Chapter04/build/classes/packt/RunChunker.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/packt/RunChunker.class -------------------------------------------------------------------------------- /Chapter04/build/classes/packt/TimeRegexChunker.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/packt/TimeRegexChunker.class -------------------------------------------------------------------------------- /Chapter04/build/classes/packt/TrainEntities.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/packt/TrainEntities.class -------------------------------------------------------------------------------- /Chapter04/en-ner-all.train: -------------------------------------------------------------------------------- 1 | Joe was the last person to see Fred. He saw him in Boston at McKenzie's pub at 3:00 where he paid $2.45 for an ale. Joe wanted to go to Vermont for the day to visit a cousin who works at IBM, but Sally and he had to look for Fred. 
2 | -------------------------------------------------------------------------------- /Chapter04/en-ner-person.eval: -------------------------------------------------------------------------------- 1 | Bill went to the farm to see Sally . 2 | Unable to find Sally he went to town. 3 | There he saw Fred who had seen Sally at the book store with Mary . 4 | -------------------------------------------------------------------------------- /Chapter04/en-ner-person.train: -------------------------------------------------------------------------------- 1 | Joe was the last person to see Fred . 2 | He saw him in Boston at McKenzie's pub at 3:00 where he paid $2.45 for an ale. 3 | Joe wanted to go to Vermont for the day to visit a cousin who works at IBM, but Sally and he had to look for Fred . 4 | -------------------------------------------------------------------------------- /Chapter04/manifest.mf: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | X-COMMENT: Main-Class will be added automatically by build 3 | 4 | -------------------------------------------------------------------------------- /Chapter04/modelFile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/modelFile -------------------------------------------------------------------------------- /Chapter04/nbproject/genfiles.properties: -------------------------------------------------------------------------------- 1 | build.xml.data.CRC32=be185f10 2 | build.xml.script.CRC32=2181029d 3 | build.xml.stylesheet.CRC32=8064a381@1.75.1.48 4 | # This file is used by a NetBeans-based IDE to track changes in generated files such as build-impl.xml. 5 | # Do not edit this file. You may delete it but then the IDE will never regenerate such files for you. 
6 | nbproject/build-impl.xml.data.CRC32=be185f10 7 | nbproject/build-impl.xml.script.CRC32=b30cc88e 8 | nbproject/build-impl.xml.stylesheet.CRC32=876e7a8f@1.75.1.48 9 | -------------------------------------------------------------------------------- /Chapter04/nbproject/private/config.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/nbproject/private/config.properties -------------------------------------------------------------------------------- /Chapter04/nbproject/private/private.properties: -------------------------------------------------------------------------------- 1 | compile.on.save=true 2 | do.depend=false 3 | do.jar=true 4 | javac.debug=true 5 | javadoc.preview=true 6 | user.properties.file=C:\\Users\\Richard\\AppData\\Roaming\\NetBeans\\8.0.2\\build.properties 7 | -------------------------------------------------------------------------------- /Chapter04/nbproject/private/private.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | file:/C:/Current%20Books/NLP%20and%20Java/Chapter%204/Chapter%204/src/packt/Chapter4.java 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Chapter04/nbproject/project.properties: -------------------------------------------------------------------------------- 1 | annotation.processing.enabled=true 2 | annotation.processing.enabled.in.editor=false 3 | annotation.processing.processors.list= 4 | annotation.processing.run.all.processors=true 5 | annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output 6 | application.title=Chapter 4 7 | application.vendor=Richard 8 | build.classes.dir=${build.dir}/classes 9 | build.classes.excludes=**/*.java,**/*.form 10 | # This directory is removed when the project 
is cleaned: 11 | build.dir=build 12 | build.generated.dir=${build.dir}/generated 13 | build.generated.sources.dir=${build.dir}/generated-sources 14 | # Only compile against the classpath explicitly listed here: 15 | build.sysclasspath=ignore 16 | build.test.classes.dir=${build.dir}/test/classes 17 | build.test.results.dir=${build.dir}/test/results 18 | # Uncomment to specify the preferred debugger connection transport: 19 | #debug.transport=dt_socket 20 | debug.classpath=\ 21 | ${run.classpath} 22 | debug.test.classpath=\ 23 | ${run.test.classpath} 24 | # Files in build.classes.dir which should be excluded from distribution jar 25 | dist.archive.excludes= 26 | # This directory is removed when the project is cleaned: 27 | dist.dir=dist 28 | dist.jar=${dist.dir}/Chapter_4.jar 29 | dist.javadoc.dir=${dist.dir}/javadoc 30 | endorsed.classpath= 31 | excludes= 32 | file.reference.ejml-0.23.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\ejml-0.23.jar 33 | file.reference.javax.json-api-1.0-sources.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\javax.json-api-1.0-sources.jar 34 | file.reference.javax.json.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\javax.json.jar 35 | file.reference.joda-time-2.1-sources.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\joda-time-2.1-sources.jar 36 | file.reference.joda-time.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\joda-time.jar 37 | file.reference.jollyday-0.4.7-sources.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\jollyday-0.4.7-sources.jar 38 | file.reference.jollyday.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\jollyday.jar 39 | 
file.reference.jwnl-1.3.3.jar=C:\\Downloads\\OpenNLP\\apache-opennlp-1.5.3\\lib\\jwnl-1.3.3.jar 40 | file.reference.lingpipe-4.1.0.jar=C:\\Current Books\\NLP and Java\\Downloads\\lingpipe-4.1.0\\lingpipe-4.1.0.jar 41 | file.reference.opennlp-maxent-3.0.3.jar=C:\\Downloads\\OpenNLP\\apache-opennlp-1.5.3\\lib\\opennlp-maxent-3.0.3.jar 42 | file.reference.opennlp-maxent-3.0.3.jar-1=C:\\Current Books\\NLP and Java\\Downloads\\apache-opennlp-1.5.3\\lib\\opennlp-maxent-3.0.3.jar 43 | file.reference.opennlp-tools-1.5.3.jar=C:\\Downloads\\OpenNLP\\apache-opennlp-1.5.3\\lib\\opennlp-tools-1.5.3.jar 44 | file.reference.opennlp-tools-1.5.3.jar-1=C:\\Current Books\\NLP and Java\\Downloads\\apache-opennlp-1.5.3\\lib\\opennlp-tools-1.5.3.jar 45 | file.reference.opennlp-uima-1.5.3.jar=C:\\Downloads\\OpenNLP\\apache-opennlp-1.5.3\\lib\\opennlp-uima-1.5.3.jar 46 | file.reference.stanford-corenlp-3.4.1-javadoc.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\stanford-corenlp-3.4.1-javadoc.jar 47 | file.reference.stanford-corenlp-3.4.1-models.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\stanford-corenlp-3.4.1-models.jar 48 | file.reference.stanford-corenlp-3.4.1-sources.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\stanford-corenlp-3.4.1-sources.jar 49 | file.reference.stanford-corenlp-3.4.1.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\stanford-corenlp-3.4.1.jar 50 | file.reference.stanford-ner-3.5.0-javadoc.jar=C:\\Current Books\\NLP and Java\\Downloads\\stanford-ner-2014-10-26\\stanford-ner-3.5.0-javadoc.jar 51 | file.reference.stanford-ner-3.5.0.jar=C:\\Current Books\\NLP and Java\\Downloads\\stanford-ner-2014-10-26\\stanford-ner-3.5.0.jar 52 | file.reference.stanford-ner.jar=C:\\Current Books\\NLP and 
Java\\Downloads\\stanford-ner-2014-10-26\\stanford-ner.jar 53 | file.reference.xom-1.2.10-src.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\xom-1.2.10-src.jar 54 | file.reference.xom.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\xom.jar 55 | includes=** 56 | jar.compress=false 57 | javac.classpath=\ 58 | ${file.reference.jwnl-1.3.3.jar}:\ 59 | ${file.reference.opennlp-maxent-3.0.3.jar}:\ 60 | ${file.reference.opennlp-tools-1.5.3.jar}:\ 61 | ${file.reference.opennlp-uima-1.5.3.jar}:\ 62 | ${file.reference.ejml-0.23.jar}:\ 63 | ${file.reference.javax.json-api-1.0-sources.jar}:\ 64 | ${file.reference.javax.json.jar}:\ 65 | ${file.reference.joda-time-2.1-sources.jar}:\ 66 | ${file.reference.joda-time.jar}:\ 67 | ${file.reference.jollyday-0.4.7-sources.jar}:\ 68 | ${file.reference.jollyday.jar}:\ 69 | ${file.reference.stanford-corenlp-3.4.1-javadoc.jar}:\ 70 | ${file.reference.stanford-corenlp-3.4.1-models.jar}:\ 71 | ${file.reference.stanford-corenlp-3.4.1-sources.jar}:\ 72 | ${file.reference.stanford-corenlp-3.4.1.jar}:\ 73 | ${file.reference.xom-1.2.10-src.jar}:\ 74 | ${file.reference.xom.jar}:\ 75 | ${file.reference.lingpipe-4.1.0.jar}:\ 76 | ${file.reference.stanford-ner-3.5.0-javadoc.jar}:\ 77 | ${file.reference.stanford-ner-3.5.0.jar}:\ 78 | ${file.reference.stanford-ner.jar}:\ 79 | ${file.reference.opennlp-maxent-3.0.3.jar-1}:\ 80 | ${file.reference.opennlp-tools-1.5.3.jar-1} 81 | # Space-separated list of extra javac options 82 | javac.compilerargs= 83 | javac.deprecation=false 84 | javac.processorpath=\ 85 | ${javac.classpath} 86 | javac.source=1.8 87 | javac.target=1.8 88 | javac.test.classpath=\ 89 | ${javac.classpath}:\ 90 | ${build.classes.dir} 91 | javac.test.processorpath=\ 92 | ${javac.test.classpath} 93 | javadoc.additionalparam= 94 | javadoc.author=false 95 | javadoc.encoding=${source.encoding} 96 | javadoc.noindex=false 97 | 
javadoc.nonavbar=false 98 | javadoc.notree=false 99 | javadoc.private=false 100 | javadoc.splitindex=true 101 | javadoc.use=true 102 | javadoc.version=false 103 | javadoc.windowtitle= 104 | main.class=packt.Chapter4 105 | manifest.file=manifest.mf 106 | meta.inf.dir=${src.dir}/META-INF 107 | mkdist.disabled=false 108 | platform.active=default_platform 109 | run.classpath=\ 110 | ${javac.classpath}:\ 111 | ${build.classes.dir} 112 | # Space-separated list of JVM arguments used when running the project. 113 | # You may also define separate properties like run-sys-prop.name=value instead of -Dname=value. 114 | # To set system properties for unit tests define test-sys-prop.name=value: 115 | run.jvmargs= 116 | run.test.classpath=\ 117 | ${javac.test.classpath}:\ 118 | ${build.test.classes.dir} 119 | source.encoding=UTF-8 120 | src.dir=src 121 | test.src.dir=test 122 | -------------------------------------------------------------------------------- /Chapter04/nbproject/project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | org.netbeans.modules.java.j2seproject 4 | 5 | 6 | Chapter 4 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Chapter04/old/Chapter4.java: -------------------------------------------------------------------------------- 1 | package packt; 2 | 3 | import com.aliasi.chunk.Chunk; 4 | import com.aliasi.chunk.Chunker; 5 | import com.aliasi.chunk.Chunking; 6 | import com.aliasi.dict.DictionaryEntry; 7 | import com.aliasi.dict.ExactDictionaryChunker; 8 | import com.aliasi.dict.MapDictionary; 9 | import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; 10 | import com.aliasi.util.AbstractExternalizable; 11 | import edu.stanford.nlp.ie.crf.CRFClassifier; 12 | import edu.stanford.nlp.ling.CoreAnnotations; 13 | import edu.stanford.nlp.ling.CoreLabel; 14 | import java.io.BufferedOutputStream; 15 | import java.io.File; 16 | import 
java.io.FileInputStream; 17 | import java.io.FileOutputStream; 18 | import java.io.IOException; 19 | import java.io.InputStream; 20 | import java.io.OutputStream; 21 | import java.util.ArrayList; 22 | import java.util.List; 23 | import java.util.Set; 24 | import java.util.regex.Matcher; 25 | import java.util.regex.Pattern; 26 | import opennlp.tools.namefind.NameFinderME; 27 | import opennlp.tools.namefind.NameSample; 28 | import opennlp.tools.namefind.NameSampleDataStream; 29 | import opennlp.tools.namefind.TokenNameFinderEvaluator; 30 | import opennlp.tools.namefind.TokenNameFinderModel; 31 | import opennlp.tools.tokenize.Tokenizer; 32 | import opennlp.tools.tokenize.TokenizerME; 33 | import opennlp.tools.tokenize.TokenizerModel; 34 | import opennlp.tools.util.ObjectStream; 35 | import opennlp.tools.util.PlainTextByLineStream; 36 | import opennlp.tools.util.Span; 37 | import opennlp.tools.util.eval.FMeasure; 38 | 39 | public class Chapter4 { 40 | 41 | private static final String sentences[] = {"Joe was the last person to see Fred. ", 42 | "He saw him in Boston at McKenzie's pub at 3:00 where he paid " 43 | + "$2.45 for an ale. ", 44 | "Joe wanted to go to Vermont for the day to visit a cousin who " 45 | + "works at IBM, but Sally and he had to look for Fred"}; 46 | 47 | private static String regularExpressionText 48 | = "He left his email address (rgb@colorworks.com) and his " 49 | + "phone number,800-555-1234. We believe his current address " 50 | + "is 100 Washington Place, Seattle, CO 12345-1234. I " 51 | + "understand you can also call at 123-555-1234 between " 52 | + "8:00 AM and 4:30 most days. 
His URL is http://example.com " 53 | + "and he was born on February 25, 1954 or 2/25/1954."; 54 | 55 | private static MapDictionary dictionary; 56 | 57 | public static void main(String[] args) { 58 | usingRegularExpressions(); 59 | // usingOpenNLP(); 60 | // usingStanfordNER(); 61 | // usingLingPipeNER(); 62 | // trainingOpenNLPNERModel(); 63 | } 64 | 65 | public static File getModelDir() { 66 | return new File("C:\\Current Books in Progress\\NLP and Java\\Models"); 67 | } 68 | 69 | private static void usingRegularExpressions() { 70 | usingJavaRegularExpressions(); 71 | // usingLingPipeRegExChunker(); 72 | // usingLingPipeRegularExpressions(); 73 | } 74 | 75 | private static void usingJavaRegularExpressions() { 76 | String phoneNumberRE = "\\d{3}-\\d{3}-\\d{4}"; 77 | String urlRegex = "\\b(https?|ftp|file|ldap)://" 78 | + "[-A-Za-z0-9+&@#/%?=~_|!:,.;]" 79 | + "*[-A-Za-z0-9+&@#/%=~_|]"; 80 | String zipCodeRegEx = "[0-9]{5}(\\-?[0-9]{4})?"; 81 | String emailRegEx = "[a-zA-Z0-9'._%+-]+@" 82 | + "(?:[a-zA-Z0-9-]+\\.)" 83 | + "+[a-zA-Z]{2,4}"; 84 | String timeRE = "([01]?[0-9]|2[0-3]):[0-5][0-9]"; 85 | String dateRE = "(0?[1-9]|[12][0-9]|3[01])/(0?[1-9]|1[012])/((19|20)\\\\d\\\\d)"; 86 | dateRE = "((0?[13578]|10|12)(-|\\/)(([1-9])|(0[1-9])|([12])([0-9]?)|(3[01]?))(-|\\/)((19)([2-9])(\\d{1})|(20)([01])(\\d{1})|([8901])(\\d{1}))|(0?[2469]|11)(-|\\/)(([1-9])|(0[1-9])|([12])([0-9]?)|(3[0]?))(-|\\/)((19)([2-9])(\\d{1})|(20)([01])(\\d{1})|([8901])(\\d{1})))"; 87 | Pattern pattern = Pattern.compile(phoneNumberRE + "|" + timeRE + "|" + emailRegEx); 88 | // regularExpressionText = "(888)555-1234 888-SEL-HIGH 888-555-1234-J88-W3S"; 89 | Matcher matcher = pattern.matcher(regularExpressionText); 90 | System.out.println("---Searching ..."); 91 | while (matcher.find()) { 92 | System.out.println(matcher.group() + " [" + matcher.start() 93 | + ":" + matcher.end() + "]"); 94 | } 95 | System.out.println("---Done Searching ..."); 96 | 97 | } 98 | 99 | private static void 
usingLingPipeRegExChunker() { 100 | String timeRE = "(([0-1]?[0-9])|([2][0-3])):([0-5]?[0-9])(:([0-5]?[0-9]))?"; 101 | Chunker chunker = new TimeRegexChunker(); 102 | // chunker = new RegExChunker(timeRE,"time",1.0); 103 | Chunking chunking = chunker.chunk(regularExpressionText); 104 | Set chunkSet = chunking.chunkSet(); 105 | displayChunkSet(chunker, regularExpressionText); 106 | } 107 | 108 | private static void usingLingPipeRegularExpressions() { 109 | try { 110 | File modelFile = new File(getModelDir(), 111 | "ne-en-news-muc6.AbstractCharLmRescoringChunker"); 112 | Chunker chunker = (Chunker) AbstractExternalizable.readObject(modelFile); 113 | for (int i = 0; i < sentences.length; ++i) { 114 | Chunking chunking = chunker.chunk(sentences[i]); 115 | System.out.println("Chunking=" + chunking); 116 | } 117 | for (String sentence : sentences) { 118 | displayChunkSet(chunker, sentence); 119 | } 120 | 121 | } catch (IOException | ClassNotFoundException ex) { 122 | // Handle exception 123 | } 124 | 125 | } 126 | 127 | // ------ OpenNLP----------------------------------- 128 | private static void usingOpenNLP() { 129 | System.out.println("OpenNLP Examples"); 130 | usingOpenNLPNameFinderME(); 131 | // usingMultipleNERModels(); 132 | } 133 | 134 | private static void usingOpenNLPNameFinderME() { 135 | System.out.println("OpenNLP NameFinderME Examples"); 136 | try (InputStream tokenStream = new FileInputStream( 137 | new File(getModelDir(), "en-token.bin")); 138 | InputStream modelStream = new FileInputStream( 139 | new File(getModelDir(), "en-ner-person.bin"));) { 140 | 141 | TokenizerModel tokenModel = new TokenizerModel(tokenStream); 142 | Tokenizer tokenizer = new TokenizerME(tokenModel); 143 | 144 | TokenNameFinderModel entityModel 145 | = new TokenNameFinderModel(modelStream); 146 | NameFinderME nameFinder = new NameFinderME(entityModel); 147 | 148 | // Single sentence 149 | { 150 | System.out.println("Single sentence"); 151 | StringBuilder builder = new 
StringBuilder(); 152 | String sentence = "He was the last person to see Fred."; 153 | 154 | String tokens[] = tokenizer.tokenize(sentence); 155 | Span nameSpans[] = nameFinder.find(tokens); 156 | 157 | for (int i = 0; i < nameSpans.length; i++) { 158 | System.out.println("Span: " + nameSpans[i].toString()); 159 | System.out.println("Entity: " 160 | + tokens[nameSpans[i].getStart()]); 161 | } 162 | } 163 | System.out.println(); 164 | for (String sentence : sentences) { 165 | String tokens[] = tokenizer.tokenize(sentence); 166 | Span nameSpans[] = nameFinder.find(tokens); 167 | double[] spanProbs = nameFinder.probs(nameSpans); 168 | 169 | for (int i = 0; i < nameSpans.length; i++) { 170 | System.out.println("Span: " + nameSpans[i].toString()); 171 | System.out.println("Entity: " 172 | + tokens[nameSpans[i].getStart()]); 173 | System.out.println("Probability: " + spanProbs[i]); 174 | } 175 | System.out.println(); 176 | } 177 | } catch (Exception ex) { 178 | ex.printStackTrace(); 179 | } 180 | } 181 | 182 | private static void usingMultipleNERModels() { 183 | // Models - en-ner-person.bin en-ner-location.bin en-ner-money.bin 184 | // en-ner-organization.bin en-ner-time.bin 185 | try { 186 | InputStream tokenStream = new FileInputStream( 187 | new File(getModelDir(), "en-token.bin")); 188 | 189 | TokenizerModel tokenModel = new TokenizerModel(tokenStream); 190 | Tokenizer tokenizer = new TokenizerME(tokenModel); 191 | 192 | String modelNames[] = {"en-ner-person.bin", "en-ner-location.bin", 193 | "en-ner-organization.bin"}; 194 | ArrayList list = new ArrayList(); 195 | for (String name : modelNames) { 196 | TokenNameFinderModel entityModel = new TokenNameFinderModel( 197 | new FileInputStream( 198 | new File(getModelDir(), name))); 199 | NameFinderME nameFinder = new NameFinderME(entityModel); 200 | for (int index = 0; index < sentences.length; index++) { 201 | String tokens[] = tokenizer.tokenize(sentences[index]); 202 | Span nameSpans[] = nameFinder.find(tokens); 203 | 
for (Span span : nameSpans) { 204 | list.add("Sentence: " + index 205 | + " Span: " + span.toString() + " Entity: " 206 | + tokens[span.getStart()]); 207 | } 208 | } 209 | } 210 | System.out.println("Multiple Entities"); 211 | for (String element : list) { 212 | System.out.println(element); 213 | } 214 | } catch (Exception ex) { 215 | ex.printStackTrace(); 216 | } 217 | } 218 | 219 | private static void usingStanfordNER() { 220 | String model = getModelDir() + "\\english.conll.4class.distsim.crf.ser.gz"; 221 | CRFClassifier classifier = CRFClassifier.getClassifierNoExceptions(model); 222 | 223 | String sentence = ""; 224 | for (String element : sentences) { 225 | sentence += element; 226 | } 227 | 228 | List> entityList = classifier.classify(sentence); 229 | 230 | for (List internalList : entityList) { 231 | for (CoreLabel coreLabel : internalList) { 232 | String word = coreLabel.word(); 233 | String category = coreLabel.get(CoreAnnotations.AnswerAnnotation.class); 234 | // System.out.println(word + ":" + category); 235 | if (!"O".equals(category)) { 236 | System.out.println(word + ":" + category); 237 | } 238 | 239 | } 240 | 241 | } 242 | } 243 | 244 | private static void usingLingPipeNER() { 245 | // usingLingPipeRexExChunker(); 246 | usingExactDictionaryChunker(); 247 | 248 | } 249 | 250 | private static void usingLingPipeRexExChunker() { 251 | try { 252 | File modelFile = new File(getModelDir(), 253 | "ne-en-news-muc6.AbstractCharLmRescoringChunker"); 254 | Chunker chunker 255 | = (Chunker) AbstractExternalizable.readObject(modelFile); 256 | 257 | for (String sentence : sentences) { 258 | displayChunkSet(chunker, sentence); 259 | } 260 | } catch (IOException | ClassNotFoundException ex) { 261 | ex.printStackTrace(); 262 | } 263 | } 264 | 265 | private static void displayChunkSet(Chunker chunker, String text) { 266 | Chunking chunking = chunker.chunk(text); 267 | Set set = chunking.chunkSet(); 268 | for (Chunk chunk : set) { 269 | System.out.println("Type: " + 
chunk.type() + " Entity: [" 270 | + text.substring(chunk.start(), chunk.end()) 271 | + "] Score: " + chunk.score()); 272 | } 273 | } 274 | 275 | private static void initializeDictionary() { 276 | dictionary = new MapDictionary(); 277 | dictionary.addEntry( 278 | new DictionaryEntry("Joe", "PERSON", 1.0)); 279 | dictionary.addEntry( 280 | new DictionaryEntry("Fred", "PERSON", 1.0)); 281 | dictionary.addEntry( 282 | new DictionaryEntry("Boston", "PLACE", 1.0)); 283 | dictionary.addEntry( 284 | new DictionaryEntry("pub", "PLACE", 1.0)); 285 | dictionary.addEntry( 286 | new DictionaryEntry("Vermont", "PLACE", 1.0)); 287 | dictionary.addEntry( 288 | new DictionaryEntry("IBM", "ORGANIZATION", 1.0)); 289 | dictionary.addEntry( 290 | new DictionaryEntry("Sally", "PERSON", 1.0)); 291 | } 292 | 293 | private static void usingExactDictionaryChunker() { 294 | initializeDictionary(); 295 | System.out.println("\nDICTIONARY\n" + dictionary); 296 | 297 | ExactDictionaryChunker dictionaryChunker 298 | = new ExactDictionaryChunker(dictionary, 299 | IndoEuropeanTokenizerFactory.INSTANCE, true, false); 300 | 301 | for (String sentence : sentences) { 302 | System.out.println("\nTEXT=" + sentence); 303 | displayChunkSet(dictionaryChunker, sentence); 304 | } 305 | } 306 | 307 | // Training Models 308 | private static void trainingOpenNLPNERModel() { 309 | try (OutputStream modelOutputStream = new BufferedOutputStream( 310 | new FileOutputStream(new File("modelFile")));) { 311 | ObjectStream lineStream = new PlainTextByLineStream( 312 | new FileInputStream("en-ner-person.train"), "UTF-8"); 313 | ObjectStream sampleStream = new NameSampleDataStream(lineStream); 314 | 315 | TokenNameFinderModel model = NameFinderME.train("en", "person", sampleStream, 316 | null, 100, 5); 317 | 318 | model.serialize(modelOutputStream); 319 | } catch (IOException ex) { 320 | ex.printStackTrace(); 321 | } 322 | } 323 | } 324 | -------------------------------------------------------------------------------- 
/Chapter04/old/TimeRegexChunker.java: -------------------------------------------------------------------------------- 1 | package packt; 2 | 3 | import com.aliasi.chunk.RegExChunker; 4 | 5 | public class TimeRegexChunker extends RegExChunker { 6 | private final static String TIME_RE = 7 | "(([0-1]?[0-9])|([2][0-3])):([0-5]?[0-9])(:([0-5]?[0-9]))?"; 8 | private final static String CHUNK_TYPE = "time"; 9 | private final static double CHUNK_SCORE = 1.0; 10 | 11 | public TimeRegexChunker() { 12 | super(TIME_RE,CHUNK_TYPE,CHUNK_SCORE); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /Chapter04/src/packt/Chapter4.java: -------------------------------------------------------------------------------- 1 | package packt; 2 | 3 | import com.aliasi.chunk.Chunk; 4 | import com.aliasi.chunk.Chunker; 5 | import com.aliasi.chunk.Chunking; 6 | import com.aliasi.dict.DictionaryEntry; 7 | import com.aliasi.dict.ExactDictionaryChunker; 8 | import com.aliasi.dict.MapDictionary; 9 | import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; 10 | import com.aliasi.util.AbstractExternalizable; 11 | import edu.stanford.nlp.ie.crf.CRFClassifier; 12 | import edu.stanford.nlp.ling.CoreAnnotations; 13 | import edu.stanford.nlp.ling.CoreLabel; 14 | import java.io.BufferedOutputStream; 15 | import java.io.File; 16 | import java.io.FileInputStream; 17 | import java.io.FileOutputStream; 18 | import java.io.IOException; 19 | import java.io.InputStream; 20 | import java.io.OutputStream; 21 | import java.util.ArrayList; 22 | import java.util.List; 23 | import java.util.Set; 24 | import java.util.regex.Matcher; 25 | import java.util.regex.Pattern; 26 | import opennlp.tools.namefind.NameFinderME; 27 | import opennlp.tools.namefind.NameSample; 28 | import opennlp.tools.namefind.NameSampleDataStream; 29 | import opennlp.tools.namefind.TokenNameFinderEvaluator; 30 | import opennlp.tools.namefind.TokenNameFinderModel; 31 | import 
opennlp.tools.sentdetect.SentenceDetectorEvaluator; 32 | import opennlp.tools.tokenize.Tokenizer; 33 | import opennlp.tools.tokenize.TokenizerME; 34 | import opennlp.tools.tokenize.TokenizerModel; 35 | import opennlp.tools.util.ObjectStream; 36 | import opennlp.tools.util.PlainTextByLineStream; 37 | import opennlp.tools.util.Span; 38 | import opennlp.tools.util.eval.FMeasure; 39 | 40 | public class Chapter4 { 41 | 42 | private static final String sentences[] = {"Joe was the last person to see Fred. ", 43 | "He saw him in Boston at McKenzie's pub at 3:00 where he paid " 44 | + "$2.45 for an ale. ", 45 | "Joe wanted to go to Vermont for the day to visit a cousin who " 46 | + "works at IBM, but Sally and he had to look for Fred"}; 47 | 48 | private static String regularExpressionText 49 | = "He left his email address (rgb@colorworks.com) and his " 50 | + "phone number,800-555-1234. We believe his current address " 51 | + "is 100 Washington Place, Seattle, CO 12345-1234. I " 52 | + "understand you can also call at 123-555-1234 between " 53 | + "8:00 AM and 4:30 most days. 
His URL is http://example.com " 54 | + "and he was born on February 25, 1954 or 2/25/1954."; 55 | 56 | private static MapDictionary dictionary; 57 | 58 | public static void main(String[] args) { 59 | usingRegularExpressions(); 60 | // usingOpenNLP(); 61 | // usingStanfordNER(); 62 | // usingLingPipeNER(); 63 | // trainingOpenNLPNERModel(); 64 | } 65 | 66 | public static File getModelDir() { 67 | return new File("C:/Current Books/NLP and Java/Models"); 68 | } 69 | 70 | private static void usingRegularExpressions() { 71 | usingJavaRegularExpressions(); 72 | // usingLingPipeRegExChunker(); 73 | // usingLingPipeRegularExpressions(); 74 | } 75 | 76 | private static void usingJavaRegularExpressions() { 77 | String phoneNumberRE = "\\d{3}-\\d{3}-\\d{4}"; 78 | String urlRegex = "\\b(https?|ftp|file|ldap)://" 79 | + "[-A-Za-z0-9+&@#/%?=~_|!:,.;]" 80 | + "*[-A-Za-z0-9+&@#/%=~_|]"; 81 | String zipCodeRegEx = "[0-9]{5}(\\-?[0-9]{4})?"; 82 | String emailRegEx = "[a-zA-Z0-9'._%+-]+@" 83 | + "(?:[a-zA-Z0-9-]+\\.)" 84 | + "+[a-zA-Z]{2,4}"; 85 | String timeRE = "([01]?[0-9]|2[0-3]):[0-5][0-9]"; 86 | String dateRE = "(0?[1-9]|[12][0-9]|3[01])/(0?[1-9]|1[012])/((19|20)\\\\d\\\\d)"; 87 | dateRE = "((0?[13578]|10|12)(-|\\/)(([1-9])|(0[1-9])|([12])([0-9]?)|(3[01]?))(-|\\/)((19)([2-9])(\\d{1})|(20)([01])(\\d{1})|([8901])(\\d{1}))|(0?[2469]|11)(-|\\/)(([1-9])|(0[1-9])|([12])([0-9]?)|(3[0]?))(-|\\/)((19)([2-9])(\\d{1})|(20)([01])(\\d{1})|([8901])(\\d{1})))"; 88 | Pattern pattern = Pattern.compile(phoneNumberRE + "|" + timeRE + "|" + emailRegEx); 89 | regularExpressionText = "(888)555-1111 888-SEL-HIGH 888-555-2222-J88-W3S"; 90 | Matcher matcher = pattern.matcher(regularExpressionText); 91 | System.out.println("---Searching ..."); 92 | while (matcher.find()) { 93 | System.out.println(matcher.group() + " [" + matcher.start() 94 | + ":" + matcher.end() + "]"); 95 | } 96 | System.out.println("---Done Searching ..."); 97 | 98 | } 99 | 100 | private static void usingLingPipeRegExChunker() { 101 
| String timeRE = "(([0-1]?[0-9])|([2][0-3])):([0-5]?[0-9])(:([0-5]?[0-9]))?"; 102 | Chunker chunker = new TimeRegexChunker(); 103 | // chunker = new RegExChunker(timeRE,"time",1.0); 104 | Chunking chunking = chunker.chunk(regularExpressionText); 105 | Set chunkSet = chunking.chunkSet(); 106 | displayChunkSet(chunker, regularExpressionText); 107 | } 108 | 109 | private static void usingLingPipeRegularExpressions() { 110 | try { 111 | File modelFile = new File(getModelDir(), 112 | "ne-en-news-muc6.AbstractCharLmRescoringChunker"); 113 | Chunker chunker = (Chunker) AbstractExternalizable.readObject(modelFile); 114 | for (int i = 0; i < sentences.length; ++i) { 115 | Chunking chunking = chunker.chunk(sentences[i]); 116 | System.out.println("Chunking=" + chunking); 117 | } 118 | for (String sentence : sentences) { 119 | displayChunkSet(chunker, sentence); 120 | } 121 | 122 | } catch (IOException | ClassNotFoundException ex) { 123 | // Handle exception 124 | } 125 | 126 | } 127 | 128 | // ------ OpenNLP----------------------------------- 129 | private static void usingOpenNLP() { 130 | System.out.println("OpenNLP Examples"); 131 | usingOpenNLPNameFinderME(); 132 | // usingMultipleNERModels(); 133 | } 134 | 135 | private static void usingOpenNLPNameFinderME() { 136 | System.out.println("OpenNLP NameFinderME Examples"); 137 | try (InputStream tokenStream = new FileInputStream( 138 | new File(getModelDir(), "en-token.bin")); 139 | InputStream modelStream = new FileInputStream( 140 | new File(getModelDir(), "en-ner-time.bin"));) { 141 | 142 | TokenizerModel tokenModel = new TokenizerModel(tokenStream); 143 | Tokenizer tokenizer = new TokenizerME(tokenModel); 144 | 145 | TokenNameFinderModel entityModel 146 | = new TokenNameFinderModel(modelStream); 147 | NameFinderME nameFinder = new NameFinderME(entityModel); 148 | 149 | // Single sentence 150 | { 151 | System.out.println("Single sentence"); 152 | StringBuilder builder = new StringBuilder(); 153 | String sentence = "He was 
the last person to see Fred."; 154 | 155 | String tokens[] = tokenizer.tokenize(sentence); 156 | Span nameSpans[] = nameFinder.find(tokens); 157 | 158 | for (int i = 0; i < nameSpans.length; i++) { 159 | System.out.println("Span: " + nameSpans[i].toString()); 160 | System.out.println("Entity: " 161 | + tokens[nameSpans[i].getStart()]); 162 | } 163 | } 164 | System.out.println(); 165 | System.out.println("Sentences"); 166 | for (String sentence : sentences) { 167 | String tokens[] = tokenizer.tokenize(sentence); 168 | Span nameSpans[] = nameFinder.find(tokens); 169 | double[] spanProbs = nameFinder.probs(nameSpans); 170 | 171 | for (int i = 0; i < nameSpans.length; i++) { 172 | System.out.println("Span: " + nameSpans[i].toString()); 173 | System.out.println("Entity: " 174 | + tokens[nameSpans[i].getStart()]); 175 | System.out.println("Probability: " + spanProbs[i]); 176 | } 177 | System.out.println(); 178 | } 179 | } catch (Exception ex) { 180 | ex.printStackTrace(); 181 | } 182 | } 183 | 184 | private static void usingMultipleNERModels() { 185 | // Models - en-ner-person.bin en-ner-location.bin en-ner-money.bin 186 | // en-ner-organization.bin en-ner-time.bin 187 | try { 188 | InputStream tokenStream = new FileInputStream( 189 | new File(getModelDir(), "en-token.bin")); 190 | 191 | TokenizerModel tokenModel = new TokenizerModel(tokenStream); 192 | Tokenizer tokenizer = new TokenizerME(tokenModel); 193 | 194 | String modelNames[] = {"en-ner-person.bin", "en-ner-location.bin", 195 | "en-ner-organization.bin"}; 196 | ArrayList list = new ArrayList(); 197 | for (String name : modelNames) { 198 | TokenNameFinderModel entityModel = new TokenNameFinderModel( 199 | new FileInputStream( 200 | new File(getModelDir(), name))); 201 | NameFinderME nameFinder = new NameFinderME(entityModel); 202 | for (int index = 0; index < sentences.length; index++) { 203 | String tokens[] = tokenizer.tokenize(sentences[index]); 204 | Span nameSpans[] = nameFinder.find(tokens); 205 | for (Span 
span : nameSpans) { 206 | list.add("Sentence: " + index 207 | + " Span: " + span.toString() + " Entity: " 208 | + tokens[span.getStart()]); 209 | } 210 | } 211 | } 212 | System.out.println("Multiple Entities"); 213 | for (String element : list) { 214 | System.out.println(element); 215 | } 216 | } catch (Exception ex) { 217 | ex.printStackTrace(); 218 | } 219 | } 220 | 221 | private static void usingStanfordNER() { 222 | String model = getModelDir() + "\\english.conll.4class.distsim.crf.ser.gz"; 223 | CRFClassifier classifier = CRFClassifier.getClassifierNoExceptions(model); 224 | 225 | String sentence = ""; 226 | for (String element : sentences) { 227 | sentence += element; 228 | } 229 | 230 | List> entityList = classifier.classify(sentence); 231 | 232 | for (List internalList : entityList) { 233 | for (CoreLabel coreLabel : internalList) { 234 | String word = coreLabel.word(); 235 | String category = coreLabel.get(CoreAnnotations.AnswerAnnotation.class); 236 | // System.out.println(word + ":" + category); 237 | if (!"O".equals(category)) { 238 | System.out.println(word + ":" + category); 239 | } 240 | 241 | } 242 | 243 | } 244 | } 245 | 246 | private static void usingLingPipeNER() { 247 | // usingLingPipeRexExChunker(); 248 | usingExactDictionaryChunker(); 249 | 250 | } 251 | 252 | private static void usingLingPipeRexExChunker() { 253 | try { 254 | File modelFile = new File(getModelDir(), 255 | "ne-en-news-muc6.AbstractCharLmRescoringChunker"); 256 | Chunker chunker 257 | = (Chunker) AbstractExternalizable.readObject(modelFile); 258 | 259 | for (String sentence : sentences) { 260 | displayChunkSet(chunker, sentence); 261 | } 262 | } catch (IOException | ClassNotFoundException ex) { 263 | ex.printStackTrace(); 264 | } 265 | } 266 | 267 | private static void displayChunkSet(Chunker chunker, String text) { 268 | Chunking chunking = chunker.chunk(text); 269 | Set set = chunking.chunkSet(); 270 | for (Chunk chunk : set) { 271 | System.out.println("Type: " + 
chunk.type() + " Entity: [" 272 | + text.substring(chunk.start(), chunk.end()) 273 | + "] Score: " + chunk.score()); 274 | } 275 | } 276 | 277 | private static void initializeDictionary() { 278 | dictionary = new MapDictionary(); 279 | dictionary.addEntry( 280 | new DictionaryEntry("Joe", "PERSON", 1.0)); 281 | dictionary.addEntry( 282 | new DictionaryEntry("Fred", "PERSON", 1.0)); 283 | dictionary.addEntry( 284 | new DictionaryEntry("Boston", "PLACE", 1.0)); 285 | dictionary.addEntry( 286 | new DictionaryEntry("pub", "PLACE", 1.0)); 287 | dictionary.addEntry( 288 | new DictionaryEntry("Vermont", "PLACE", 1.0)); 289 | dictionary.addEntry( 290 | new DictionaryEntry("IBM", "ORGANIZATION", 1.0)); 291 | dictionary.addEntry( 292 | new DictionaryEntry("Sally", "PERSON", 1.0)); 293 | } 294 | 295 | private static void usingExactDictionaryChunker() { 296 | initializeDictionary(); 297 | System.out.println("\nDICTIONARY\n" + dictionary); 298 | 299 | ExactDictionaryChunker dictionaryChunker 300 | = new ExactDictionaryChunker(dictionary, 301 | IndoEuropeanTokenizerFactory.INSTANCE, true, false); 302 | 303 | for (String sentence : sentences) { 304 | System.out.println("\nTEXT=" + sentence); 305 | displayChunkSet(dictionaryChunker, sentence); 306 | } 307 | } 308 | 309 | // Training Models 310 | private static void trainingOpenNLPNERModel() { 311 | try (OutputStream modelOutputStream = new BufferedOutputStream( 312 | new FileOutputStream(new File("modelFile")));) { 313 | ObjectStream lineStream = new PlainTextByLineStream( 314 | new FileInputStream("en-ner-person.train"), "UTF-8"); 315 | ObjectStream sampleStream = new NameSampleDataStream(lineStream); 316 | 317 | TokenNameFinderModel model = NameFinderME.train("en", "person", sampleStream, 318 | null, 100, 5); 319 | 320 | model.serialize(modelOutputStream); 321 | 322 | System.out.println("TokenNameFinderEvaluator"); 323 | TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(new NameFinderME(model)); 324 | 325 | 
lineStream = new PlainTextByLineStream( 326 | new FileInputStream("en-ner-person.eval"), "UTF-8"); 327 | sampleStream = new NameSampleDataStream(lineStream); 328 | evaluator.evaluate(sampleStream); 329 | 330 | FMeasure result = evaluator.getFMeasure(); 331 | System.out.println(result.toString()); 332 | } catch (IOException ex) { 333 | ex.printStackTrace(); 334 | } 335 | } 336 | } 337 | -------------------------------------------------------------------------------- /Chapter04/src/packt/DictionaryChunker.java: -------------------------------------------------------------------------------- 1 | package packt; 2 | 3 | import com.aliasi.chunk.Chunk; 4 | import com.aliasi.chunk.Chunker; 5 | import com.aliasi.chunk.Chunking; 6 | import com.aliasi.dict.DictionaryEntry; 7 | import com.aliasi.dict.MapDictionary; 8 | import com.aliasi.dict.TrieDictionary; 9 | import com.aliasi.dict.Dictionary; 10 | import com.aliasi.dict.ExactDictionaryChunker; 11 | 12 | import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; 13 | 14 | import java.util.Iterator; 15 | import java.util.Set; 16 | 17 | public class DictionaryChunker { 18 | private static final String sentences[] = {"Joe was the last person to see Fred. ", 19 | "He saw him in Boston at McKenzie's pub at 3:00 where he paid " 20 | + "$2.45 for an ale. 
", 21 | "Joe wanted to go to Vermont for the day to visit a cousin who " 22 | + "works at IBM, but Sally and he had to look for Fred"}; 23 | static final double CHUNK_SCORE = 1.0; 24 | 25 | public static void main(String[] args) { 26 | 27 | MapDictionary dictionary = new MapDictionary(); 28 | dictionary.addEntry(new DictionaryEntry("Joe","PERSON",CHUNK_SCORE)); 29 | dictionary.addEntry(new DictionaryEntry("Fred","PERSON",CHUNK_SCORE)); 30 | dictionary.addEntry(new DictionaryEntry("Boston","PLACE",CHUNK_SCORE)); 31 | dictionary.addEntry(new DictionaryEntry("pub","PLACE",CHUNK_SCORE)); 32 | dictionary.addEntry(new DictionaryEntry("Vermont","PLACE",CHUNK_SCORE)); 33 | dictionary.addEntry(new DictionaryEntry("IBM","ORGANIZATION",CHUNK_SCORE)); 34 | dictionary.addEntry(new DictionaryEntry("Sally","PERSON",CHUNK_SCORE)); 35 | 36 | 37 | ExactDictionaryChunker dictionaryChunkerTT 38 | = new ExactDictionaryChunker(dictionary, 39 | IndoEuropeanTokenizerFactory.INSTANCE, 40 | true,true); 41 | 42 | ExactDictionaryChunker dictionaryChunkerTF 43 | = new ExactDictionaryChunker(dictionary, 44 | IndoEuropeanTokenizerFactory.INSTANCE, 45 | true,false); 46 | 47 | ExactDictionaryChunker dictionaryChunkerFT 48 | = new ExactDictionaryChunker(dictionary, 49 | IndoEuropeanTokenizerFactory.INSTANCE, 50 | false,true); 51 | 52 | ExactDictionaryChunker dictionaryChunkerFF 53 | = new ExactDictionaryChunker(dictionary, 54 | IndoEuropeanTokenizerFactory.INSTANCE, 55 | false,false); 56 | 57 | 58 | 59 | System.out.println("\nDICTIONARY\n" + dictionary); 60 | 61 | for (int i = 0; i < sentences.length; ++i) { 62 | String text = sentences[i]; 63 | System.out.println("\n\nTEXT=" + text); 64 | 65 | chunk(dictionaryChunkerTT,text); 66 | chunk(dictionaryChunkerTF,text); 67 | chunk(dictionaryChunkerFT,text); 68 | chunk(dictionaryChunkerFF,text); 69 | } 70 | 71 | } 72 | 73 | static void chunk(ExactDictionaryChunker chunker, String text) { 74 | System.out.println("\nChunker." 
75 | + " All matches=" + chunker.returnAllMatches() 76 | + " Case sensitive=" + chunker.caseSensitive()); 77 | Chunking chunking = chunker.chunk(text); 78 | for (Chunk chunk : chunking.chunkSet()) { 79 | int start = chunk.start(); 80 | int end = chunk.end(); 81 | String type = chunk.type(); 82 | double score = chunk.score(); 83 | String phrase = text.substring(start,end); 84 | System.out.println(" phrase=|" + phrase + "|" 85 | + " start=" + start 86 | + " end=" + end 87 | + " type=" + type 88 | + " score=" + score); 89 | } 90 | } 91 | 92 | } 93 | 94 | -------------------------------------------------------------------------------- /Chapter04/src/packt/EmailRegexChunker.java: -------------------------------------------------------------------------------- 1 | package packt; 2 | 3 | import com.aliasi.chunk.RegExChunker; 4 | 5 | public class EmailRegexChunker extends RegExChunker { 6 | // From: http://alias-i.com/lingpipe/demos/tutorial/ne/read-me.html 7 | public EmailRegexChunker() { 8 | super(EMAIL_REGEX,CHUNK_TYPE,CHUNK_SCORE); 9 | } 10 | 11 | private final static String EMAIL_REGEX 12 | = "[A-Za-z0-9](([_\\.\\-]?[a-zA-Z0-9]+)*)@([A-Za-z0-9]+)(([\\.\\-]?[a-zA-Z0-9]+)*)\\.([A-Za-z]{2,})"; 13 | 14 | private final static String CHUNK_TYPE = "email"; 15 | 16 | private final static double CHUNK_SCORE = 0.0; 17 | 18 | } 19 | -------------------------------------------------------------------------------- /Chapter04/src/packt/RunChunker.java: -------------------------------------------------------------------------------- 1 | package packt; 2 | 3 | import com.aliasi.chunk.Chunker; 4 | import com.aliasi.chunk.Chunking; 5 | 6 | import com.aliasi.util.AbstractExternalizable; 7 | 8 | import java.io.File; 9 | 10 | public class RunChunker { 11 | private static final String sentences[] = {"Joe was the last person to see Fred. ", 12 | "He saw him in Boston at McKenzie's pub at 3:00 where he paid " 13 | + "$2.45 for an ale. 
/**
 * A LingPipe RegExChunker that extracts times of day (HH:MM with an
 * optional :SS suffix, 24-hour clock) as chunks of type "time".
 */
public class TimeRegexChunker extends RegExChunker {
    // Hours 0-23 (leading zero optional), minutes 0-59, optional seconds 0-59.
    private final static String TIME_RE =
        "(([0-1]?[0-9])|([2][0-3])):([0-5]?[0-9])(:([0-5]?[0-9]))?";
    // Type label attached to every chunk this chunker produces.
    private final static String CHUNK_TYPE = "time";
    // Score attached to every chunk this chunker produces.
    private final static double CHUNK_SCORE = 1.0;

    /** Builds the chunker with the fixed time pattern, type and score. */
    public TimeRegexChunker() {
        super(TIME_RE,CHUNK_TYPE,CHUNK_SCORE);
    }
}
com.aliasi.util.AbstractExternalizable; 12 | 13 | @SuppressWarnings("deprecation") 14 | public class TrainEntities { 15 | 16 | static final int MAX_N_GRAM = 50; 17 | static final int NUM_CHARS = 300; 18 | static final double LM_INTERPOLATION = MAX_N_GRAM; // default behavior 19 | 20 | public static void main(String[] args) throws IOException { 21 | File corpusFile = new File("inputfile.txt");// my annotated file 22 | File modelFile = new File("outputmodelfile.model"); 23 | 24 | System.out.println("Setting up Chunker Estimator"); 25 | TokenizerFactory factory 26 | = IndoEuropeanTokenizerFactory.INSTANCE; 27 | HmmCharLmEstimator hmmEstimator 28 | = new HmmCharLmEstimator(MAX_N_GRAM,NUM_CHARS,LM_INTERPOLATION); 29 | CharLmHmmChunker chunkerEstimator 30 | = new CharLmHmmChunker(factory,hmmEstimator); 31 | 32 | System.out.println("Setting up Data Parser"); 33 | // Muc6ChunkParser parser = new Muc6ChunkParser(); 34 | // parser.setHandler( chunkerEstimator); 35 | 36 | System.out.println("Training with Data from File=" + corpusFile); 37 | // parser.parse(corpusFile); 38 | 39 | System.out.println("Compiling and Writing Model to File=" + modelFile); 40 | AbstractExternalizable.compileTo(chunkerEstimator,modelFile); 41 | } 42 | 43 | } -------------------------------------------------------------------------------- /Chapter06/Chapter6.java: -------------------------------------------------------------------------------- 1 | package packt; 2 | 3 | import com.aliasi.classify.Classification; 4 | import com.aliasi.classify.Classified; 5 | import com.aliasi.classify.DynamicLMClassifier; 6 | import com.aliasi.classify.JointClassification; 7 | import com.aliasi.classify.JointClassifier; 8 | import com.aliasi.classify.LMClassifier; 9 | import com.aliasi.lm.NGramProcessLM; 10 | import com.aliasi.util.AbstractExternalizable; 11 | import com.aliasi.util.Compilable; 12 | import com.aliasi.util.Files; 13 | import edu.stanford.nlp.classify.Classifier; 14 | import 
edu.stanford.nlp.classify.ColumnDataClassifier; 15 | import edu.stanford.nlp.ie.crf.CRFClassifier; 16 | import edu.stanford.nlp.ling.CoreAnnotations; 17 | import edu.stanford.nlp.ling.Datum; 18 | import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations; 19 | import edu.stanford.nlp.objectbank.ObjectBank; 20 | import edu.stanford.nlp.pipeline.Annotation; 21 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 22 | import edu.stanford.nlp.sentiment.SentimentCoreAnnotations; 23 | import edu.stanford.nlp.stats.Counter; 24 | import edu.stanford.nlp.trees.Tree; 25 | import edu.stanford.nlp.util.CoreMap; 26 | import java.io.BufferedOutputStream; 27 | import java.io.File; 28 | import java.io.FileInputStream; 29 | import java.io.FileOutputStream; 30 | import java.io.IOException; 31 | import java.io.InputStream; 32 | import java.io.OutputStream; 33 | import java.util.Properties; 34 | import java.util.Set; 35 | import opennlp.tools.doccat.DoccatModel; 36 | import opennlp.tools.doccat.DocumentCategorizerME; 37 | import opennlp.tools.doccat.DocumentSample; 38 | import opennlp.tools.doccat.DocumentSampleStream; 39 | import opennlp.tools.util.ObjectStream; 40 | import opennlp.tools.util.PlainTextByLineStream; 41 | 42 | public class Chapter6 { 43 | 44 | private static String inputText = null; 45 | private static String toto = "Toto belongs to Dorothy Gale, the heroine of " 46 | + "the first and many subsequent books. In the first " 47 | + "book, he never spoke, although other animals, native " 48 | + "to Oz, did. In subsequent books, other animals " 49 | + "gained the ability to speak upon reaching Oz or " 50 | + "similar lands, but Toto remained speechless."; 51 | private static String garfield = "Garfield is a comic strip created by Jim " 52 | + "Davis. 
Published since June 19, 1978, it chronicles "
            + "the life of the title character, the cat Garfield "
            + "(named after the grandfather of Davis); his owner, "
            + "Jon Arbuckle; and Jon's dog, Odie.";

    // Sample text about calico/tortoiseshell cats; selectable as input in main().
    private static String calico = "This cat is also known as a calimanco cat or "
            + "clouded tiger cat, and by the abbreviation 'tortie'. "
            + "In the cat fancy, a tortoiseshell cat is patched "
            + "over with red (or its dilute form, cream) and black "
            + "(or its dilute blue) mottled throughout the coat.";

    // Entry point. Selects an input text (inputText/toto/garfield are fields
    // declared earlier in this file, outside this excerpt) and runs exactly one
    // of the classification demos; uncomment the others to try each toolkit.
    public static void main(String[] args) {
        inputText = toto;
//        inputText = garfield;
//        inputText = calico;
//        trainingOpenNLPModel();
//        usingOpenNLP();
//        usingStandfordClassifier();
//        usingStanfordSentimentAnalysis();
        usingLingPipe();
    }

    // Trains an OpenNLP document-categorizer model from "en-animal.train" and
    // serializes it to "en-animal.model".
    // NOTE(review): generic type parameters (e.g. ObjectStream<String>) appear
    // to have been stripped from this copy of the file - confirm against the
    // original source before compiling.
    private static void trainingOpenNLPModel() {
        DoccatModel model = null;
        try (InputStream dataIn = new FileInputStream("en-animal.train");
                OutputStream dataOut = new FileOutputStream("en-animal.model");) {
            ObjectStream lineStream
                    = new PlainTextByLineStream(dataIn, "UTF-8");
            ObjectStream sampleStream = new DocumentSampleStream(lineStream);
            model = DocumentCategorizerME.train("en", sampleStream);

            // Save the model
            // NOTE(review): modelOut (a BufferedOutputStream wrapping dataOut) is
            // never flushed or closed; try-with-resources closes only dataOut, so
            // bytes still buffered in modelOut can be lost and the serialized
            // model may be truncated. Close/flush modelOut after serialize().
            OutputStream modelOut = null;
            modelOut = new BufferedOutputStream(dataOut);
            model.serialize(modelOut);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Loads the serialized OpenNLP doccat model and categorizes inputText,
    // printing every category score, then the best category and all results.
    private static void usingOpenNLP() {
        try (InputStream modelIn = new FileInputStream(
                new File("en-animal.model"));) {
            DoccatModel model = new DoccatModel(modelIn);
            DocumentCategorizerME categorizer = new DocumentCategorizerME(model);
            double[] outcomes = categorizer.categorize(inputText);
            for (int i = 0; i < categorizer.getNumberOfCategories(); i++) {
                String category = categorizer.getCategory(i);
                System.out.println(category + " - " + outcomes[i]);
            }
            System.out.println(categorizer.getBestCategory(outcomes));
            System.out.println(categorizer.getAllResults(outcomes));
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    // Directory holding the pre-trained model files.
    // NOTE(review): machine-specific Windows path; adjust for your environment.
    public static File getModelDir() {
        return new File("C:\\Current Books in Progress\\NLP and Java\\Models");
    }

    // Stanford ColumnDataClassifier demo (note the "Standford" misspelling is
    // kept for compatibility with existing callers): trains from box.train using
    // the feature spec in box.prop, classifies every row of box.test, then
    // classifies one hand-built sample (first column empty = unknown label).
    private static void usingStandfordClassifier() {
        // String dir = "C:/Current Books in Progress/NLP and Java/Downloads/Stanford/stanford-classifier-2014-10-26/";
        ColumnDataClassifier cdc = new ColumnDataClassifier("box.prop");
        Classifier classifier
                = cdc.makeClassifier(cdc.readTrainingExamples("box.train"));
        for (String line : ObjectBank.getLineIterator("box.test", "utf-8")) {
            // instead of the method in the line below, if you have the individual elements
            // already you can use cdc.makeDatumFromStrings(String[])
            Datum datum = cdc.makeDatumFromLine(line);
            System.out.println("Datum: {" + line + "]\tPredicted Category: " + classifier.classOf(datum));
            // System.out.println(" Scores: " + classifier.scoresOf(datum));
            // Counter counter = classifier.scoresOf(datum);
            // Set set = counter.keySet();
            // for (String element : set) {
            //     System.out.printf("Scores - %-6s: %5.2f ", element, counter.getCount(element));
            // }
            // System.out.println();
        }

        System.out.println();
        String sample[] = {"", "6.90", "9.8", "15.69"};
        Datum datum = cdc.makeDatumFromStrings(sample);
        System.out.println("Category: " + classifier.classOf(datum));
    }

    // Stanford CoreNLP sentiment demo: runs the sentiment pipeline over a movie
    // review, mapping each sentence's predicted class (0-4) onto sentimentText,
    // then runs a CRF NER classifier over two fixed sentences.
    // The sam/mary strings are unused alternates for experimentation.
    private static void usingStanfordSentimentAnalysis() {
        String review = "An overly sentimental film with a somewhat "
                + "problematic message, but its sweetness and charm "
                + "are occasionally enough to approximate true depth "
                + "and grace. ";

        String sam = "Sam was an odd sort of fellow. Not prone to angry and "
                + "not prone to merriment. Overall, an odd fellow.";
        String mary = "Mary thought that custard pie was the best pie in the "
                + "world. However, she loathed chocolate pie.";
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, parse, sentiment");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation annotation = new Annotation(review);
        pipeline.annotate(annotation);

        System.out.println("---sentimentText");
        // Index i corresponds to predicted sentiment class i from the RNN model.
        String[] sentimentText = {"Very Negative", "Negative", "Neutral",
            "Positive", "Very Positive"};
        for (CoreMap sentence : annotation.get(
                CoreAnnotations.SentencesAnnotation.class)) {
            Tree tree = sentence.get(
                    SentimentCoreAnnotations.AnnotatedTree.class);
            System.out.println("---Number of children: " + tree.numChildren());
            System.out.println("[" + tree.getChild(0) + "][" + tree.getChild(1) + "]");
            tree.printLocalTree();
            int score = RNNCoreAnnotations.getPredictedClass(tree);
            System.out.println(sentimentText[score]);
        }

        // Classifer
        CRFClassifier crf
                = CRFClassifier.getClassifierNoExceptions(
                        "C:/Current Books in Progress/NLP and Java/Models"
                        + "/english.all.3class.distsim.crf.ser.gz");
        String S1 = "Good afternoon Rajat Raina, how are you today?";
        String S2 = "I go to school at Stanford University, which is located in California.";
        System.out.println(crf.classifyToString(S1));
        System.out.println(crf.classifyWithInlineXML(S2));
        System.out.println(crf.classifyToString(S2, "xml", true));

        Object classification[] = crf.classify(S2).toArray();
        for (int i = 0; i < classification.length; i++) {
            System.out.println(classification[i]);
        }
    }

    //----------------------------------------------------------------------------
    // Newsgroup categories used by the LingPipe classification demos.
    // NOTE: usingLingPipeSentimentAnalysis() REPLACES this array with
    // {"neg","pos"}, so ordering of the demo calls matters.
    private static String[] categories
            = {"soc.religion.christian",
                "talk.religion.misc",
                "alt.atheism",
                "misc.forsale"};

    // Declared but not assigned anywhere in this excerpt.
    private static JointClassifier compiledClassifier;

    // Dispatcher for the LingPipe demos; only sentiment analysis is enabled.
    private static void usingLingPipe() {
//        trainingLingPipeClassificationModels();
//        usingLingPipeModels();
        usingLingPipeSentimentAnalysis();
//        classifyLingPipeSLanguageAnalysis();
    }

    // Character n-gram size and language-model classifier shared by the LingPipe
    // demos; both are reassigned by usingLingPipeSentimentAnalysis().
    private static int nGramSize = 6;
    private static DynamicLMClassifier classifier
            = DynamicLMClassifier.createNGramProcess(categories, nGramSize);

    // Trains the LingPipe classifier on the 4-newsgroups corpus (one directory
    // per category) and compiles it to "classifier.model".
    private static void trainingLingPipeClassificationModels() {
        final String directory = "C:/Current Books/NLP and Java/Downloads/lingpipe-4.1.0/demos";
        final File trainingDirectory
                = new File(directory + "/data/fourNewsGroups/4news-train");

        for (int i = 0; i < categories.length; ++i) {
            final File classDir = new File(trainingDirectory, categories[i]);

            String[] trainingFiles = classDir.list();
            for (int j = 0; j < trainingFiles.length; ++j) {
                try {
                    File file = new File(classDir, trainingFiles[j]);
                    String text = Files.readFromFile(file, "ISO-8859-1");
                    Classification classification
                            = new Classification(categories[i]);
                    Classified classified
                            = new Classified<>((CharSequence)text, classification);
                    classifier.handle(classified);

                } catch (IOException ex) {
                    ex.printStackTrace();
                }
            }
        }

        try {
            AbstractExternalizable.compileTo(
                    (Compilable) classifier,
                    new File("classifier.model"));
        } catch (IOException ex) {
            ex.printStackTrace();
        }

    }

    // Loads the compiled "classifier.model" and classifies a sample text,
    // printing the best category and per-category scores.
    // NOTE(review): if readObject() throws, the local classifier stays null and
    // classifier.classify(text) below throws NullPointerException - the catch
    // block should return (or rethrow) instead of only printing the trace.
    private static void usingLingPipeModels() {
        String text = "Finding a home for sale has never been "
                + "easier. With Homes.com, you can search new "
                + "homes, foreclosures, multi-family homes, "
                + "as well as condos and townhouses for sale. "
                + "You can even search our real estate agent "
                + "directory to work with a professional "
                + "Realtor and find your perfect home.";
//        text = "Luther taught that salvation and subsequently "
//                + "eternity in heaven is not earned by good deeds "
//                + "but is received only as a free gift of God's "
//                + "grace through faith in Jesus Christ as redeemer "
//                + "from sin and subsequently eternity in Hell.";
        LMClassifier classifier = null;
        try {
//            text =
//                    "Homeowners may employ the services of marketing or online listing companies or market their own property but do not pay a commission and represent themselves with the help of a lawyer or Solicitor (mostly in Commonwealth) throughout the sale.";
            classifier = (LMClassifier) AbstractExternalizable.readObject(new File("classifier.model"));
        } catch (IOException | ClassNotFoundException ex) {
            ex.printStackTrace();
        }
        JointClassification classification
                = classifier.classify(text);

        System.out.println("---------------");
        System.out.println("Text: " + text);
        String bestCategory = classification.bestCategory();
        System.out.println("Best Category: " + bestCategory);
        for (int i = 0; i < categories.length; i++) {
            double score = classification.score(i);
            double probability = classification.jointLog2Probability(i);
            String category = classification.category(i);
            System.out.printf(" %3d - Category: %-24s Score: %6.2f jointLog2Probability: %6.2f%n",
                    i, category, score, probability);
        }

        // } 
        // } 
    }

    // Reconfigures the shared static state for binary sentiment classification
    // (categories becomes {"neg","pos"}, n-gram size 8, fresh classifier),
    // then trains and classifies.
    private static void usingLingPipeSentimentAnalysis() {
        categories = new String[2];
        categories[0] = "neg";
        categories[1] = "pos";
        nGramSize = 8;
        classifier = DynamicLMClassifier.createNGramProcess(categories, nGramSize);

        trainingLingPipeSentimentAnalysis();
        classifyLingPipeSentimentAnalysis();
    }

    // Trains the sentiment classifier from the txt_sentoken corpus, one
    // subdirectory per category ("neg", "pos"), one review per file.
    private static void trainingLingPipeSentimentAnalysis() {
        String directory = "C:/Current Books/NLP and Java/Downloads/Sentiment Data";
        File trainingDirectory = new File(directory, "txt_sentoken");
        System.out.println("\nTraining.");
        for (int i = 0; i < categories.length; ++i) {
            Classification classification
                    = new Classification(categories[i]);
            File file = new File(trainingDirectory, categories[i]);
            File[] trainingFiles = file.listFiles();
            for (int j = 0; j < trainingFiles.length; ++j) {
                try {
                    String review = Files.readFromFile(trainingFiles[j], "ISO-8859-1");
                    Classified classified;
                    classified = new Classified<>((CharSequence)review, classification);
                    classifier.handle(classified);
                } catch (IOException ex) {
                    ex.printStackTrace();
                }
            }
        }
    }

    // Classifies one fixed movie review with the trained sentiment classifier
    // and prints the winning category plus all known categories.
    private static void classifyLingPipeSentimentAnalysis() {
        System.out.println("---------------");
        //http://www.rottentomatoes.com/m/forrest_gump/
        String review = "An overly sentimental film with a somewhat "
                + "problematic message, but its sweetness and charm "
                + "are occasionally enough to approximate true depth "
                + "and grace. ";
        System.out.println("Text: " + review);
        Classification classification
                = classifier.classify(review);
        String bestCategory = classification.bestCategory();
        System.out.println("Best Category: " + bestCategory);

        for (String category : classifier.categories()) {
            System.out.println(category);
        }
    }

    // Language-identification demo using the pre-built Leipzig language model.
    // The English text assigned to `text` is immediately overwritten by the
    // Swedish sample, so Swedish is what actually gets classified.
    // NOTE(review): same null-classifier risk as usingLingPipeModels() if the
    // model file cannot be read.
    private static void classifyLingPipeSLanguageAnalysis() {
        System.out.println("---------------");
        //http://www.rottentomatoes.com/m/forrest_gump/
        String text = "An overly sentimental film with a somewhat "
                + "problematic message, but its sweetness and charm "
                + "are occasionally enough to approximate true depth "
                + "and grace. ";
        text = "Svenska är ett östnordiskt språk som talas av cirka "
                + "tio miljoner personer[1], främst i Finland "
                + "och Sverige.";
//        text = "¡Buenos días, clase! Good morning, class! Hola, ¿Cómo están hoy? Hello, how are you today? Adiós, ¡hasta luego! Bye, see you soon!";
        System.out.println("Text: " + text);
        LMClassifier classifier = null;
        try {
            classifier = (LMClassifier) AbstractExternalizable.readObject(
                    new File("C:/Current Books/NLP and Java/Models/langid-leipzig.classifier"));
        } catch (IOException | ClassNotFoundException ex) {
            ex.printStackTrace();
        }

        Classification classification
                = classifier.classify(text);
        String bestCategory = classification.bestCategory();
        System.out.println("Best Language: " + bestCategory);

        for (String category : classifier.categories()) {
            System.out.println(category);
        }
    }
}
--------------------------------------------------------------------------------
/Chapter06/GloveExample.java:
--------------------------------------------------------------------------------
/*
 * To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package chapter6;

import glove.GloVe;
import glove.objects.Cooccurrence;
import glove.objects.Vocabulary;
import glove.utils.Methods;
import glove.utils.Options;
import java.io.File;
import java.util.List;
import org.jblas.DoubleMatrix;

/**
 * Demonstrates training GloVe word vectors over a small local corpus
 * (src/chapter6/test.txt) and printing the words most similar to "graph".
 * NOTE(review): generic type parameters (e.g. List of Cooccurrence / String)
 * appear to have been stripped from this copy - confirm against the original.
 *
 * @author ashish
 */
public class GloveExample {
    // Builds the absolute path to src/chapter6/ under the current working
    // directory. new File(".").getAbsolutePath() ends with the two characters
    // separator + "." , which the substring below strips off; the resolved
    // base path is also echoed to stdout.
    private static String getResourcePath(){
        File currDir = new File(".");
        String path = currDir .getAbsolutePath();
        path = path.substring(0, path.length()-2);
        System.out.println(path);
        String resourcePath = path + File.separator + "src/chapter6/";
        return resourcePath;
    }
    public static void main(String args[]){
        String file = getResourcePath() + "test.txt";

        Options options = new Options();
        options.debug = true;

        // Pass 1: build the vocabulary from the corpus file.
        Vocabulary vocab = GloVe.build_vocabulary(file, options);

        // Pass 2: co-occurrence counts with window_size = 3.
        options.window_size = 3;
        List c = GloVe.build_cooccurrence(vocab, file, options);

        // Train vectors of size 10 for 10 iterations.
        options.iterations = 10;
        options.vector_size = 10;
        options.debug = true;
        DoubleMatrix W = GloVe.train(vocab, c, options);

        // Print the 15 words most similar to "graph".
        List similars = Methods.most_similar(W, vocab, "graph", 15);
        for(String similar : similars) {
            System.out.println("@" + similar);
        }

    }

}
--------------------------------------------------------------------------------
/Chapter06/NGramTest.java:
--------------------------------------------------------------------------------
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
*/
package chapter6;

import opennlp.tools.ngram.NGramModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.StringList;

/**
 * Demonstrates OpenNLP's NGramModel: tokenizes a fixed sentence on whitespace,
 * collects all n-grams of length 2 to 4, and prints each n-gram with its count.
 *
 * @author ashish
 */
public class NGramTest {
    public static void main(String args[]){
        String sampletext = "This is n-gram model";
        System.out.println(sampletext);

        // Whitespace tokenization; StringList is OpenNLP's token sequence type.
        StringList tokens = new StringList(WhitespaceTokenizer.INSTANCE.tokenize(sampletext));
        System.out.println("Tokens " + tokens);

        NGramModel nGramModel = new NGramModel();
        nGramModel.add(tokens,2,4); // minlength and maxlength

        System.out.println("Total ngrams: " + nGramModel.numberOfGrams());
        // Iterating the model yields each distinct n-gram as a StringList.
        for (StringList ngram : nGramModel) {
            System.out.println(nGramModel.getCount(ngram) + " - " + ngram);
        }
    }

}
--------------------------------------------------------------------------------
/Chapter06/box.prop:
--------------------------------------------------------------------------------
useClassFeature=true
1.realValued=true
2.realValued=true
3.realValued=true
trainFile=.box.train
testFile=.box.test
--------------------------------------------------------------------------------
/Chapter06/box.test:
--------------------------------------------------------------------------------
small 1.33 3.50 5.43
small 1.18 1.73 3.14
small 2.29 2.69 4.18
small 2.94 2.74 1.71
small 1.41 2.72 5.21
small 1.27 2.97 5.93
small 1.69 1.41 5.42
small 2.80 2.64 2.79
small 2.01 2.42 4.46
small 2.15 2.66 4.55
medium 4.10 6.31 8.71
medium 3.15 4.85 8.23
medium 3.99 4.17 6.76
medium 3.29 4.56 9.31
medium 4.45 5.33 8.46
medium 3.60 4.77 7.74
medium 3.01 6.98 10.57
medium 4.10 4.92 10.40
medium 4.42 4.20 8.85
medium 3.15 6.53 9.26
large 6.90 9.82 15.69
large 7.57 10.83 15.55
large 7.78 9.16 16.26
large 7.81 10.80
15.86 25 | large 6.62 10.44 12.50 26 | large 7.82 8.31 15.09 27 | large 6.21 9.96 12.75 28 | large 7.57 8.46 15.25 29 | large 6.01 9.35 16.64 30 | large 6.76 9.66 15.44 -------------------------------------------------------------------------------- /Chapter06/box.train: -------------------------------------------------------------------------------- 1 | small 2.34 1.60 1.50 2 | small 2.28 1.19 4.26 3 | small 1.94 1.99 3.79 4 | small 1.41 1.89 3.10 5 | small 1.36 1.99 4.98 6 | small 1.71 2.60 5.09 7 | small 1.92 3.91 1.20 8 | small 1.17 3.14 5.69 9 | small 2.68 3.05 2.30 10 | small 2.22 1.03 2.99 11 | small 1.44 2.73 5.73 12 | small 2.96 1.88 4.43 13 | small 2.90 1.09 1.17 14 | small 1.01 1.53 5.95 15 | small 2.47 2.06 5.79 16 | small 1.46 1.81 2.64 17 | small 2.58 3.47 1.18 18 | small 1.06 1.89 3.83 19 | small 2.51 1.18 2.83 20 | small 2.24 2.42 3.92 21 | medium 3.43 6.78 7.69 22 | medium 3.05 4.96 8.65 23 | medium 4.38 5.86 10.27 24 | medium 4.74 6.23 6.23 25 | medium 3.71 5.85 6.24 26 | medium 4.10 4.22 6.33 27 | medium 4.21 6.08 7.09 28 | medium 5.00 6.53 7.84 29 | medium 4.70 5.72 8.47 30 | medium 3.54 6.00 10.70 31 | medium 3.63 4.42 7.29 32 | medium 3.92 6.48 7.12 33 | medium 4.32 6.61 9.01 34 | medium 3.45 5.02 9.67 35 | medium 4.88 6.90 10.38 36 | medium 3.17 6.00 7.15 37 | medium 3.36 5.64 9.38 38 | medium 4.78 5.35 8.83 39 | medium 4.48 5.32 7.39 40 | medium 3.68 4.63 11.00 41 | large 6.52 8.74 14.16 42 | large 6.68 10.64 12.38 43 | large 6.19 8.48 15.12 44 | large 6.44 9.63 13.36 45 | large 6.71 9.71 16.72 46 | large 6.84 8.64 14.70 47 | large 6.21 8.50 15.70 48 | large 7.60 8.90 13.25 49 | large 6.12 9.92 12.66 50 | large 6.58 9.91 13.19 51 | large 6.03 10.63 12.25 52 | large 6.12 8.74 16.00 53 | large 6.36 9.13 14.17 54 | large 7.93 10.94 15.48 55 | large 6.40 10.21 14.36 56 | large 6.41 9.94 12.70 57 | large 6.46 10.37 12.80 58 | large 7.25 10.81 15.67 59 | large 6.04 10.22 14.28 60 | large 7.63 9.51 14.96 
-------------------------------------------------------------------------------- /Chapter06/en-animal.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter06/en-animal.model -------------------------------------------------------------------------------- /Chapter06/en-animal.train: -------------------------------------------------------------------------------- 1 | dog The most widespread form of interspecies bonding occurs between humans and dogs and the keeping of dogs as companions, particularly by elites, has a long history. (As a possible example, at the Natufian culture site of Ain Mallaha in Israel, dated to 12,000 BC, the remains of an elderly human and a four-to-five-month-old puppy were found buried together). However, pet dog populations grew significantly after World War II as suburbanization increased. In the 1950s and 1960s, dogs were kept outside more often than they tend to be today (using the expression "in the doghouse" to describe exclusion from the group signifies the distance between the doghouse and the home) and were still primarily functional, acting as a guard, children's playmate, or walking companion. From the 1980s, there have been changes in the role of the pet dog, such as the increased role of dogs in the emotional support of their human guardians. People and dogs have become increasingly integrated and implicated in each other's lives, to the point where pet dogs actively shape the way a family and home are experienced. 2 | dog There have been two major trends in the changing status of pet dogs. The first has been the 'commodification' of the dog, shaping it to conform to human expectations of personality and behaviour. 
The second has been the broadening of the concept of the family and the home to include dogs-as-dogs within everyday routines and practices. 3 | dog There are a vast range of commodity forms available to transform a pet dog into an ideal companion. The list of goods, services and places available is enormous: from dog perfumes, couture, furniture and housing, to dog groomers, therapists, trainers and caretakers, dog cafes, spas, parks and beaches, and dog hotels, airlines and cemeteries. While dog training as an organized activity can be traced back to the 18th century, in the last decades of the 20th century it became a high profile issue as many normal dog behaviors such as barking, jumping up, digging, rolling in dung, fighting, and urine marking[further explanation needed] became increasingly incompatible with the new role of a pet dog. Dog training books, classes and television programs proliferated as the process of commodifying the pet dog continued. 4 | dog An Australian Cattle Dog in reindeer antlers sits on Santa's lap 5 | dog A pet dog taking part in Christmas traditions 6 | dog The majority of contemporary people with dogs describe their pet as part of the family, although some ambivalence about the relationship is evident in the popular reconceptualization of the dog–human family as a pack. A dominance model of dog–human relationships has been promoted by some dog trainers, such as on the television program Dog Whisperer. However it has been disputed that "trying to achieve status" is characteristic of dog–human interactions. Pet dogs play an active role in family life; for example, a study of conversations in dog–human families showed how family members use the dog as a resource, talking to the dog, or talking through the dog, to mediate their interactions with each other. 
7 | dog Another study of dogs' roles in families showed many dogs have set tasks or routines undertaken as family members, the most common of which was helping with the washing-up by licking the plates in the dishwasher, and bringing in the newspaper from the lawn. Increasingly, human family members are engaging in activities centered on the perceived needs and interests of the dog, or in which the dog is an integral partner, such as Dog Dancing and Doga. 8 | dog According to statistics published by the American Pet Products Manufacturers Association in the National Pet Owner Survey in 2009–2010, it is estimated there are 77.5 million people with pet dogs in the United States.[49] The same survey shows nearly 40% of American households own at least one dog, of which 67% own just one dog, 25% two dogs and nearly 9% more than two dogs. There does not seem to be any gender preference among dogs as pets, as the statistical data reveal an equal number of female and male dog pets. Yet, although several programs are undergoing to promote pet adoption, less than a fifth of the owned dogs come from a shelter. 9 | dog The latest study using Magnetic resonance imaging (MRI) to humans and dogs together proved that dogs have same response to voices and use the same parts of the brain as humans to do so. This gives dogs the ability to recognize emotional human sounds, making them friendly social pets to humans. 10 | cat Cats are common pets in Europe and North America, and their worldwide population is difficult to ascertain, with estimates ranging from anywhere between 200 million to 600 million. In 1998 there were around 76 million cats in Europe, 7 million in Japan and 3 million in Australia.:4 A 2007 report stated that about 37 million US households owned cats, with an average of 2.2 cats per household giving a total population of around 82 million; in contrast, there are about 72 million pet dogs in that country. 
Cats exceeded dogs in number as pets in the United States in 1985 for the first time, in part because the development of kitty litter in the mid-20th century eliminated the unpleasantly powerful smell of cat urine. 11 | cat Although cat ownership has commonly been associated with women, a 2007 Gallup poll reported that men and women were equally likely to own a cat. The ratio of pedigree/purebred cats to random-bred cats varies from country to country. However, generally speaking, purebreds are less than 10% of the total population. 12 | cat The concept of a cat breed appeared in Britain during the late 19th century. The current list of cat breeds is quite large: with the Cat Fanciers' Association recognizing 41 breeds, of which 16 are "natural breeds" that probably emerged before humans began breeding pedigree cats, while the others were developed over the latter half of the 20th century. The owners and breeders of show cats compete to see whose animal bears the closest resemblance to the "ideal" definition and standard of the breed (see selective breeding). Because of common crossbreeding in populated areas, many cats are simply identified as belonging to the homogeneous breeds of domestic longhair and domestic shorthair, depending on their type of fur. In the United Kingdom and Australasia, non-purebred cats are referred in slang as moggies (derived from "Maggie", short for Margaret, reputed to have been a common name for cows and calves in 18th century England and latter applied to housecats during the Victorian era). In the United States, a non-purebred cat is sometimes referred to as a barn or alley cat, even if it is not a stray. 13 | cat Cats come in a variety of colors and patterns. These are physical properties and should not be confused with a breed of cat. Furthermore, cats may show the color and/or pattern particular to a certain breed without actually being of that breed. For example, cats may have point coloration, but not be Siamese. 
14 | cat A natural behavior in cats is to hook their front claws periodically into suitable surfaces and pull backwards. Cats, like humans, keep their muscles trim by stretching. However, a cat cannot keep his claw muscles in trim by this method.[citation needed] Cats, therefore, have found another method, as described above. Additionally, such periodic scratching serves to clean and sharpen their claws. Indoor cats may benefit from being provided with a scratching post so that they are less likely to use carpet or furniture, which they can easily ruin. However, cats may simply ignore such a device. Commercial scratching posts typically are covered in carpeting or upholstery, but some authorities[who?] advise against this practice, as not making it clear to the cat which surfaces are permissible and which are not; they suggest using a plain wooden surface, or reversing the carpeting on the posts so that the rougher texture of the carpet backing is a more attractive alternative to the cat than the floor covering. However, see the comment above about claw muscles. Scratching posts made of sisal rope or corrugated cardboard are also common. 15 | cat Although scratching can serve cats to keep their claws from growing excessively long, their nails can be trimmed if necessary. Another response to indoor scratching is onychectomy, commonly known as declawing. This is a surgical procedure to remove the claw and first bone of each digit of a cat's paws. Declawing is most commonly only performed on the front feet. A related procedure is tendonectomy, which involves cutting a tendon needed for cats to extend their claws. Declawing is a major surgical procedure and can produce pain, and infections. 
-------------------------------------------------------------------------------- /Chapter07/Chapter7.java: -------------------------------------------------------------------------------- 1 | package packt; 2 | 3 | import edu.stanford.nlp.dcoref.CorefChain; 4 | import edu.stanford.nlp.dcoref.CorefChain.CorefMention; 5 | import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation; 6 | import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations; 7 | import edu.stanford.nlp.ie.machinereading.structure.RelationMention; 8 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; 9 | import edu.stanford.nlp.ling.CoreLabel; 10 | import edu.stanford.nlp.ling.Sentence; 11 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser; 12 | import edu.stanford.nlp.pipeline.Annotation; 13 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 14 | import edu.stanford.nlp.process.CoreLabelTokenFactory; 15 | import edu.stanford.nlp.process.PTBTokenizer; 16 | import edu.stanford.nlp.process.Tokenizer; 17 | import edu.stanford.nlp.process.TokenizerFactory; 18 | import edu.stanford.nlp.trees.GrammaticalStructure; 19 | import edu.stanford.nlp.trees.GrammaticalStructureFactory; 20 | import edu.stanford.nlp.trees.Tree; 21 | import edu.stanford.nlp.trees.TreebankLanguagePack; 22 | import edu.stanford.nlp.trees.TypedDependency; 23 | import edu.stanford.nlp.util.CoreMap; 24 | import java.io.BufferedReader; 25 | import java.io.FileInputStream; 26 | import java.io.FileReader; 27 | import java.io.IOException; 28 | import java.io.InputStream; 29 | import java.io.StringReader; 30 | import java.util.ArrayList; 31 | import java.util.HashSet; 32 | import java.util.Iterator; 33 | import java.util.List; 34 | import java.util.Map; 35 | import java.util.Properties; 36 | import java.util.Set; 37 | import opennlp.tools.cmdline.parser.ParserTool; 38 | import opennlp.tools.parser.Parse; 39 | import opennlp.tools.parser.ParserFactory; 40 | import 
opennlp.tools.parser.ParserModel;
import opennlp.tools.parser.Parser;
import opennlp.tools.tokenize.SimpleTokenizer;

// Parsing and coreference demos for Chapter 7 (OpenNLP chunking parser and
// Stanford lexicalized parser / dcoref pipeline).
// NOTE(review): generic type parameters (e.g. Set<String>, List<CoreLabel>,
// Map<Integer, CorefChain>) appear to have been stripped from this copy of the
// file - confirm against the original source; as written, several raw-typed
// assignments below would not compile.
public class Chapter7 {

    // Entry point: runs the OpenNLP demo; the Stanford demos are reached via
    // the (commented-out) usingStanford() dispatcher.
    public static void main(String[] args) {
        usingOpenNLP();
//        usingStanford();
    }

    // Declared for collecting noun phrases; not populated in this excerpt.
    static Set nounPhrases = new HashSet<>();

    // Parses a fixed sentence with OpenNLP's chunking parser
    // (en-parser-chunking.bin from getModelDir()) and prints the top 3 parses.
    private static void usingOpenNLP() {
        String fileLocation = getModelDir() + "/en-parser-chunking.bin";
        System.out.println(fileLocation);
        try (InputStream modelInputStream = new FileInputStream(fileLocation);) {
            ParserModel model = new ParserModel(modelInputStream);
            Parser parser = ParserFactory.create(model);
            String sentence = "The cow jumped over the moon";
            // Used to demonstrate difference between NER and Parser
            sentence = "He was the last person to see Fred.";

            Parse parses[] = ParserTool.parseLine(sentence, parser, 3);
            for (Parse parse : parses) {
                // First display
                parse.show();
                // Second display
//                parse.showCodeTree();
                // Third display
//                System.out.println("Children");
//                Parse children[] = parse.getChildren();
//                for (Parse parseElement : children) {
//                    System.out.println(parseElement);
//                    System.out.println(parseElement.getText());
//                    System.out.println(parseElement.getType());
//                    Parse tags[] = parseElement.getTagNodes();
//                    System.out.println("Tags");
//                    for (Parse tag : tags) {
//                        System.out.println("[" + tag + "]" + " type: " + tag.getType()
//                                + " Probability: " + tag.getProb()
//                                + " Label: " + tag.getLabel());
//                    }
//                }
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    // Directory holding the pre-trained model files.
    // NOTE(review): machine-specific Windows path; adjust for your environment.
    public static String getModelDir() {
        return "C:/Current Books/NLP and Java/Models";
    }

    // Alternate dispatcher (not called from main in this excerpt).
    private static void usingStanfordParsers() {
        usingStanford();
//        usingStanfordLexicalizedParser();
    }

    // Dispatcher for the Stanford demos; only the lexicalized parser is enabled.
    private static void usingStanford() {
        usingStanfordLexicalizedParser();
//        usingStanfordRelationExtraction();
//        usingStanfordCoreferenceResolution();
//        extractingRelations();
    }

    // Stanford LexicalizedParser demo: parses the same sentence two ways
    // (pre-tokenized word list, then an explicit PTBTokenizer) and prints the
    // penn-style tree plus CC-processed typed dependencies.
    private static void usingStanfordLexicalizedParser() {
        String parserModel = "C:/Current Books in Progress/NLP and Java/Models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
        LexicalizedParser lexicalizedParser = LexicalizedParser.loadModel(parserModel);

        // This option shows parsing a list of correctly tokenized words
        System.out.println("---First option");
        String[] senetenceArray = {"The", "cow", "jumped", "over", "the", "moon", "."};
        List words = Sentence.toCoreLabelList(senetenceArray);

        Tree parseTree = lexicalizedParser.apply(words);
        parseTree.pennPrint();
        System.out.println();

        // This option shows loading and using an explicit tokenizer
        System.out.println("---Second option");
        String sentence = "The cow jumped over the moon.";
        TokenizerFactory tokenizerFactory
                = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        Tokenizer tokenizer
                = tokenizerFactory.getTokenizer(new StringReader(sentence));
        List wordList = tokenizer.tokenize();
        parseTree = lexicalizedParser.apply(wordList);

        TreebankLanguagePack tlp = lexicalizedParser.treebankLanguagePack(); // PennTreebankLanguagePack for English
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
        GrammaticalStructure gs = gsf.newGrammaticalStructure(parseTree);
        List tdl = gs.typedDependenciesCCprocessed();
        System.out.println(tdl);
        for (TypedDependency dependency : tdl) {
            System.out.println("Governor Word: [" + dependency.gov() + "] Relation: [" + dependency.reln().getLongName()
                    + "] Dependent Word: [" + dependency.dep() + "]");
        }
        System.out.println();

        // You can also use a TreePrint object to print trees and dependencies
//        System.out.println("---Using TreePrint");
//        TreePrint treePrint = new TreePrint("penn,typedDependenciesCollapsed");
//        treePrint.printTree(parseTree);
//        System.out.println("TreePrint Formats");
//        for (String format : TreePrint.outputTreeFormats) {
//            System.out.println(format);
//        }
//        System.out.println();
    }

    // Runs the full CoreNLP dcoref pipeline over one sentence and walks every
    // coreference chain, printing the representative mention and each mention
    // with its type, gender and token span.
    private static void usingStanfordCoreferenceResolution() {
        System.out.println("StanfordCoreferenceResolution");
        String sentence = "He took his cash and she took her change "
                + "and together they bought their lunch.";
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation annotation = new Annotation(sentence);
        pipeline.annotate(annotation);
        System.out.println("Sentence: " + sentence);
        Map corefChainMap = annotation.get(CorefChainAnnotation.class);

        Set set = corefChainMap.keySet();
        Iterator setIterator = set.iterator();
        while(setIterator.hasNext()) {
            CorefChain corefChain = corefChainMap.get(setIterator.next());
            System.out.println("CorefChain: " + corefChain);
            System.out.print("ClusterId: " + corefChain.getChainID());
            CorefMention mention = corefChain.getRepresentativeMention();
            System.out.println(" CorefMention: " + mention + " Span: [" + mention.mentionSpan + "]");

            List mentionList = corefChain.getMentionsInTextualOrder();
            Iterator mentionIterator = mentionList.iterator();
            while(mentionIterator.hasNext()) {
                CorefMention cfm = mentionIterator.next();
                // NOTE(review): bug - this prints the REPRESENTATIVE mention's
                // span (mention.mentionSpan) for every element; it should print
                // cfm.mentionSpan for the current mention.
                System.out.println("\tMention: " + cfm + " Span: [" + mention.mentionSpan + "]");
                System.out.print("\tMention Type: " + cfm.mentionType + " Gender: " + cfm.gender);
                System.out.println(" Start: " + cfm.startIndex + " End: " + cfm.endIndex);
            }
            System.out.println();
        }
    }

    // Parses a "Who ..." question, prints its typed dependencies, then scans
    // them for a nominal-subject relation governed by "who" to detect the
    // question type. ("32rd" in the sample strings is the book's own text.)
    private static void extractingRelations() {
        String question = "Who is the 32rd president of the United States?";
//        question = "Who was the 32rd president of the United States?";
//        question = "The 32rd president of the United States was who?";
//        question = "The 32rd president is who of the United States?";
//        question = "What was the 3rd President's party?";
//        question = "When was the 12th president inaugurated";
//        question = "Where is the 30th president's home town?";

        String parserModel = "C:/Current Books/NLP and Java/Models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
        LexicalizedParser lexicalizedParser = LexicalizedParser.loadModel(parserModel);

        TokenizerFactory tokenizerFactory
                = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        Tokenizer tokenizer
                = tokenizerFactory.getTokenizer(new StringReader(question));
        List wordList = tokenizer.tokenize();
        Tree parseTree = lexicalizedParser.apply(wordList);

        TreebankLanguagePack tlp = lexicalizedParser.treebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
        GrammaticalStructure gs = gsf.newGrammaticalStructure(parseTree);
        List tdl = gs.typedDependenciesCCprocessed();
        System.out.println(tdl);
        for (TypedDependency dependency : tdl) {
            System.out.println("Governor Word: [" + dependency.gov() + "] Relation: [" + dependency.reln().getLongName()
                    + "] Dependent Word: [" + dependency.dep() + "]");
        }

        System.out.println();
        System.out.println("You asked: " + question);
        for (TypedDependency dependency : tdl) {
            if ("nominal subject".equals(dependency.reln().getLongName())
                    && "who".equalsIgnoreCase(dependency.gov().originalText())) {
                System.out.println("Found Who question --- Governor Word: [" + dependency.gov() + "] Relation: [" +
dependency.reln().getLongName() 220 | + "] Dependent Word: [" + dependency.dep() + "]"); 221 | processWhoQuestion(tdl); 222 | } else if ("nominal subject".equals(dependency.reln().getLongName()) 223 | && "what".equalsIgnoreCase(dependency.gov().originalText())) { 224 | System.out.println("Found What question --- Governor Word: [" + dependency.gov() + "] Relation: [" + dependency.reln().getLongName() 225 | + "] Dependent Word: [" + dependency.dep() + "]"); 226 | } else if ("adverbial modifier".equals(dependency.reln().getLongName()) 227 | && "when".equalsIgnoreCase(dependency.dep().originalText())) { 228 | System.out.println("Found When question --- Governor Word: [" + dependency.gov() + "] Relation: [" + dependency.reln().getLongName() 229 | + "] Dependent Word: [" + dependency.dep() + "]"); 230 | } else if ("adverbial modifier".equals(dependency.reln().getLongName()) 231 | && "where".equalsIgnoreCase(dependency.dep().originalText())) { 232 | System.out.println("Found Where question --- Governor Word: [" + dependency.gov() + "] Relation: [" + dependency.reln().getLongName() 233 | + "] Dependent Word: [" + dependency.dep() + "]"); 234 | } 235 | } 236 | } 237 | 238 | private static void processWhoQuestion(List tdl) { 239 | System.out.println("Processing Who Question"); 240 | List list = createPresidentList(); 241 | for (TypedDependency dependency : tdl) { 242 | if ("president".equalsIgnoreCase(dependency.gov().originalText()) 243 | && "adjectival modifier".equals(dependency.reln().getLongName())) { 244 | String positionText = dependency.dep().originalText(); 245 | int position = getOrder(positionText) - 1; 246 | System.out.println("The president is " + list.get(position).getName()); 247 | } 248 | } 249 | } 250 | 251 | private static int getOrder(String position) { 252 | String tmp = ""; 253 | int i = 0; 254 | while (Character.isDigit(position.charAt(i))) { 255 | tmp += position.charAt(i++); 256 | } 257 | return Integer.parseInt(tmp); 258 | } 259 | 260 | private 
static List createPresidentList() { 261 | ArrayList list = new ArrayList<>(); 262 | String line = null; 263 | try (FileReader reader = new FileReader("PresidentList"); 264 | BufferedReader br = new BufferedReader(reader)) { 265 | while ((line = br.readLine()) != null) { 266 | SimpleTokenizer simpleTokenizer = SimpleTokenizer.INSTANCE; 267 | String tokens[] = simpleTokenizer.tokenize(line); 268 | String name = ""; 269 | String start = ""; 270 | String end = ""; 271 | int i = 0; 272 | while (!"(".equals(tokens[i])) { 273 | name += tokens[i] + " "; 274 | i++; 275 | } 276 | start = tokens[i + 1]; 277 | end = tokens[i + 3]; 278 | if (end.equalsIgnoreCase("present")) { 279 | end = start; 280 | } 281 | list.add(new President(name, Integer.parseInt(start), 282 | Integer.parseInt(end))); 283 | } 284 | } catch (IOException ex) { 285 | ex.printStackTrace(); 286 | } 287 | return list; 288 | } 289 | } 290 | -------------------------------------------------------------------------------- /Chapter07/President.java: -------------------------------------------------------------------------------- 1 | package packt; 2 | 3 | public class President { 4 | private String name; 5 | private int start; 6 | private int end; 7 | 8 | public President(String name, int start, int end) { 9 | this.name = name; 10 | this.start = start; 11 | this.end = end; 12 | } 13 | 14 | public int getStart() { 15 | return start; 16 | } 17 | 18 | public int getEnd() { 19 | return end; 20 | } 21 | 22 | public String getName() { 23 | return name; 24 | } 25 | 26 | 27 | } 28 | -------------------------------------------------------------------------------- /Chapter08/Chapter8.java: -------------------------------------------------------------------------------- 1 | package packt; 2 | 3 | import de.l3s.boilerpipe.BoilerpipeProcessingException; 4 | import de.l3s.boilerpipe.document.TextBlock; 5 | import de.l3s.boilerpipe.document.TextDocument; 6 | import de.l3s.boilerpipe.sax.BoilerpipeSAXInput; 7 | import 
de.l3s.boilerpipe.sax.HTMLDocument; 8 | import de.l3s.boilerpipe.sax.HTMLFetcher; 9 | import edu.stanford.nlp.dcoref.CorefChain; 10 | import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation; 11 | import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation; 12 | import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; 13 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; 14 | import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; 15 | import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; 16 | import edu.stanford.nlp.ling.CoreLabel; 17 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser; 18 | import edu.stanford.nlp.pipeline.Annotation; 19 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 20 | import edu.stanford.nlp.semgraph.SemanticGraph; 21 | import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation; 22 | import edu.stanford.nlp.trees.Tree; 23 | import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; 24 | import edu.stanford.nlp.util.CoreMap; 25 | import java.io.BufferedReader; 26 | import java.io.File; 27 | import java.io.FileInputStream; 28 | import java.io.FileNotFoundException; 29 | import java.io.FileReader; 30 | import java.io.IOException; 31 | import java.io.InputStream; 32 | import java.net.MalformedURLException; 33 | import java.net.URL; 34 | import java.util.ArrayList; 35 | import java.util.HashMap; 36 | import java.util.List; 37 | import java.util.Map; 38 | import java.util.Properties; 39 | import java.util.Set; 40 | import opennlp.tools.sentdetect.SentenceDetectorME; 41 | import opennlp.tools.sentdetect.SentenceModel; 42 | import opennlp.tools.tokenize.WhitespaceTokenizer; 43 | import org.apache.pdfbox.pdmodel.PDDocument; 44 | import org.apache.pdfbox.util.PDFTextStripper; 45 | import org.apache.poi.POITextExtractor; 46 | import org.apache.poi.POIXMLProperties.CoreProperties; 47 | import 
org.apache.poi.POIXMLProperties.CustomProperties; 48 | import org.apache.poi.POIXMLProperties.ExtendedProperties; 49 | import org.apache.poi.POIXMLPropertiesTextExtractor; 50 | import org.apache.poi.extractor.ExtractorFactory; 51 | import org.apache.poi.openxml4j.exceptions.OpenXML4JException; 52 | import org.apache.poi.xwpf.usermodel.XWPFDocument; 53 | import org.apache.xmlbeans.XmlException; 54 | import org.xml.sax.InputSource; 55 | import org.xml.sax.SAXException; 56 | 57 | public class Chapter8 { 58 | 59 | public static void main(String[] args) { 60 | extractingText(); 61 | searches(); 62 | usingStanfordPipeline(); 63 | usingStanfordPipelineParallel(); 64 | } 65 | 66 | private static void usingStanfordPipeline() { 67 | String text = "The robber took the cash and ran."; 68 | Properties props = new Properties(); 69 | props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); 70 | // String path = "C:\\Current Books\\NLP and Java\\Downloads\\stanford-ner-2014-10-26\\classifiers"; 71 | // props.put("ner.model",path+"/english.muc.7class.distsim.crf.ser.gz"); 72 | StanfordCoreNLP pipeline = new StanfordCoreNLP(props); 73 | 74 | Annotation annotation = new Annotation(text); 75 | System.out.println("Before annotate method executed "); 76 | Set> annotationSet = annotation.keySet(); 77 | for (Class c : annotationSet) { 78 | System.out.println("\tClass: " + c.getCanonicalName()); 79 | } 80 | pipeline.annotate(annotation); 81 | System.out.println("After annotate method executed "); 82 | annotationSet = annotation.keySet(); 83 | for (Class c : annotationSet) { 84 | System.out.println("\tClass: " + c.getCanonicalName()); 85 | } 86 | 87 | System.out.println("Total time: " + pipeline.timingInformation()); 88 | List sentences = annotation.get(SentencesAnnotation.class); 89 | 90 | for (CoreMap sentence : sentences) { 91 | for (CoreLabel token : sentence.get(TokensAnnotation.class)) { 92 | String word = token.get(TextAnnotation.class); 93 | 
System.out.println("text of the token: " + word); 94 | String pos = token.get(PartOfSpeechAnnotation.class); 95 | System.out.println("POS Tag: " + pos); 96 | String ne = token.get(NamedEntityTagAnnotation.class); 97 | System.out.println("ne: " + ne); 98 | Map graph 99 | = token.get(CorefChainAnnotation.class); 100 | System.out.println("graph: " + graph); 101 | } 102 | Tree tree = sentence.get(TreeAnnotation.class); 103 | System.out.println("tree: " + tree); 104 | 105 | SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class); 106 | System.out.println("dependencies: " + dependencies); 107 | 108 | Map graph 109 | = annotation.get(CorefChainAnnotation.class); 110 | System.out.println("graph: " + graph); 111 | } 112 | } 113 | 114 | private static void usingStanfordPipelineParallel() { 115 | Properties props = new Properties(); 116 | props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); 117 | String path = "C:\\Current Books\\NLP and Java\\Downloads\\stanford-ner-2014-10-26\\classifiers"; 118 | props.put("ner.model", path + "/english.muc.7class.distsim.crf.ser.gz"); 119 | StanfordCoreNLP pipeline = new StanfordCoreNLP(props); 120 | 121 | Annotation annotation1 = new Annotation("The robber took the cash and ran."); 122 | Annotation annotation2 = new Annotation("The policeman chased him down the street."); 123 | Annotation annotation3 = new Annotation("A passerby, watching the action, tripped the thief as he passed by."); 124 | Annotation annotation4 = new Annotation("They all lived happily everafter, except for the thief of course."); 125 | ArrayList list = new ArrayList(); 126 | list.add(annotation1); 127 | list.add(annotation2); 128 | list.add(annotation3); 129 | list.add(annotation4); 130 | Iterable iterable = list; 131 | 132 | pipeline.annotate(iterable); 133 | 134 | System.out.println("Total time: " + pipeline.timingInformation()); 135 | List sentences = annotation2.get(SentencesAnnotation.class); 136 | 137 | 
for (CoreMap sentence : sentences) { 138 | for (CoreLabel token : sentence.get(TokensAnnotation.class)) { 139 | String word = token.get(TextAnnotation.class); 140 | String pos = token.get(PartOfSpeechAnnotation.class); 141 | System.out.println("Word: " + word + " POS Tag: " + pos); 142 | } 143 | } 144 | } 145 | 146 | private static void searches() { 147 | try (InputStream is = new FileInputStream( 148 | new File("C:/Current Books/NLP and Java/Models/en-sent.bin")); 149 | FileReader fr = new FileReader("Twenty Thousands.txt"); 150 | BufferedReader br = new BufferedReader(fr)) { 151 | SentenceModel model = new SentenceModel(is); 152 | SentenceDetectorME detector = new SentenceDetectorME(model); 153 | String line; 154 | StringBuilder sb = new StringBuilder(); 155 | while ((line = br.readLine()) != null) { 156 | sb.append(line + " "); 157 | } 158 | String sentences[] = detector.sentDetect(sb.toString()); 159 | System.out.println(sentences.length); 160 | // Convert each character to lowercase 161 | for (int i = 0; i < sentences.length; i++) { 162 | sentences[i] = sentences[i].toLowerCase(); 163 | } 164 | 165 | // Remove stopwords 166 | StopWords stopWords = new StopWords("stop-words_english_2_en.txt"); 167 | for (int i = 0; i < sentences.length; i++) { 168 | sentences[i] = stopWords.removeStopWords(sentences[i]); 169 | } 170 | 171 | // Create map 172 | HashMap wordMap = new HashMap(); 173 | for (int sentenceIndex = 0; sentenceIndex < sentences.length; sentenceIndex++) { 174 | String words[] = WhitespaceTokenizer.INSTANCE.tokenize(sentences[sentenceIndex]); 175 | Word word; 176 | for (int wordIndex = 0; wordIndex < words.length; wordIndex++) { 177 | String newWord = words[wordIndex]; 178 | if (wordMap.containsKey(newWord)) { 179 | word = wordMap.remove(newWord); 180 | } else { 181 | word = new Word(); 182 | } 183 | word.addWord(newWord, sentenceIndex, wordIndex); 184 | wordMap.put(newWord, word); 185 | } 186 | } 187 | System.out.println(wordMap.size()); 188 | 189 | // 
Locate word in document 190 | Word word = wordMap.get("reef"); 191 | ArrayList positions = word.getPositions(); 192 | for (Positions position : positions) { 193 | System.out.println(word.getWord() + " is found at line " 194 | + position.sentence + ", word " + position.position); 195 | } 196 | } catch (FileNotFoundException ex) { 197 | ex.printStackTrace(); 198 | } catch (IOException ex) { 199 | ex.printStackTrace(); 200 | } 201 | 202 | } 203 | 204 | private static void extractingText() { 205 | usingBoilerpipe(); 206 | usingPOI(); 207 | usingPDFBox(); 208 | } 209 | 210 | private static void usingBoilerpipe() { 211 | try { 212 | URL url = new URL("http://en.wikipedia.org/wiki/Berlin"); 213 | HTMLDocument htmlDoc = HTMLFetcher.fetch(url); 214 | InputSource is = htmlDoc.toInputSource(); 215 | TextDocument document 216 | = new BoilerpipeSAXInput(is).getTextDocument(); 217 | 218 | System.out.println(document.getText(true, true)); 219 | 220 | System.out.println("--------------------------------"); 221 | List blocks = document.getTextBlocks(); 222 | for (TextBlock block : blocks) { 223 | System.out.println(block.isContent()); 224 | System.out.println(block.getText()); 225 | System.out.println(block.getNumWords()); 226 | System.out.println("------"); 227 | } 228 | } catch (MalformedURLException ex) { 229 | ex.printStackTrace(); 230 | } catch (BoilerpipeProcessingException | SAXException | IOException ex) { 231 | ex.printStackTrace(); 232 | } 233 | } 234 | 235 | private static void usingPDFBox() { 236 | try { 237 | File file = new File("TestDocument.pdf"); 238 | PDDocument pdDocument = PDDocument.load(file); 239 | PDFTextStripper stripper = new PDFTextStripper(); 240 | String text = stripper.getText(pdDocument); 241 | System.out.println(text); 242 | pdDocument.close(); 243 | } catch (IOException ex) { 244 | ex.printStackTrace(); 245 | } 246 | } 247 | 248 | private static void usingPOI() { 249 | try { 250 | FileInputStream fis = new FileInputStream("TestDocument.docx"); 251 | 
POITextExtractor textExtractor = ExtractorFactory.createExtractor(fis); 252 | System.out.println(textExtractor.getText()); 253 | 254 | POITextExtractor metaExtractor = textExtractor.getMetadataTextExtractor(); 255 | System.out.println(metaExtractor.getText()); 256 | System.out.println(); 257 | 258 | fis = new FileInputStream("TestDocument.docx"); 259 | POIXMLPropertiesTextExtractor properties = new POIXMLPropertiesTextExtractor(new XWPFDocument(fis)); 260 | System.out.println(properties.getText()); 261 | System.out.println(); 262 | 263 | CoreProperties coreProperties = properties.getCoreProperties(); 264 | System.out.println("Core Properties"); 265 | System.out.println(properties.getCorePropertiesText()); 266 | 267 | System.out.println(); 268 | System.out.println("Creator: " + coreProperties.getCreator()); 269 | System.out.println("Date Created: " + coreProperties.getCreated()); 270 | System.out.println("Date Last Modified: " + coreProperties.getModified()); 271 | 272 | System.out.println(); 273 | System.out.println("Extended Properties"); 274 | ExtendedProperties extendedProperties = properties.getExtendedProperties(); 275 | System.out.println(properties.getExtendedPropertiesText()); 276 | System.out.println(); 277 | System.out.println("Application: " + extendedProperties.getApplication()); 278 | System.out.println("Application Version: " + extendedProperties.getAppVersion()); 279 | System.out.println("Pages: " + extendedProperties.getPages()); 280 | 281 | System.out.println(); 282 | System.out.println("Custom Properties: " ); 283 | System.out.println(properties.getCustomPropertiesText()); 284 | } catch (IOException ex) { 285 | ex.printStackTrace(); 286 | } catch (OpenXML4JException | XmlException ex) { 287 | ex.printStackTrace(); 288 | } 289 | } 290 | } 291 | -------------------------------------------------------------------------------- /Chapter08/Positions.java: -------------------------------------------------------------------------------- 1 | package packt; 
2 | 3 | class Positions { 4 | int sentence; 5 | int position; 6 | 7 | Positions(int sentence, int position) { 8 | this.sentence = sentence; 9 | this.position = position; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /Chapter08/StopWords.java: -------------------------------------------------------------------------------- 1 | package packt; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileReader; 5 | import java.io.IOException; 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | import java.util.HashSet; 9 | import java.util.Iterator; 10 | import opennlp.tools.tokenize.WhitespaceTokenizer; 11 | 12 | public class StopWords { 13 | 14 | private String[] defaultStopWords = {"i", "a", "about", "an", "are", "as", "at", 15 | "be", "by", "com", "for", "from", "how", "in", "is", "it", "of", "on", 16 | "or", "that", "the", "this", "to", "was", "what", "when", "where", 17 | "who", "will", "with"}; 18 | 19 | private static HashSet stopWords = new HashSet(); 20 | 21 | public StopWords() { 22 | stopWords.addAll(Arrays.asList(defaultStopWords)); 23 | } 24 | 25 | public StopWords(String fileName) { 26 | try { 27 | BufferedReader bufferedreader 28 | = new BufferedReader(new FileReader(fileName)); 29 | String line = null; 30 | while ((line = bufferedreader.readLine()) != null) { 31 | // line = bufferedreader.readLine(); 32 | System.out.println("---Adding: [" + line + "]" + (int)line.charAt(0)); 33 | stopWords.add(line); 34 | } 35 | } catch (IOException ex) { 36 | ex.printStackTrace(); 37 | } 38 | } 39 | 40 | public void addStopWord(String word) { 41 | stopWords.add(word); 42 | } 43 | 44 | public String[] removeStopWords(String[] words) { 45 | ArrayList tokens = new ArrayList(Arrays.asList(words)); 46 | for (int i = 0; i < tokens.size(); i++) { 47 | // System.out.println(stopWords.contains(tokens.get(i)) + " " + tokens.get(i)); 48 | if (stopWords.contains(tokens.get(i))) { 49 | tokens.remove(i); 50 | } 51 
| } 52 | return (String[]) tokens.toArray(new String[tokens.size()]); 53 | } 54 | 55 | public String removeStopWords(String words) { 56 | String arr[] = WhitespaceTokenizer.INSTANCE.tokenize(words); 57 | StringBuilder sb = new StringBuilder(); 58 | // ArrayList tokens = new ArrayList(Arrays.asList(arr)); 59 | for (int i = 0; i < arr.length; i++) { 60 | // System.out.println(tokens.get(i) + "-"); 61 | if (stopWords.contains(arr[i])) { 62 | // tokens.remove(i); 63 | // System.out.println("Removing: [" + arr[i] + "]"); 64 | } else { 65 | sb.append(arr[i]+" "); 66 | } 67 | } 68 | return sb.toString(); 69 | } 70 | 71 | public void displayStopWords() { 72 | Iterator iterator = stopWords.iterator(); 73 | while (iterator.hasNext()) { 74 | System.out.print("[" + iterator.next() + "] "); 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /Chapter08/Word.java: -------------------------------------------------------------------------------- 1 | package packt; 2 | 3 | import java.util.ArrayList; 4 | 5 | public class Word { 6 | private String word; 7 | private final ArrayList positions; 8 | 9 | public Word() { 10 | this.positions = new ArrayList(); 11 | } 12 | 13 | public void addWord(String word, int sentence, int position) { 14 | this.word = word; 15 | Positions counts = new Positions(sentence, position); 16 | positions.add(counts); 17 | } 18 | 19 | public ArrayList getPositions() { 20 | return positions; 21 | } 22 | 23 | public String getWord() { 24 | return word; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /Chapter09/TestMallet.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 
5 | */ 6 | package chapter9; 7 | 8 | /** 9 | * 10 | * @author ashish 11 | */ 12 | 13 | class MyData{ 14 | 15 | } 16 | public class TestMallet { 17 | public static void main(String args[]){ 18 | 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /Chapter10/CoreferenceDemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package chapter10; 7 | 8 | import edu.stanford.nlp.coref.CorefCoreAnnotations.CorefChainAnnotation; 9 | import edu.stanford.nlp.pipeline.Annotation; 10 | import edu.stanford.nlp.coref.data.CorefChain; 11 | import edu.stanford.nlp.coref.data.CorefChain.CorefMention; 12 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 13 | import java.io.File; 14 | import java.util.Iterator; 15 | import java.util.List; 16 | import java.util.Map; 17 | import java.util.Properties; 18 | import java.util.Set; 19 | import static org.jdom2.filter.Filters.document; 20 | 21 | /** 22 | * 23 | * @author ashish 24 | */ 25 | public class CoreferenceDemo { 26 | private static String getResourcePath(){ 27 | File currDir = new File("."); 28 | String path = currDir .getAbsolutePath(); 29 | path = path.substring(0, path.length()-2); 30 | System.out.println(path); 31 | String resourcePath = path + File.separator + "src/chapter10/"; 32 | return resourcePath; 33 | } 34 | public static void main(String args[]){ 35 | String sentence = "He took his cash and she took her change " 36 | + "and together they bought their lunch."; 37 | Properties props = new Properties(); 38 | props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); 39 | StanfordCoreNLP pipeline = new StanfordCoreNLP(props); 40 | Annotation annotation = new Annotation(sentence); 41 | pipeline.annotate(annotation); 
42 | // Map corefChainMap = annotation.get(CorefChainAnnotation.class); 43 | 44 | Map corefChainMap = annotation.get(CorefChainAnnotation.class); 45 | Set set = corefChainMap.keySet(); 46 | Iterator setIterator = set.iterator(); 47 | while(setIterator.hasNext()) { 48 | CorefChain corefChain = corefChainMap.get(setIterator.next()); 49 | System.out.println("CorefChain: " + corefChain); 50 | System.out.print("ClusterId: " + corefChain.getChainID()); 51 | CorefMention mention = corefChain.getRepresentativeMention(); 52 | System.out.println(" CorefMention: " + mention 53 | + " Span: [" + mention.mentionSpan + "]"); 54 | 55 | List mentionList = 56 | corefChain.getMentionsInTextualOrder(); 57 | Iterator mentionIterator = 58 | mentionList.iterator(); 59 | while(mentionIterator.hasNext()) { 60 | CorefMention cfm = mentionIterator.next(); 61 | System.out.println("tMention: " + cfm 62 | + " Span: [" + mention.mentionSpan + "]"); 63 | System.out.print("tMention Mention Type: " 64 | + cfm.mentionType + " Gender: " + cfm.gender); 65 | System.out.println(" Start: " + cfm.startIndex 66 | + " End: " + cfm.endIndex); 67 | } 68 | System.out.println(); 69 | } 70 | 71 | 72 | 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /Chapter10/DemoParsing.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 
5 | */ 6 | package chapter10; 7 | 8 | import java.io.File; 9 | import java.io.FileInputStream; 10 | import java.io.FileNotFoundException; 11 | import java.io.IOException; 12 | import java.io.InputStream; 13 | import java.util.logging.Level; 14 | import java.util.logging.Logger; 15 | import opennlp.tools.cmdline.parser.ParserTool; 16 | import opennlp.tools.parser.Parse; 17 | import opennlp.tools.parser.Parser; 18 | import opennlp.tools.parser.ParserFactory; 19 | import opennlp.tools.parser.ParserModel; 20 | 21 | /** 22 | * 23 | * @author ashish 24 | */ 25 | public class DemoParsing { 26 | private static String getResourcePath(){ 27 | File currDir = new File("."); 28 | String path = currDir .getAbsolutePath(); 29 | path = path.substring(0, path.length()-2); 30 | System.out.println(path); 31 | String resourcePath = path + File.separator + "src/chapter10/"; 32 | return resourcePath; 33 | } 34 | public static void main(String args[]){ 35 | String fileLocation = getResourcePath() + "en-parser-chunking.bin"; 36 | try { 37 | InputStream modelInputStream = new FileInputStream(fileLocation); 38 | ParserModel model = new ParserModel(modelInputStream); 39 | Parser parser = ParserFactory.create(model); 40 | String sentence = "The cow jumped over the moon"; 41 | Parse parses[] = ParserTool.parseLine(sentence, parser, 3); 42 | for(Parse parse : parses) { 43 | parse.show(); 44 | System.out.println("Probability: " + parse.getProb()); 45 | parse.showCodeTree(); 46 | 47 | Parse children[] = parse.getChildren(); 48 | for (Parse parseElement : children) { 49 | System.out.println(parseElement.getText()); 50 | System.out.println(parseElement.getType()); 51 | Parse tags[] = parseElement.getTagNodes(); 52 | System.out.println("Tags"); 53 | for (Parse tag : tags) { 54 | System.out.println("[" + tag + "]" 55 | + " type: " + tag.getType() 56 | + " Probability: " + tag.getProb() 57 | + " Label: " + tag.getLabel()); 58 | } 59 | } 60 | } 61 | 62 | } catch (FileNotFoundException ex) { 63 | 
ex.printStackTrace(); 64 | } catch (IOException ex) { 65 | ex.printStackTrace(); 66 | } 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /Chapter10/President.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package chapter10; 7 | 8 | import edu.stanford.nlp.trees.TypedDependency; 9 | import java.io.BufferedReader; 10 | import java.io.File; 11 | import java.io.FileReader; 12 | import java.io.IOException; 13 | import java.util.ArrayList; 14 | import java.util.List; 15 | import opennlp.tools.tokenize.SimpleTokenizer; 16 | 17 | /** 18 | * 19 | * @author ashish 20 | */ 21 | public class President { 22 | private String name; 23 | private int start; 24 | private int end; 25 | 26 | public President(){ 27 | 28 | } 29 | public President(String name, int start, int end) { 30 | this.name = name; 31 | this.start = start; 32 | this.end = end; 33 | } 34 | 35 | public String getName() { 36 | return name; 37 | } 38 | 39 | public int getStart() { 40 | return start; 41 | } 42 | 43 | public int getEnd() { 44 | return end; 45 | } 46 | 47 | 48 | private static int getOrder(String position) { 49 | String tmp = ""; 50 | int i = 0; 51 | while (Character.isDigit(position.charAt(i))) { 52 | tmp += position.charAt(i++); 53 | } 54 | return Integer.parseInt(tmp); 55 | } 56 | 57 | public void processWhoQuestion(List tdl) { 58 | List list = createPresidentList(); 59 | for (TypedDependency dependency : tdl) { 60 | if ("president".equalsIgnoreCase( 61 | dependency.gov().originalText()) 62 | && "adjectival modifier".equals( 63 | dependency.reln().getLongName())) { 64 | String positionText = 65 | dependency.dep().originalText(); 66 | int position = getOrder(positionText)-1; 67 | System.out.println("The 
president is " 68 | + list.get(position).getName()); 69 | } 70 | } 71 | } 72 | private static String getResourcePath(){ 73 | File currDir = new File("."); 74 | String path = currDir .getAbsolutePath(); 75 | path = path.substring(0, path.length()-2); 76 | System.out.println(path); 77 | String resourcePath = path + File.separator + "src/chapter10/"; 78 | return resourcePath; 79 | } 80 | public List createPresidentList() { 81 | ArrayList list = new ArrayList<>(); 82 | String line = null; 83 | try (FileReader reader = new FileReader(getResourcePath() + "PresidentList"); 84 | BufferedReader br = new BufferedReader(reader)) { 85 | while ((line = br.readLine()) != null) { 86 | System.out.println(">>>>>>>>>>>>." + line); 87 | SimpleTokenizer simpleTokenizer = 88 | SimpleTokenizer.INSTANCE; 89 | String tokens[] = simpleTokenizer.tokenize(line); 90 | for(int i=0;i words = SentenceUtils.toCoreLabelList(sentenceArray); 44 | Tree parseTree = lexicalizedParser.apply(words); 45 | parseTree.pennPrint(); 46 | 47 | TreePrint treePrint = new TreePrint("typedDependenciesCollapsed"); 48 | treePrint.printTree(parseTree); 49 | 50 | 51 | String sentence = "The cow jumped over the moon."; 52 | TokenizerFactory tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); 53 | Tokenizer tokenizer = tokenizerFactory.getTokenizer(new StringReader(sentence)); 54 | List wordList = tokenizer.tokenize(); 55 | parseTree = lexicalizedParser.apply(wordList); 56 | TreebankLanguagePack tlp = lexicalizedParser.treebankLanguagePack(); 57 | GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); 58 | GrammaticalStructure gs = gsf.newGrammaticalStructure(parseTree); 59 | List tdl = gs.typedDependenciesCCprocessed(); 60 | System.out.println(tdl); 61 | 62 | for(TypedDependency dependency : tdl) { 63 | System.out.println("Governor Word: [" + dependency.gov() 64 | + "] Relation: [" + dependency.reln().getLongName() 65 | + "] Dependent Word: [" + dependency.dep() + "]"); 66 | } 67 | 68 
| } 69 | } 70 | -------------------------------------------------------------------------------- /Chapter10/WordDependencyDemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package chapter10; 7 | 8 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser; 9 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser; 10 | import edu.stanford.nlp.process.CoreLabelTokenFactory; 11 | import edu.stanford.nlp.process.PTBTokenizer; 12 | import edu.stanford.nlp.process.Tokenizer; 13 | import edu.stanford.nlp.process.TokenizerFactory; 14 | import edu.stanford.nlp.ling.CoreLabel; 15 | import edu.stanford.nlp.trees.GrammaticalStructure; 16 | import edu.stanford.nlp.trees.GrammaticalStructureFactory; 17 | import edu.stanford.nlp.trees.Tree; 18 | import edu.stanford.nlp.trees.TreebankLanguagePack; 19 | import edu.stanford.nlp.trees.TypedDependency; 20 | import java.io.File; 21 | import java.io.StringReader; 22 | import java.util.List; 23 | 24 | /** 25 | * 26 | * @author ashish 27 | */ 28 | public class WordDependencyDemo { 29 | private static String getResourcePath(){ 30 | File currDir = new File("."); 31 | String path = currDir .getAbsolutePath(); 32 | path = path.substring(0, path.length()-2); 33 | System.out.println(path); 34 | String resourcePath = path + File.separator + "src/chapter10/"; 35 | return resourcePath; 36 | } 37 | 38 | public static void main(String args[]){ 39 | String question = "Who is the 32nd president of the United States?"; 40 | String parseModel = getResourcePath() + "englishPCFG.ser.gz"; 41 | 42 | LexicalizedParser lexicalizedParser = LexicalizedParser.loadModel(parseModel); 43 | TokenizerFactory tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); 44 | Tokenizer tokenizer = 
tokenizerFactory.getTokenizer(new StringReader(question)); 45 | List wordList = tokenizer.tokenize(); 46 | Tree parseTree = lexicalizedParser.apply(wordList); 47 | 48 | TreebankLanguagePack tlp = lexicalizedParser.treebankLanguagePack(); 49 | GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); 50 | GrammaticalStructure gs = gsf.newGrammaticalStructure(parseTree); 51 | List tdl = gs.typedDependenciesCCprocessed(); 52 | System.out.println(tdl); 53 | for (TypedDependency dependency : tdl) { 54 | System.out.println("Governor Word: [" + dependency.gov() 55 | + "] Relation: [" + dependency.reln().getLongName() 56 | + "] Dependent Word: [" + dependency.dep() + "]"); 57 | } 58 | 59 | for (TypedDependency dependency : tdl) { 60 | if ("nominal subject".equals( dependency.reln().getLongName()) 61 | && "who".equalsIgnoreCase( dependency.gov().originalText())) { 62 | President p = new President(); 63 | p.processWhoQuestion(tdl); 64 | } 65 | } 66 | 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /Chapter11/HTMLExtractorDemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 
5 | */ 6 | package chapter11; 7 | 8 | import de.l3s.boilerpipe.BoilerpipeProcessingException; 9 | import de.l3s.boilerpipe.document.TextDocument; 10 | import de.l3s.boilerpipe.sax.BoilerpipeSAXInput; 11 | import de.l3s.boilerpipe.sax.HTMLFetcher; 12 | import java.net.MalformedURLException; 13 | import java.net.URL; 14 | import java.util.logging.Level; 15 | import java.util.logging.Logger; 16 | import de.l3s.boilerpipe.sax.HTMLDocument; 17 | import java.io.IOException; 18 | import org.xml.sax.InputSource; 19 | import org.xml.sax.SAXException; 20 | 21 | /** 22 | * 23 | * @author ashish 24 | */ 25 | public class HTMLExtractorDemo { 26 | public static void main(String args[]){ 27 | try{ 28 | URL url = new URL("https://en.wikipedia.org/wiki/Berlin"); 29 | HTMLDocument htmldoc = HTMLFetcher.fetch(url); 30 | InputSource is = htmldoc.toInputSource(); 31 | TextDocument document = new BoilerpipeSAXInput(is).getTextDocument(); 32 | System.out.println(document.getText(true, true)); 33 | } catch (MalformedURLException ex) { 34 | System.out.println(ex); 35 | Logger.getLogger(HTMLExtractorDemo.class.getName()).log(Level.SEVERE, null, ex); 36 | } catch (IOException ex) { 37 | System.out.println(ex); 38 | Logger.getLogger(HTMLExtractorDemo.class.getName()).log(Level.SEVERE, null, ex); 39 | } catch (SAXException | BoilerpipeProcessingException ex) { 40 | System.out.println(ex); 41 | Logger.getLogger(HTMLExtractorDemo.class.getName()).log(Level.SEVERE, null, ex); 42 | } 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /Chapter11/PDFExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 
5 | */ 6 | package chapter11; 7 | 8 | import java.io.File; 9 | import java.io.IOException; 10 | import org.apache.pdfbox.pdmodel.PDDocument; 11 | import org.apache.pdfbox.text.PDFTextStripper; 12 | 13 | /** 14 | * 15 | * @author ashish 16 | */ 17 | public class PDFExtractor { 18 | private static String getResourcePath(){ 19 | File currDir = new File("."); 20 | String path = currDir .getAbsolutePath(); 21 | path = path.substring(0, path.length()-2); 22 | String resourcePath = path + File.separator + "src/chapter11/TestDocument.pdf"; 23 | return resourcePath; 24 | } 25 | public static void main(String args[]){ 26 | try{ 27 | File file = new File(getResourcePath()); 28 | PDDocument pd = PDDocument.load(file); 29 | PDFTextStripper stripper = new PDFTextStripper(); 30 | String text= stripper.getText(pd); 31 | System.out.println(text); 32 | } 33 | catch(IOException ex){ 34 | System.out.println(ex); 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /Chapter11/PipelineDemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 
5 | */ 6 | package chapter11; 7 | 8 | import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; 9 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; 10 | import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; 11 | import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; 12 | import edu.stanford.nlp.ling.CoreLabel; 13 | import edu.stanford.nlp.pipeline.Annotation; 14 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 15 | import edu.stanford.nlp.util.CoreMap; 16 | import java.util.ArrayList; 17 | import java.util.List; 18 | import java.util.Properties; 19 | import java.util.Set; 20 | 21 | /** 22 | * 23 | * @author ashish 24 | */ 25 | public class PipelineDemo { 26 | public static void main(String args[]){ 27 | String text = "The robber took the cash and ran"; 28 | Properties props = new Properties(); 29 | props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); 30 | StanfordCoreNLP pipeline = new StanfordCoreNLP(props); 31 | Annotation annotation = new Annotation(text); 32 | 33 | System.out.println("Before annotate method executed "); 34 | Set> annotationSet = annotation.keySet(); 35 | for(Class c : annotationSet) { 36 | System.out.println("\tClass: " + c.getName()); 37 | } 38 | 39 | pipeline.annotate(annotation); 40 | 41 | System.out.println("After annotate method executed "); 42 | annotationSet = annotation.keySet(); 43 | for(Class c : annotationSet) { 44 | System.out.println("\tClass: " + c.getName()); 45 | } 46 | List sentences = annotation.get(SentencesAnnotation.class); 47 | for (CoreMap sentence : sentences) { 48 | for (CoreLabel token: sentence.get(TokensAnnotation.class)) { 49 | String word = token.get(TextAnnotation.class); 50 | String pos = token.get(PartOfSpeechAnnotation.class); 51 | System.out.println(word); 52 | System.out.println(pos); 53 | } 54 | } 55 | 56 | 57 | 58 | Annotation annotation1 = new Annotation("The robber took the cash and ran."); 59 | Annotation annotation2 = new 
Annotation("The policeman chased him down the street."); 60 | Annotation annotation3 = new Annotation("A passerby, watching the action, tripped the thief " 61 | + "as he passed by."); 62 | Annotation annotation4 = new Annotation("They all lived happily ever after, except for the thief " 63 | + "of course."); 64 | 65 | ArrayList list = new ArrayList(); 66 | list.add(annotation1); 67 | list.add(annotation2); 68 | list.add(annotation3); 69 | list.add(annotation4); 70 | Iterable iterable = list; 71 | pipeline.annotate(iterable); 72 | List sentences1 = annotation2.get(SentencesAnnotation.class); 73 | 74 | for (CoreMap sentence : sentences1) { 75 | for (CoreLabel token : 76 | sentence.get(TokensAnnotation.class)) { 77 | String word = token.get(TextAnnotation.class); 78 | String pos = token.get(PartOfSpeechAnnotation.class); 79 | System.out.println("Word: " + word + " POS Tag: " + pos); 80 | } 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /Chapter11/SearchText.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 
5 | */ 6 | package chapter11; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.File; 10 | import java.io.FileInputStream; 11 | import java.io.FileNotFoundException; 12 | import java.io.FileReader; 13 | import java.io.IOException; 14 | import java.io.InputStream; 15 | import java.util.ArrayList; 16 | import java.util.HashMap; 17 | import java.util.logging.Level; 18 | import java.util.logging.Logger; 19 | import opennlp.tools.sentdetect.SentenceDetectorME; 20 | import opennlp.tools.sentdetect.SentenceModel; 21 | import opennlp.tools.tokenize.WhitespaceTokenizer; 22 | 23 | /** 24 | * 25 | * @author ashish 26 | */ 27 | class StopWords 28 | { 29 | 30 | public String removeStopWords(String words) { 31 | String arr[] = WhitespaceTokenizer.INSTANCE.tokenize(words); 32 | StringBuilder sb = new StringBuilder(); 33 | for (int i = 0; i < arr.length; i++) { 34 | if (words.contains(arr[i])) { 35 | // Do nothing 36 | } else { 37 | sb.append(arr[i]+" "); 38 | } 39 | } 40 | return sb.toString(); 41 | } 42 | } 43 | public class SearchText { 44 | private static String getResourcePath(){ 45 | File currDir = new File("."); 46 | String path = currDir .getAbsolutePath(); 47 | path = path.substring(0, path.length()-2); 48 | String resourcePath = path + File.separator + "src/chapter11/"; 49 | return resourcePath; 50 | } 51 | 52 | public static void main(String args[]){ 53 | try { 54 | InputStream is = new FileInputStream(new File(getResourcePath() + "en-sent.bin")); 55 | FileReader fr = new FileReader(getResourcePath() + "pg164.txt"); 56 | BufferedReader br = new BufferedReader(fr); 57 | System.out.println(getResourcePath() + "en-sent.bin"); 58 | SentenceModel model = new SentenceModel(is); 59 | SentenceDetectorME detector = new SentenceDetectorME(model); 60 | 61 | String line; 62 | StringBuilder sb = new StringBuilder(); 63 | while((line = br.readLine())!=null){ 64 | sb.append(line + " "); 65 | } 66 | String sentences[] = detector.sentDetect(sb.toString()); 67 | for (int i = 0; i 
< sentences.length; i++) { 68 | sentences[i] = sentences[i].toLowerCase(); 69 | } 70 | 71 | // StopWords stopWords = new StopWords("stop-words_english_2_en.txt"); 72 | // for (int i = 0; i < sentences.length; i++) { 73 | // sentences[i] = stopWords.removeStopWords(sentences[i]); 74 | // } 75 | 76 | HashMap wordMap = new HashMap(); 77 | for (int sentenceIndex = 0; sentenceIndex < sentences.length; sentenceIndex++) { 78 | String words[] = WhitespaceTokenizer.INSTANCE.tokenize(sentences[sentenceIndex]); 79 | Word word; 80 | for (int wordIndex = 0; 81 | wordIndex < words.length; wordIndex++) { 82 | String newWord = words[wordIndex]; 83 | if (wordMap.containsKey(newWord)) { 84 | word = wordMap.remove(newWord); 85 | } else { 86 | word = new Word(); 87 | } 88 | word.addWord(newWord, sentenceIndex, wordIndex); 89 | wordMap.put(newWord, word); 90 | } 91 | // for(String k : wordMap.keySet()){ 92 | // System.out.println(k); 93 | // } 94 | Word sword = wordMap.get("sea"); 95 | ArrayList positions = sword.getPositions(); 96 | for (Positions position : positions) { 97 | System.out.println(sword.getWord() + " is found at line " 98 | + position.sentence + ", word " 99 | + position.position); 100 | } 101 | } 102 | 103 | } catch (FileNotFoundException ex) { 104 | Logger.getLogger(SearchText.class.getName()).log(Level.SEVERE, null, ex); 105 | } catch (IOException ex) { 106 | Logger.getLogger(SearchText.class.getName()).log(Level.SEVERE, null, ex); 107 | } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /Chapter11/TikaDemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 
5 | */ 6 | package chapter11; 7 | 8 | import java.io.File; 9 | import java.io.IOException; 10 | import java.io.InputStream; 11 | import java.util.Arrays; 12 | import java.util.logging.Level; 13 | import java.util.logging.Logger; 14 | import org.apache.tika.Tika; 15 | import org.apache.tika.exception.TikaException; 16 | import org.apache.tika.metadata.Metadata; 17 | 18 | /** 19 | * 20 | * @author ashish 21 | */ 22 | public class TikaDemo { 23 | private static String getResourcePath(){ 24 | File currDir = new File("."); 25 | String path = currDir .getAbsolutePath(); 26 | path = path.substring(0, path.length()-2); 27 | String resourcePath = path + File.separator + "src/chapter11/TestDocument.pdf"; 28 | return resourcePath; 29 | } 30 | public static void main(String args[]){ 31 | Tika tika = new Tika(); 32 | try{ 33 | File file = new File(getResourcePath()); 34 | String filetype = tika.detect(file); 35 | 36 | System.out.println(filetype); 37 | System.out.println(tika.parseToString(file)); 38 | 39 | 40 | } catch (IOException ex) { 41 | Logger.getLogger(TikaDemo.class.getName()).log(Level.SEVERE, null, ex); 42 | } catch (TikaException ex) { 43 | Logger.getLogger(TikaDemo.class.getName()).log(Level.SEVERE, null, ex); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /Chapter11/Word.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 
5 | */ 6 | package chapter11; 7 | 8 | import java.util.ArrayList; 9 | 10 | /** 11 | * 12 | * @author ashish 13 | */ 14 | class Positions { 15 | int sentence; 16 | int position; 17 | 18 | Positions(int sentence, int position) { 19 | this.sentence = sentence; 20 | this.position = position; 21 | } 22 | } 23 | 24 | 25 | public class Word { 26 | private String word; 27 | private final ArrayList positions; 28 | 29 | public Word() { 30 | this.positions = new ArrayList(); 31 | } 32 | 33 | public void addWord(String word, int sentence, 34 | int position) { 35 | this.word = word; 36 | Positions counts = new Positions(sentence, position); 37 | positions.add(counts); 38 | } 39 | 40 | public ArrayList getPositions() { 41 | return positions; 42 | } 43 | 44 | public String getWord() { 45 | return word; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /Chapter11/WordDocExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 
5 | */ 6 | package chapter11; 7 | 8 | import java.io.File; 9 | import java.io.FileInputStream; 10 | import java.io.FileNotFoundException; 11 | import java.io.IOException; 12 | import java.util.logging.Level; 13 | import java.util.logging.Logger; 14 | import org.apache.poi.POITextExtractor; 15 | import org.apache.poi.POIXMLProperties.CoreProperties; 16 | import org.apache.poi.POIXMLProperties.ExtendedProperties; 17 | import org.apache.poi.POIXMLPropertiesTextExtractor; 18 | import org.apache.poi.extractor.ExtractorFactory; 19 | import org.apache.poi.openxml4j.exceptions.OpenXML4JException; 20 | import org.apache.poi.xwpf.usermodel.XWPFDocument; 21 | import org.apache.xmlbeans.XmlException; 22 | 23 | /** 24 | * 25 | * @author ashish 26 | */ 27 | public class WordDocExtractor { 28 | private static String getResourcePath(){ 29 | File currDir = new File("."); 30 | String path = currDir .getAbsolutePath(); 31 | path = path.substring(0, path.length()-2); 32 | String resourcePath = path + File.separator + "src/chapter11/TestDocument.docx"; 33 | return resourcePath; 34 | } 35 | public static void main(String args[]){ 36 | try { 37 | FileInputStream fis = new FileInputStream(getResourcePath()); 38 | POITextExtractor textExtractor = ExtractorFactory.createExtractor(fis); 39 | System.out.println(textExtractor.getText()); 40 | 41 | POITextExtractor metaExtractor = textExtractor.getMetadataTextExtractor(); 42 | System.out.println(metaExtractor.getText()); 43 | fis = new FileInputStream(getResourcePath()); 44 | POIXMLPropertiesTextExtractor properties = new POIXMLPropertiesTextExtractor(new XWPFDocument(fis)); 45 | CoreProperties coreProperties = properties.getCoreProperties(); 46 | System.out.println(properties.getCorePropertiesText()); 47 | 48 | ExtendedProperties extendedProperties = properties.getExtendedProperties(); 49 | System.out.println(properties.getExtendedPropertiesText()); 50 | 51 | } catch (FileNotFoundException ex) { 52 | 
Logger.getLogger(WordDocExtractor.class.getName()).log(Level.SEVERE, null, ex); 53 | } catch (IOException ex) { 54 | Logger.getLogger(WordDocExtractor.class.getName()).log(Level.SEVERE, null, ex); 55 | } catch (OpenXML4JException | XmlException ex) { 56 | Logger.getLogger(WordDocExtractor.class.getName()).log(Level.SEVERE, null, ex); 57 | } 58 | 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /Chapter12/GenerateAIML.java: -------------------------------------------------------------------------------- 1 | package chapter12; 2 | 3 | /* 4 | * To change this license header, choose License Headers in Project Properties. 5 | * To change this template file, choose Tools | Templates 6 | * and open the template in the editor. 7 | */ 8 | 9 | 10 | import java.io.File; 11 | import org.alicebot.ab.Bot; 12 | import org.alicebot.ab.MagicBooleans; 13 | 14 | /** 15 | * 16 | * @author ashish 17 | */ 18 | public class GenerateAIML { 19 | 20 | private static final boolean TRACE_MODE = false; 21 | static String botName = "appointment"; 22 | 23 | public static void main(String[] args) { 24 | try { 25 | 26 | String resourcesPath = getResourcesPath(); 27 | System.out.println(resourcesPath); 28 | MagicBooleans.trace_mode = TRACE_MODE; 29 | Bot bot = new Bot("appointment", resourcesPath); 30 | 31 | bot.writeAIMLFiles(); 32 | 33 | } catch (Exception e) { 34 | e.printStackTrace(); 35 | } 36 | } 37 | 38 | private static String getResourcesPath(){ 39 | File currDir = new File("."); 40 | String path = currDir .getAbsolutePath(); 41 | path = path.substring(0, path.length()-2); 42 | System.out.println(path); 43 | String resourcePath = path + File.separator + "src/chapter12/mybot"; 44 | return resourcePath; 45 | } 46 | } 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /Chapter12/Mychatbotdemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To 
change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package chapter12; 7 | 8 | import java.io.File; 9 | import org.alicebot.ab.Bot; 10 | import org.alicebot.ab.Chat; 11 | import org.alicebot.ab.History; 12 | import org.alicebot.ab.MagicBooleans; 13 | import org.alicebot.ab.MagicStrings; 14 | import org.alicebot.ab.utils.IOUtils; 15 | 16 | /** 17 | * 18 | * @author ashish 19 | */ 20 | class MyChat{ 21 | 22 | } 23 | 24 | public class Mychatbotdemo { 25 | private static final boolean TRACE_MODE = false; 26 | static String botName = "appointment"; 27 | private static String getResourcePath(){ 28 | File currDir = new File("."); 29 | String path = currDir .getAbsolutePath(); 30 | path = path.substring(0, path.length()-2); 31 | System.out.println(path); 32 | String resourcePath = path + File.separator + "src/chapter12/mybot"; 33 | return resourcePath; 34 | } 35 | public static void main(String args[]){ 36 | try 37 | { 38 | String resourcePath = getResourcePath(); 39 | System.out.println(resourcePath); 40 | MagicBooleans.trace_mode = TRACE_MODE; 41 | Bot bot = new Bot(botName, resourcePath); 42 | Chat chatSession = new Chat(bot); 43 | bot.brain.nodeStats(); 44 | String textLine = ""; 45 | System.out.println("Robot : Hello, I am your appointment scheduler May i know your name"); 46 | while(true){ 47 | 48 | System.out.println("Human : "); 49 | textLine = IOUtils.readInputTextLine(); 50 | if ((textLine==null) || (textLine.length()<1)){ 51 | textLine = MagicStrings.null_input; 52 | } 53 | if(textLine.equals("q")){ 54 | System.exit(0); 55 | } else if (textLine.equals("wq")){ 56 | bot.writeQuit(); 57 | } else { 58 | String request = textLine; 59 | if(MagicBooleans.trace_mode) 60 | System.out.println("STATE=" + request + ":THAT" + ((History)chatSession.thatHistory.get(0)).get(0) + ": Topic" + chatSession.predicates.get("topic")); 61 | String 
response = chatSession.multisentenceRespond(request); 62 | while(response.contains("<")) 63 | response = response.replace("<", "<"); 64 | while(response.contains(">")) 65 | response = response.replace(">", ">"); 66 | System.out.println("Robot : " + response); 67 | } 68 | } 69 | } 70 | catch(Exception e){ 71 | e.printStackTrace(); 72 | } 73 | 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /Chapter12/Test.java: -------------------------------------------------------------------------------- 1 | ///* 2 | // * To change this license header, choose License Headers in Project Properties. 3 | // * To change this template file, choose Tools | Templates 4 | // * and open the template in the editor. 5 | // */ 6 | //package chapter12; 7 | // 8 | ///** 9 | // * 10 | // * @author ashish 11 | // */ 12 | //import org.alicebot.ab.*; 13 | // 14 | //import java.io.*; 15 | //import java.util.HashMap; 16 | // 17 | // 18 | public class Test { 19 | // 20 | public static void main (String[] args) { 21 | // 22 | // 23 | // 24 | //// MagicStrings.setRootPath(); 25 | // 26 | // AIMLProcessor.extension = new PCAIMLProcessorExtension(); 27 | // mainFunction(args); 28 | // } 29 | // public static void mainFunction (String[] args) { 30 | // String botName = "alice2"; 31 | // MagicBooleans.jp_tokenize = false; 32 | // MagicBooleans.trace_mode = true; 33 | // String action="chat"; 34 | // System.out.println(MagicStrings.program_name_version); 35 | // for (String s : args) { 36 | // //System.out.println(s); 37 | // String[] splitArg = s.split("="); 38 | // if (splitArg.length >= 2) { 39 | // String option = splitArg[0]; 40 | // String value = splitArg[1]; 41 | // //if (MagicBooleans.trace_mode) System.out.println(option+"='"+value+"'"); 42 | // if (option.equals("bot")) botName = value; 43 | // if (option.equals("action")) action = value; 44 | // if (option.equals("trace")) { 45 | // if (value.equals("true")) MagicBooleans.trace_mode = true; 46 
| // else MagicBooleans.trace_mode = false; 47 | // } 48 | // if (option.equals("morph")) { 49 | // if (value.equals("true")) MagicBooleans.jp_tokenize = true; 50 | // else { 51 | // MagicBooleans.jp_tokenize = false; 52 | // } 53 | // } 54 | // } 55 | // } 56 | // if (MagicBooleans.trace_mode) System.out.println("Working Directory = " + MagicStrings.root_path); 57 | // Graphmaster.enableShortCuts = true; 58 | // //Timer timer = new Timer(); 59 | // Bot bot = new Bot(botName, MagicStrings.root_path, action); // 60 | // //EnglishNumberToWords.makeSetMap(bot); 61 | // //getGloss(bot, "c:/ab/data/wn30-lfs/wne-2006-12-06.xml"); 62 | // if (MagicBooleans.make_verbs_sets_maps) Verbs.makeVerbSetsMaps(bot); 63 | // //bot.preProcessor.normalizeFile("c:/ab/data/log2.txt", "c:/ab/data/log2normal.txt"); 64 | // //System.exit(0); 65 | // if (bot.brain.getCategories().size() < MagicNumbers.brain_print_size) bot.brain.printgraph(); 66 | // if (MagicBooleans.trace_mode) System.out.println("Action = '"+action+"'"); 67 | // if (action.equals("chat") || action.equals("chat-app")) { 68 | // boolean doWrites = ! 
action.equals("chat-app"); 69 | // TestAB.testChat(bot, doWrites, MagicBooleans.trace_mode); 70 | // } 71 | // //else if (action.equals("test")) testSuite(bot, MagicStrings.root_path+"/data/find.txt"); 72 | // else if (action.equals("ab")) TestAB.testAB(bot, TestAB.sample_file); 73 | // else if (action.equals("aiml2csv") || action.equals("csv2aiml")) convert(bot, action); 74 | // else if (action.equals("abwq")){AB ab = new AB(bot, TestAB.sample_file); ab.abwq();} 75 | // else if (action.equals("test")) { TestAB.runTests(bot, MagicBooleans.trace_mode); } 76 | // else if (action.equals("shadow")) { MagicBooleans.trace_mode = false; bot.shadowChecker();} 77 | // else if (action.equals("iqtest")) { ChatTest ct = new ChatTest(bot); 78 | // try { 79 | // ct.testMultisentenceRespond(); 80 | // } 81 | // catch (Exception ex) { ex.printStackTrace(); } 82 | // } 83 | // else System.out.println("Unrecognized action "+action); 84 | // } 85 | // public static void convert(Bot bot, String action) { 86 | // if (action.equals("aiml2csv")) bot.writeAIMLIFFiles(); 87 | // else if (action.equals("csv2aiml")) bot.writeAIMLFiles(); 88 | // } 89 | // 90 | // 91 | // public static void getGloss (Bot bot, String filename) { 92 | // System.out.println("getGloss"); 93 | // try{ 94 | // // Open the file that is the first 95 | // // command line parameter 96 | // File file = new File(filename); 97 | // if (file.exists()) { 98 | // FileInputStream fstream = new FileInputStream(filename); 99 | // // Get the object 100 | // getGlossFromInputStream(bot, fstream); 101 | // fstream.close(); 102 | // } 103 | // }catch (Exception e){//Catch exception if any 104 | // System.err.println("Error: " + e.getMessage()); 105 | // } 106 | // } 107 | // public static void getGlossFromInputStream (Bot bot, InputStream in) { 108 | // System.out.println("getGlossFromInputStream"); 109 | // BufferedReader br = new BufferedReader(new InputStreamReader(in)); 110 | // String strLine; 111 | // int cnt = 0; 112 | // 
int filecnt = 0; 113 | // HashMap def = new HashMap(); 114 | // try { 115 | // //Read File Line By Line 116 | // String word; String gloss; 117 | // word = null; 118 | // gloss = null; 119 | // while ((strLine = br.readLine()) != null) { 120 | // 121 | // if (strLine.contains("")) { 132 | // gloss = strLine.replaceAll("",""); 133 | // gloss = gloss.replaceAll("",""); 134 | // gloss = gloss.trim(); 135 | // System.out.println(gloss); 136 | // 137 | // } 138 | // 139 | // 140 | // if (word != null && gloss != null) { 141 | // word = word.toLowerCase().trim(); 142 | // if (gloss.length() > 2) gloss = gloss.substring(0, 1).toUpperCase()+gloss.substring(1, gloss.length()); 143 | // String definition; 144 | // if (def.keySet().contains(word)) { 145 | // definition = def.get(word); 146 | // definition = definition+"; "+gloss; 147 | // } 148 | // else definition = gloss; 149 | // def.put(word, definition); 150 | // word = null; 151 | // gloss = null; 152 | // } 153 | // } 154 | // Category d = new Category(0,"WNDEF *","*","*","unknown","wndefs"+filecnt+".aiml"); 155 | // bot.brain.addCategory(d); 156 | // for (String x : def.keySet()) { 157 | // word = x; 158 | // gloss = def.get(word)+"."; 159 | // cnt++; 160 | // if (cnt%5000==0) filecnt++; 161 | // 162 | // Category c = new Category(0,"WNDEF "+word,"*","*",gloss,"wndefs"+filecnt+".aiml"); 163 | // System.out.println(cnt+" "+filecnt+" "+c.inputThatTopic()+":"+c.getTemplate()+":"+c.getFilename()); 164 | // Nodemapper node; 165 | // if ((node = bot.brain.findNode(c)) != null) node.category.setTemplate(node.category.getTemplate()+","+gloss); 166 | // bot.brain.addCategory(c); 167 | // 168 | // 169 | // } 170 | // } catch (Exception ex) { 171 | // ex.printStackTrace(); 172 | // } 173 | // } 174 | // 175 | // public static void sraixCache (String filename, Chat chatSession) { 176 | // int limit = 1000; 177 | // try { 178 | // FileInputStream fstream = new FileInputStream(filename); 179 | // // Get the object 180 | // 
BufferedReader br = new BufferedReader(new InputStreamReader(fstream)); 181 | // String strLine; 182 | // //Read File Line By Line 183 | // int count = 0; 184 | // while ((strLine = br.readLine()) != null && count++ < limit) { 185 | // System.out.println("\n\nHuman: " + strLine); 186 | // 187 | // String response = chatSession.multisentenceRespond(strLine); 188 | // System.out.println("\nVasudev : " + response); 189 | // } 190 | // } catch (Exception ex) { 191 | // ex.printStackTrace(); 192 | // } 193 | } 194 | 195 | 196 | } 197 | -------------------------------------------------------------------------------- /Chapter12/TestClass.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package chapter12; 7 | 8 | 9 | import java.io.File; 10 | import org.alicebot.ab.MagicBooleans; 11 | import org.alicebot.ab.Bot; 12 | import org.alicebot.ab.Chat; 13 | import org.alicebot.ab.History; 14 | import org.alicebot.ab.MagicStrings; 15 | import org.alicebot.ab.utils.IOUtils; 16 | 17 | /** 18 | * 19 | * @author ashish 20 | */ 21 | public class TestClass { 22 | private static final boolean TRACE_MODE = false; 23 | static String botName = "super"; 24 | 25 | private static String getResourcePath(){ 26 | File currDir = new File("."); 27 | String path = currDir .getAbsolutePath(); 28 | path = path.substring(0, path.length()-2); 29 | System.out.println(path); 30 | String resourcePath = path + File.separator + "src/chapter12" + File.separator + "resources"; 31 | return resourcePath; 32 | } 33 | public static void main(String args[]) 34 | { 35 | try 36 | { 37 | String resourcePath = getResourcePath(); 38 | System.out.println(resourcePath); 39 | MagicBooleans.trace_mode = TRACE_MODE; 40 | Bot bot = new Bot("super", resourcePath); 41 | Chat chatSession = 
new Chat(bot); 42 | bot.brain.nodeStats(); 43 | String textLine = ""; 44 | 45 | while(true){ 46 | System.out.println("Human : "); 47 | textLine = IOUtils.readInputTextLine(); 48 | if ((textLine==null) || (textLine.length()<1)){ 49 | textLine = MagicStrings.null_input; 50 | } 51 | if(textLine.equals("q")){ 52 | System.exit(0); 53 | } else if (textLine.equals("wq")){ 54 | bot.writeQuit(); 55 | } else { 56 | String request = textLine; 57 | if(MagicBooleans.trace_mode) 58 | System.out.println("STATE=" + request + ":THAT" + ((History)chatSession.thatHistory.get(0)).get(0) + ": Topic" + chatSession.predicates.get("topic")); 59 | String response = chatSession.multisentenceRespond(request); 60 | while(response.contains("<")) 61 | response = response.replace("<", "<"); 62 | while(response.contains(">")) 63 | response = response.replace(">", ">"); 64 | System.out.println("Robot : " + response); 65 | } 66 | } 67 | } 68 | catch(Exception e){ 69 | e.printStackTrace(); 70 | } 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /Chapter12/mybot.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter12/mybot.zip -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, 
subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural Language Processing with Java Second Edition 2 | 3 | Book Name 4 | 5 | This is the code repository for [Natural Language Processing with Java Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/natural-language-processing-java-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781789347999), published by Packt. 6 | 7 | **Techniques for building machine learning and neural network models for NLP** 8 | 9 | ## What is this book about? 10 | Natural Language Processing (NLP) allows you to take any sentence and identify patterns, special names, company names, and more. The second edition of Natural Language Processing with Java teaches you how to perform language analysis with the help of Java libraries, while constantly gaining insights from the outcomes. 
11 | 12 | This book covers the following exciting features: 13 | * Understand basic NLP tasks and how they relate to one another 14 | * Discover and use the available tokenization engines 15 | * Apply search techniques to find people, as well as things, within a document 16 | * Construct solutions to identify parts of speech within sentences 17 | * Use parsers to extract relationships between elements of a document 18 | 19 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1788993497) today! 20 | 21 | https://www.packtpub.com/ 23 | 24 | 25 | ## Instructions and Navigations 26 | All of the code is organized into folders. For example, Chapter02. 27 | 28 | The code will look like the following: 29 | ``` 30 | System.out.println(tagger.tagString("AFAIK she H8 cth!")); 31 | System.out.println(tagger.tagString( 32 | "BTW had a GR8 tym at the party BBIAM.")); 33 | ``` 34 | 35 | **Following is what you need for this book:** 36 | Natural Language Processing with Java is for you if you are a data analyst, data scientist, or machine learning engineer who wants to extract information from a language using Java. Knowledge of Java programming is needed, while a basic understanding of statistics will be useful but not mandatory. 37 | 38 | With the following software and hardware list you can run all code files present in the book (Chapter 1-12). 
39 | 40 | ### Software and Hardware List 41 | 42 | | Chapter | Software required | OS required | 43 | | -------- | ------------------------------------| -----------------------------------| 44 | | 1 | OpenNLP |Windows, Mac OS X, and Linux (Any) | 45 | | | Stanford CoreNLP | | 46 | | | LingPipe | | 47 | | | Stanford Tagger | | 48 | | | | | 49 | | 2 | OpenNLP Models | Windows, Mac OS X, and Linux (Any) | 50 | | 3 | LingPipe Models | Windows, Mac OS X, and Linux (Any) | 51 | | 4 | OpenNLP Models | Windows, Mac OS X, and Linux (Any) | 52 | | 5 | Gate Twitter Model | | 53 | | | LingPipe POS Models | Windows, Mac OS X, and Linux (Any) | 54 | | 6 | Stanford Classifier | Windows, Mac OS X, and Linux (Any) | 55 | | 8-12 | Boilerpipe | | 56 | | | POI | | 57 | | | PDFBox | Windows, Mac OS X, and Linux (Any) | 58 | 59 | 60 | 61 | 62 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](http://www.packtpub.com/sites/default/files/downloads/NaturalLanguageProcessingwithJavaSecondEdition_ColorImages.pdf). 63 | 64 | ### Related products 65 | * Java Deep Learning Projects [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/java-deep-learning-projects?utm_source=github&utm_medium=repository&utm_campaign=9781788997454) [[Amazon]](https://www.amazon.com/dp/178899745X) 66 | 67 | * Hands-On Natural Language Processing with Python [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/hands-natural-language-processing-python?utm_source=github&utm_medium=repository&utm_campaign=9781789139495) [[Amazon]](https://www.amazon.com/dp/178913949X) 68 | 69 | ## Get to Know the Authors 70 | **Richard M. Reese** 71 | has worked in both industry and academia. For 17 years, he worked in the telephone and aerospace industries, serving in several capacities, including research and development, software development, supervision, and training. He currently teaches at Tarleton State University. 
Richard has written several Java books and a C Pointer book. He uses a concise and easy-to-follow approach to teaching about topics. His Java books have addressed EJB 3.1, updates to Java 7 and 8, certification, functional programming, jMonkeyEngine, and natural language processing. 72 | 73 | **AshishSingh Bhatia** 74 | is a learner, reader, seeker, and developer at core. He has over 10 years of IT experience in different domains, including banking, ERP, and education. He is persistently passionate about Python, Java, R, and web and mobile development. He is always ready to explore new technologies. 75 | 76 | 77 | ## Other books by the authors 78 | * [Java for Data Science](https://www.packtpub.com/big-data-and-business-intelligence/java-data-science?utm_source=github&utm_medium=repository&utm_campaign=9781785280115) 79 | * [Machine Learning with R Cookbook, Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-r-cookbook-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781785280115) 80 | 81 | ### Suggestions and Feedback 82 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions. 83 | --------------------------------------------------------------------------------