├── Chapter01
    └── Chapter1.java
├── Chapter02
    ├── Chapter2.java
    ├── MyTokenizerFactory.java
    ├── PorterStemmer.java
    ├── StemmingLemaEx.java
    ├── StopWords.java
    └── Test.java
├── Chapter03
    ├── Chapter3.java
    ├── SBDDemo.java
    ├── XMLProcessingDemo.java
    └── XMLTest.xml
├── Chapter04
    ├── NERDemo.java
    ├── build.xml
    ├── build
    │   └── classes
    │   │   ├── .netbeans_automatic_build
    │   │   ├── .netbeans_update_resources
    │   │   └── packt
    │   │       ├── Chapter4.class
    │   │       ├── DictionaryChunker.class
    │   │       ├── EmailRegexChunker.class
    │   │       ├── RunChunker.class
    │   │       ├── TimeRegexChunker.class
    │   │       └── TrainEntities.class
    ├── en-ner-all.train
    ├── en-ner-person.eval
    ├── en-ner-person.train
    ├── manifest.mf
    ├── modelFile
    ├── nbproject
    │   ├── build-impl.xml
    │   ├── genfiles.properties
    │   ├── private
    │   │   ├── config.properties
    │   │   ├── private.properties
    │   │   └── private.xml
    │   ├── project.properties
    │   └── project.xml
    ├── old
    │   ├── Chapter4.java
    │   └── TimeRegexChunker.java
    └── src
    │   └── packt
    │       ├── Chapter4.java
    │       ├── DictionaryChunker.java
    │       ├── EmailRegexChunker.java
    │       ├── RunChunker.java
    │       ├── TimeRegexChunker.java
    │       └── TrainEntities.java
├── Chapter05
    └── Chapter5.java
├── Chapter06
    ├── Chapter6.java
    ├── GloveExample.java
    ├── NGramTest.java
    ├── box.prop
    ├── box.test
    ├── box.train
    ├── en-animal.model
    └── en-animal.train
├── Chapter07
    ├── Chapter7.java
    └── President.java
├── Chapter08
    ├── Chapter8.java
    ├── Positions.java
    ├── StopWords.java
    └── Word.java
├── Chapter09
    └── TestMallet.java
├── Chapter10
    ├── CoreferenceDemo.java
    ├── DemoParsing.java
    ├── President.java
    ├── StanfordLexicalDemo.java
    └── WordDependencyDemo.java
├── Chapter11
    ├── HTMLExtractorDemo.java
    ├── PDFExtractor.java
    ├── PipelineDemo.java
    ├── SearchText.java
    ├── TikaDemo.java
    ├── Word.java
    └── WordDocExtractor.java
├── Chapter12
    ├── GenerateAIML.java
    ├── Mychatbotdemo.java
    ├── Test.java
    ├── TestClass.java
    └── mybot.zip
├── LICENSE
└── README.md


/Chapter01/Chapter1.java:
--------------------------------------------------------------------------------
  1 | package packt;
  2 | 
  3 | import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
  4 | import edu.stanford.nlp.ling.HasWord;
  5 | import edu.stanford.nlp.pipeline.Annotation;
  6 | import edu.stanford.nlp.pipeline.StanfordCoreNLP;
  7 | import edu.stanford.nlp.process.CoreLabelTokenFactory;
  8 | import edu.stanford.nlp.process.DocumentPreprocessor;
  9 | import edu.stanford.nlp.process.PTBTokenizer;
 10 | import java.io.File;
 11 | import java.io.FileInputStream;
 12 | import java.io.FileNotFoundException;
 13 | import java.io.IOException;
 14 | import java.io.InputStream;
 15 | import java.io.Reader;
 16 | import java.io.StringReader;
 17 | import java.util.ArrayList;
 18 | import java.util.Arrays;
 19 | import java.util.LinkedList;
 20 | import java.util.List;
 21 | import java.util.Properties;
 22 | import opennlp.tools.cmdline.postag.POSModelLoader;
 23 | import opennlp.tools.namefind.NameFinderME;
 24 | import opennlp.tools.namefind.TokenNameFinderModel;
 25 | import opennlp.tools.postag.POSModel;
 26 | import opennlp.tools.postag.POSSample;
 27 | import opennlp.tools.postag.POSTaggerME;
 28 | import opennlp.tools.tokenize.SimpleTokenizer;
 29 | import opennlp.tools.tokenize.Tokenizer;
 30 | import opennlp.tools.tokenize.TokenizerME;
 31 | import opennlp.tools.tokenize.TokenizerModel;
 32 | import opennlp.tools.tokenize.WhitespaceTokenizer;
 33 | import opennlp.tools.util.Span;
 34 | 
 35 | public class Chapter1 {
 36 | 
 37 |     public static void main(String[] args) {
 38 | //        apacheOpenNLPExample();
 39 | //        stanfordNLPExample();
 40 |         lingpipeExamples();
 41 | //        findingPartsOfText();
 42 | //        findingSentences();
 43 | //        findingPeopleAndThings();
 44 | //        nameFinderExample();        
 45 | //        detectingPartsOfSpeechExample();
 46 | //        extractingRelationshipsExample();
 47 |     }
 48 | 
 49 |     private static void apacheOpenNLPExample() {
 50 |         try (InputStream is = new FileInputStream(
 51 |                 new File("C:\\OpenNLP Models", "en-token.bin"))) {
 52 |             TokenizerModel model = new TokenizerModel(is);
 53 |             Tokenizer tokenizer = new TokenizerME(model);
 54 |             String tokens[] = tokenizer.tokenize("He lives at 1511 W. Randolph.");
 55 |             for (String a : tokens) {
 56 |                 System.out.print("[" + a + "] ");
 57 |             }
 58 |             System.out.println();
 59 | 
 60 |         } catch (FileNotFoundException ex) {
 61 |             ex.printStackTrace();
 62 |         } catch (IOException ex) {
 63 |             ex.printStackTrace();
 64 |         }
 65 | 
 66 |     }
 67 | 
 68 |     private static void stanfordNLPExample() {
 69 |         PTBTokenizer ptb = new PTBTokenizer(
 70 |                 new StringReader("He lives at 1511 W. Randolph."),
 71 |                 new CoreLabelTokenFactory(), null);
 72 |         while (ptb.hasNext()) {
 73 |             System.out.println(ptb.next());
 74 |         }
 75 | 
 76 |     }
 77 | 
 78 |     private static void lingpipeExamples() {
 79 |         List<String> tokenList = new ArrayList<>();
 80 |         List<String> whiteList = new ArrayList<>();
 81 |         String text = "A sample sentence processed \nby \tthe "
 82 |                 + "LingPipe tokenizer.";
 83 |         com.aliasi.tokenizer.Tokenizer tokenizer = IndoEuropeanTokenizerFactory.INSTANCE.
 84 |                 tokenizer(text.toCharArray(), 0, text.length());
 85 |         tokenizer.tokenize(tokenList, whiteList);
 86 |         for (String element : tokenList) {
 87 |             System.out.print(element + " ");
 88 |         }
 89 |         System.out.println();
 90 | 
 91 |     }
 92 | 
 93 |     private static void splitMethodDemonstration() {
 94 |         String text = "Mr. Smith went to 123 Washington avenue.";
 95 |         String tokens[] = text.split("\\s+");
 96 |         for (String token : tokens) {
 97 |             System.out.println(token);
 98 |         }
 99 |     }
100 | 
101 |     private static void findingPartsOfText() {
102 |         String text = "Mr. Smith went to 123 Washington avenue.";
103 |         String tokens[] = text.split("\\s+");
104 |         for (String token : tokens) {
105 |             System.out.println(token);
106 |         }
107 |     }
108 | 
109 |     private static void findingSentences() {
110 |         String paragraph = "The first sentence. The second sentence.";
111 |         Reader reader = new StringReader(paragraph);
112 |         DocumentPreprocessor documentPreprocessor
113 |                 = new DocumentPreprocessor(reader);
114 |         List<String> sentenceList = new LinkedList<String>();
115 |         for (List<HasWord> element : documentPreprocessor) {
116 |             StringBuilder sentence = new StringBuilder();
117 |             List<HasWord> hasWordList = element;
118 |             for (HasWord token : hasWordList) {
119 |                 sentence.append(token).append(" ");
120 |             }
121 |             sentenceList.add(sentence.toString());
122 |         }
123 |         for (String sentence : sentenceList) {
124 |             System.out.println(sentence);
125 |         }
126 | 
127 |     }
128 | 
129 |     private static void findingPeopleAndThings() {
130 |         String text = "Mr. Smith went to 123 Washington avenue.";
131 |         String target = "Washington";
132 |         int index = text.indexOf(target);
133 |         System.out.println(index);
134 |     }
135 | 
136 |     private static void nameFinderExample() {
137 |         try {
138 |             String[] sentences = {
139 |                 "Tim was a good neighbor. Perhaps not as good a Bob "
140 |                 + "Haywood, but still pretty good. Of course Mr. Adam "
141 |                 + "took the cake!"};
142 |             Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
143 |             TokenNameFinderModel model = new TokenNameFinderModel(new File(
144 |                     "C:\\OpenNLP Models", "en-ner-person.bin"));
145 |             NameFinderME finder = new NameFinderME(model);
146 | 
147 |             for (String sentence : sentences) {
148 |                 // Split the sentence into tokens
149 |                 String[] tokens = tokenizer.tokenize(sentence);
150 | 
151 |                 // Find the names in the tokens and return Span objects
152 |                 Span[] nameSpans = finder.find(tokens);
153 | 
154 |                 // Print the names extracted from the tokens using the Span data
155 |                 System.out.println(Arrays.toString(
156 |                         Span.spansToStrings(nameSpans, tokens)));
157 |             }
158 |         } catch (IOException ex) {
159 |             ex.printStackTrace();
160 |         }
161 |     }
162 | 
163 |     private static void detectingPartsOfSpeechExample() {
164 |         String sentence = "POS processing is useful for enhancing the "
165 |                 + "quality of data sent to other elements of a pipeline.";
166 | 
167 |         POSModel model = new POSModelLoader()
168 |                 .load(new File("C:/Current Books/NLP and Java/Models/", "en-pos-maxent.bin"));
169 |         POSTaggerME tagger = new POSTaggerME(model);
170 | 
171 |         String tokens[] = WhitespaceTokenizer.INSTANCE
172 |                 .tokenize(sentence);
173 |         String[] tags = tagger.tag(tokens);
174 | 
175 |         POSSample sample = new POSSample(tokens, tags);
176 |         String posTokens[] = sample.getSentence();
177 |         String posTags[] = sample.getTags();
178 |         for (int i = 0; i < posTokens.length; i++) {
179 |             System.out.print(posTokens[i] + " - " + posTags[i]);
180 |         }
181 |         System.out.println();
182 | 
183 |         for (int i = 0; i < tokens.length; i++) {
184 |             System.out.print(tokens[i] + "[" + tags[i] + "] ");
185 |         }
186 |     }
187 | 
188 |     private static void extractingRelationshipsExample() {
189 |         Properties properties = new Properties();
190 |         properties.put("annotators", "tokenize, ssplit, parse");
191 |         StanfordCoreNLP pipeline = new StanfordCoreNLP(properties);
192 |         Annotation annotation = new Annotation(
193 |                 "The meaning and purpose of life is plain to see.");
194 |         pipeline.annotate(annotation);
195 |         pipeline.prettyPrint(annotation, System.out);
196 | 
197 |     }
198 | }
199 | 


--------------------------------------------------------------------------------
/Chapter02/MyTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package packt;
2 | 
3 | import opennlp.tools.tokenize.TokenizerFactory;
4 | 
5 | public class MyTokenizerFactory extends TokenizerFactory {
6 |     
7 | }
8 | 


--------------------------------------------------------------------------------
/Chapter02/PorterStemmer.java:
--------------------------------------------------------------------------------
  1 | package packt;
  2 | 
  3 | //package opennlp.tools.stemmer;
  4 | 
  5 | import opennlp.tools.stemmer.Stemmer;
  6 | 
  7 | 
  8 | /*
  9 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 10 |  * contributor license agreements.  See the NOTICE file distributed with
 11 |  * this work for additional information regarding copyright ownership.
 12 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 13 |  * (the "License"); you may not use this file except in compliance with
 14 |  * the License. You may obtain a copy of the License at
 15 |  *
 16 |  *     http://www.apache.org/licenses/LICENSE-2.0
 17 |  *
 18 |  * Unless required by applicable law or agreed to in writing, software
 19 |  * distributed under the License is distributed on an "AS IS" BASIS,
 20 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 21 |  * See the License for the specific language governing permissions and
 22 |  * limitations under the License.
 23 |  */
 24 | 
 25 | /*
 26 | 
 27 |    Porter stemmer in Java. The original paper is in
 28 | 
 29 |        Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
 30 |        no. 3, pp 130-137,
 31 | 
 32 |    See also http://www.tartarus.org/~martin/PorterStemmer/index.html
 33 | 
 34 |    Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
 35 |    Tthe words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
 36 |    is then out outside the bounds of b.
 37 | 
 38 |    Similarly,
 39 | 
 40 |    Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
 41 |    'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
 42 |    b[j] is then outside the bounds of b.
 43 | 
 44 |    Release 3.
 45 | 
 46 |    [ This version is derived from Release 3, modified by Brian Goetz to
 47 |      optimize for fewer object creations.  ]
 48 | 
 49 | */
 50 | 
 51 | /**
 52 |  *
 53 |  * Stemmer, implementing the Porter Stemming Algorithm
 54 |  *
 55 |  * The Stemmer class transforms a word into its root form.  The input
 56 |  * word can be provided a character at time (by calling add()), or at once
 57 |  * by calling one of the various stem(something) methods.
 58 |  */
 59 | 
 60 | class PorterStemmer implements Stemmer {
 61 |   private char[] b;
 62 |   private int i,    /* offset into b */
 63 |     j, k, k0;
 64 |   private boolean dirty = false;
 65 |   private static final int INC = 50;
 66 |   
 67 |   public PorterStemmer() {
 68 |     b = new char[INC];
 69 |     i = 0;
 70 |   }
 71 | 
 72 |   /**
 73 |    * reset() resets the stemmer so it can stem another word.  If you invoke
 74 |    * the stemmer by calling add(char) and then stem(), you must call reset()
 75 |    * before starting another word.
 76 |    */
 77 |   public void reset() { i = 0; dirty = false; }
 78 | 
 79 |   /**
 80 |    * Add a character to the word being stemmed.  When you are finished
 81 |    * adding characters, you can call stem(void) to process the word.
 82 |    */
 83 |   public void add(char ch) {
 84 |     if (b.length == i) {
 85 |       
 86 |       char[] new_b = new char[i+INC];
 87 |       for (int c = 0; c < i; c++) new_b[c] = b[c]; {
 88 |         b = new_b;
 89 |       }
 90 |     }
 91 |     b[i++] = ch;
 92 |   }
 93 | 
 94 |   /**
 95 |    * After a word has been stemmed, it can be retrieved by toString(),
 96 |    * or a reference to the internal buffer can be retrieved by getResultBuffer
 97 |    * and getResultLength (which is generally more efficient.)
 98 |    */
 99 |   @Override
100 |   public String toString() { return new String(b,0,i); }
101 | 
102 |   /**
103 |    * Returns the length of the word resulting from the stemming process.
104 |    */
105 |   public int getResultLength() { return i; }
106 | 
107 |   /**
108 |    * Returns a reference to a character buffer containing the results of
109 |    * the stemming process.  You also need to consult getResultLength()
110 |    * to determine the length of the result.
111 |    */
112 |   public char[] getResultBuffer() { return b; }
113 | 
114 |   /* cons(i) is true <=> b[i] is a consonant. */
115 | 
116 |   private final boolean cons(int i) {
117 |     switch (b[i]) {
118 |     case 'a': case 'e': case 'i': case 'o': case 'u':
119 |       return false;
120 |     case 'y':
121 |       return (i==k0) ? true : !cons(i-1);
122 |     default:
123 |       return true;
124 |     }
125 |   }
126 | 
127 |   /* m() measures the number of consonant sequences between k0 and j. if c is
128 |      a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
129 |      presence,
130 | 
131 |           <c><v>       gives 0
132 |           <c>vc<v>     gives 1
133 |           <c>vcvc<v>   gives 2
134 |           <c>vcvcvc<v> gives 3
135 |           ....
136 |   */
137 | 
138 |   private final int m() {
139 |     int n = 0;
140 |     int i = k0;
141 |     while(true) {
142 |       if (i > j)
143 |         return n;
144 |       if (! cons(i))
145 |         break;
146 |       i++;
147 |     }
148 |     i++;
149 |     while(true) {
150 |       while(true) {
151 |         if (i > j)
152 |           return n;
153 |         if (cons(i))
154 |           break;
155 |         i++;
156 |       }
157 |       i++;
158 |       n++;
159 |       while(true) {
160 |         if (i > j)
161 |           return n;
162 |         if (! cons(i))
163 |           break;
164 |         i++;
165 |       }
166 |       i++;
167 |     }
168 |   }
169 | 
170 |   /* vowelinstem() is true <=> k0,...j contains a vowel */
171 | 
172 |   private final boolean vowelinstem() {
173 |     int i;
174 |     for (i = k0; i <= j; i++)
175 |       if (! cons(i))
176 |         return true;
177 |     return false;
178 |   }
179 | 
180 |   /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
181 | 
182 |   private final boolean doublec(int j) {
183 |     if (j < k0+1)
184 |       return false;
185 |     if (b[j] != b[j-1])
186 |       return false;
187 |     return cons(j);
188 |   }
189 | 
190 |   /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
191 |      and also if the second c is not w,x or y. this is used when trying to
192 |      restore an e at the end of a short word. e.g.
193 | 
194 |           cav(e), lov(e), hop(e), crim(e), but
195 |           snow, box, tray.
196 | 
197 |   */
198 | 
199 |   private final boolean cvc(int i) {
200 |     if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2))
201 |       return false;
202 |     else {
203 |       int ch = b[i];
204 |       if (ch == 'w' || ch == 'x' || ch == 'y') return false;
205 |     }
206 |     return true;
207 |   }
208 | 
209 |   private final boolean ends(String s) {
210 |     int l = s.length();
211 |     int o = k-l+1;
212 |     if (o < k0)
213 |       return false;
214 |     for (int i = 0; i < l; i++)
215 |       if (b[o+i] != s.charAt(i))
216 |         return false;
217 |     j = k-l;
218 |     return true;
219 |   }
220 | 
221 |   /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
222 |      k. */
223 | 
224 |   void setto(String s) {
225 |     int l = s.length();
226 |     int o = j+1;
227 |     for (int i = 0; i < l; i++)
228 |       b[o+i] = s.charAt(i);
229 |     k = j+l;
230 |     dirty = true;
231 |   }
232 | 
233 |   /* r(s) is used further down. */
234 | 
235 |   void r(String s) { if (m() > 0) setto(s); }
236 | 
237 |   /* step1() gets rid of plurals and -ed or -ing. e.g.
238 | 
239 |            caresses  ->  caress
240 |            ponies    ->  poni
241 |            ties      ->  ti
242 |            caress    ->  caress
243 |            cats      ->  cat
244 | 
245 |            feed      ->  feed
246 |            agreed    ->  agree
247 |            disabled  ->  disable
248 | 
249 |            matting   ->  mat
250 |            mating    ->  mate
251 |            meeting   ->  meet
252 |            milling   ->  mill
253 |            messing   ->  mess
254 | 
255 |            meetings  ->  meet
256 | 
257 |   */
258 | 
259 |   private final void step1() {
260 |     if (b[k] == 's') {
261 |       if (ends("sses")) k -= 2;
262 |       else if (ends("ies")) setto("i");
263 |       else if (b[k-1] != 's') k--;
264 |     }
265 |     if (ends("eed")) {
266 |       if (m() > 0)
267 |         k--;
268 |     }
269 |     else if ((ends("ed") || ends("ing")) && vowelinstem()) {
270 |       k = j;
271 |       if (ends("at")) setto("ate");
272 |       else if (ends("bl")) setto("ble");
273 |       else if (ends("iz")) setto("ize");
274 |       else if (doublec(k)) {
275 |         int ch = b[k--];
276 |         if (ch == 'l' || ch == 's' || ch == 'z')
277 |           k++;
278 |       }
279 |       else if (m() == 1 && cvc(k))
280 |         setto("e");
281 |     }
282 |   }
283 | 
284 |   /* step2() turns terminal y to i when there is another vowel in the stem. */
285 | 
286 |   private final void step2() {
287 |     if (ends("y") && vowelinstem()) {
288 |       b[k] = 'i';
289 |       dirty = true;
290 |     }
291 |   }
292 | 
293 |   /* step3() maps double suffices to single ones. so -ization ( = -ize plus
294 |      -ation) maps to -ize etc. note that the string before the suffix must give
295 |      m() > 0. */
296 | 
297 |   private final void step3() {
298 |     if (k == k0) return; /* For Bug 1 */
299 |     switch (b[k-1]) {
300 |     case 'a':
301 |       if (ends("ational")) { r("ate"); break; }
302 |       if (ends("tional")) { r("tion"); break; }
303 |       break;
304 |     case 'c':
305 |       if (ends("enci")) { r("ence"); break; }
306 |       if (ends("anci")) { r("ance"); break; }
307 |       break;
308 |     case 'e':
309 |       if (ends("izer")) { r("ize"); break; }
310 |       break;
311 |     case 'l':
312 |       if (ends("bli")) { r("ble"); break; }
313 |       if (ends("alli")) { r("al"); break; }
314 |       if (ends("entli")) { r("ent"); break; }
315 |       if (ends("eli")) { r("e"); break; }
316 |       if (ends("ousli")) { r("ous"); break; }
317 |       break;
318 |     case 'o':
319 |       if (ends("ization")) { r("ize"); break; }
320 |       if (ends("ation")) { r("ate"); break; }
321 |       if (ends("ator")) { r("ate"); break; }
322 |       break;
323 |     case 's':
324 |       if (ends("alism")) { r("al"); break; }
325 |       if (ends("iveness")) { r("ive"); break; }
326 |       if (ends("fulness")) { r("ful"); break; }
327 |       if (ends("ousness")) { r("ous"); break; }
328 |       break;
329 |     case 't':
330 |       if (ends("aliti")) { r("al"); break; }
331 |       if (ends("iviti")) { r("ive"); break; }
332 |       if (ends("biliti")) { r("ble"); break; }
333 |       break;
334 |     case 'g':
335 |       if (ends("logi")) { r("log"); break; }
336 |     }
337 |   }
338 | 
339 |   /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
340 | 
341 |   private final void step4() {
342 |     switch (b[k]) {
343 |     case 'e':
344 |       if (ends("icate")) { r("ic"); break; }
345 |       if (ends("ative")) { r(""); break; }
346 |       if (ends("alize")) { r("al"); break; }
347 |       break;
348 |     case 'i':
349 |       if (ends("iciti")) { r("ic"); break; }
350 |       break;
351 |     case 'l':
352 |       if (ends("ical")) { r("ic"); break; }
353 |       if (ends("ful")) { r(""); break; }
354 |       break;
355 |     case 's':
356 |       if (ends("ness")) { r(""); break; }
357 |       break;
358 |     }
359 |   }
360 | 
361 |   /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
362 | 
363 |   private final void step5() {
364 |     if (k == k0) return; /* for Bug 1 */
365 |     switch (b[k-1]) {
366 |     case 'a':
367 |       if (ends("al")) break;
368 |       return;
369 |     case 'c':
370 |       if (ends("ance")) break;
371 |       if (ends("ence")) break;
372 |       return;
373 |     case 'e':
374 |       if (ends("er")) break; return;
375 |     case 'i':
376 |       if (ends("ic")) break; return;
377 |     case 'l':
378 |       if (ends("able")) break;
379 |       if (ends("ible")) break; return;
380 |     case 'n':
381 |       if (ends("ant")) break;
382 |       if (ends("ement")) break;
383 |       if (ends("ment")) break;
384 |       /* element etc. not stripped before the m */
385 |       if (ends("ent")) break;
386 |       return;
387 |     case 'o':
388 |       if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
389 |       /* j >= 0 fixes Bug 2 */
390 |       if (ends("ou")) break;
391 |       return;
392 |       /* takes care of -ous */
393 |     case 's':
394 |       if (ends("ism")) break;
395 |       return;
396 |     case 't':
397 |       if (ends("ate")) break;
398 |       if (ends("iti")) break;
399 |       return;
400 |     case 'u':
401 |       if (ends("ous")) break;
402 |       return;
403 |     case 'v':
404 |       if (ends("ive")) break;
405 |       return;
406 |     case 'z':
407 |       if (ends("ize")) break;
408 |       return;
409 |     default:
410 |       return;
411 |     }
412 |     if (m() > 1)
413 |       k = j;
414 |   }
415 | 
416 |   /* step6() removes a final -e if m() > 1. */
417 | 
418 |   private final void step6() {
419 |     j = k;
420 |     if (b[k] == 'e') {
421 |       int a = m();
422 |       if (a > 1 || a == 1 && !cvc(k-1))
423 |         k--;
424 |     }
425 |     if (b[k] == 'l' && doublec(k) && m() > 1)
426 |       k--;
427 |   }
428 | 
429 | 
430 |   /**
431 |    * Stem a word provided as a String.  Returns the result as a String.
432 |    */
433 |   public String stem(String s) {
434 |     if (stem(s.toCharArray(), s.length()))
435 |       return toString();
436 |     else
437 |       return s;
438 |   }
439 | 
440 |   /**
441 |    * Stem a word provided as a CharSequence.
442 |    * Returns the result as a CharSequence.
443 |    */
444 |   public CharSequence stem(CharSequence word) {
445 |     return stem(word.toString());
446 |   }
447 |   
448 |   /** Stem a word contained in a char[].  Returns true if the stemming process
449 |    * resulted in a word different from the input.  You can retrieve the
450 |    * result with getResultLength()/getResultBuffer() or toString().
451 |    */
452 |   public boolean stem(char[] word) {
453 |     return stem(word, word.length);
454 |   }
455 | 
456 |   /** Stem a word contained in a portion of a char[] array.  Returns
457 |    * true if the stemming process resulted in a word different from
458 |    * the input.  You can retrieve the result with
459 |    * getResultLength()/getResultBuffer() or toString().
460 |    */
461 |   public boolean stem(char[] wordBuffer, int offset, int wordLen) {
462 |     reset();
463 |     if (b.length < wordLen) {
464 |       b = new char[wordLen - offset];
465 |     }
466 |     System.arraycopy(wordBuffer, offset, b, 0, wordLen);
467 |     i = wordLen;
468 |     return stem(0);
469 |   }
470 | 
471 |   /** Stem a word contained in a leading portion of a char[] array.
472 |    * Returns true if the stemming process resulted in a word different
473 |    * from the input.  You can retrieve the result with
474 |    * getResultLength()/getResultBuffer() or toString().
475 |    */
476 |   public boolean stem(char[] word, int wordLen) {
477 |     return stem(word, 0, wordLen);
478 |   }
479 | 
480 |   /** Stem the word placed into the Stemmer buffer through calls to add().
481 |    * Returns true if the stemming process resulted in a word different
482 |    * from the input.  You can retrieve the result with
483 |    * getResultLength()/getResultBuffer() or toString().
484 |    */
485 |   public boolean stem() {
486 |     return stem(0);
487 |   }
488 | 
489 |   public boolean stem(int i0) {
490 |     k = i - 1;
491 |     k0 = i0;
492 |     if (k > k0+1) {
493 |       step1(); step2(); step3(); step4(); step5(); step6();
494 |     }
495 |     // Also, a word is considered dirty if we lopped off letters
496 |     // Thanks to Ifigenia Vairelles for pointing this out.
497 |     if (i != k+1)
498 |       dirty = true;
499 |     i = k+1;
500 |     return dirty;
501 |   }
502 | }
503 | 
504 | 
505 | 


--------------------------------------------------------------------------------
/Chapter02/StemmingLemaEx.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter2;
 7 | 
 8 | 
 9 | import java.io.File;
10 | import java.io.FileInputStream;
11 | import java.io.IOException;
12 | import java.io.InputStream;
13 | import java.text.BreakIterator;
14 | import java.util.Locale;
15 | import java.util.regex.Matcher;
16 | import java.util.regex.Pattern;
17 | import opennlp.tools.sentdetect.SentenceDetector;
18 | import opennlp.tools.sentdetect.SentenceDetectorME;
19 | import opennlp.tools.sentdetect.SentenceModel;
20 | import opennlp.tools.stemmer.PorterStemmer;
21 | 
22 | /**
23 |  *
24 |  * @author ashish
25 |  */
26 | public class StemmingLemaEx {
27 |     public static void main(String args[]){
28 |         String words[] = {"bank", "banking", "banks", "banker", "banked", 
29 |     "bankart"};
30 |         PorterStemmer ps = new PorterStemmer();
31 |         for(String w : words){
32 |             String stem = ps.stem(w);
33 |             System.out.println("Word : " + w + " Stem : " + stem);
34 |         }
35 |         String paragraph = "When determining the end of sentences "
36 |             + "we need to consider several factors. Sentences may end with "
37 |             + "exclamation marks! Or possibly questions marks? Within "
38 |             + "sentences we may find numbers like 3.14159, abbreviations "
39 |             + "such as found in Mr. Smith, and possibly ellipses either "
40 |             + "within a sentence …, or at the end of a sentence…";
41 |         String simple = "[.?!]";
42 |         String[] splitString = (paragraph.split(simple));
43 |         for (String string : splitString) {
44 |             System.out.println(string);
45 |         }
46 |         System.out.println("-------------Using Pattern and Matcher-------------");
47 |         Pattern sentencePattern = Pattern.compile(
48 |             "# Match a sentence ending in punctuation or EOS.\n"
49 |             + "[^.!?\\s]    # First char is non-punct, non-ws\n"
50 |             + "[^.!?]*      # Greedily consume up to punctuation.\n"
51 |             + "(?:          # Group for unrolling the loop.\n"
52 |             + "  [.!?]      # (special) inner punctuation ok if\n"
53 |             + "  (?!['\"]?\\s|$)  # not followed by ws or EOS.\n"
54 |             + "  [^.!?]*    # Greedily consume up to punctuation.\n"
55 |             + ")*           # Zero or more (special normal*)\n"
56 |             + "[.!?]?       # Optional ending punctuation.\n"
57 |             + "['\"]?       # Optional closing quote.\n"
58 |             + "(?=\\s|$)",
59 |             Pattern.MULTILINE | Pattern.COMMENTS);
60 |         Matcher matcher = sentencePattern.matcher(paragraph);
61 |         while (matcher.find()) {
62 |             System.out.println(matcher.group());
63 |         }
64 |         System.out.println("-------------Using BreakIterator-------------");
65 |         BreakIterator si = BreakIterator.getSentenceInstance();
66 |         Locale cl = new Locale("en", "US");
67 |         si.setText(paragraph);
68 |         int boundary = si.first();
69 |         while(boundary!=BreakIterator.DONE){
70 |             int begin = boundary;
71 |             System.out.println(boundary + " - ");
72 |             boundary = si.next();
73 |             int end = boundary;
74 |             if(end == BreakIterator.DONE){
75 |                 break;
76 |             }
77 |             System.out.println(boundary + " [ " + paragraph.substring(begin,end) + " ] ");
78 |         }
79 |         System.out.println("-------------Using SentenceDetectorME-------------");
80 |         try{
81 |             InputStream is = new FileInputStream(new File("/home/ashish/Downloads/" + "en-sent.bin"));
82 |             SentenceModel sm = new SentenceModel(is);
83 |             SentenceDetectorME detector = new SentenceDetectorME(sm);
84 |             String sentences [] = detector.sentDetect(paragraph);
85 |             for(String s : sentences){
86 |                 System.out.println(s);
87 |             }
88 |         }
89 |         catch(IOException e){
90 |             System.out.println("Error Detected" + e);
91 |             e.printStackTrace();
92 |         }
93 |     }
94 |     
95 | }
96 | 


--------------------------------------------------------------------------------
/Chapter02/StopWords.java:
--------------------------------------------------------------------------------
 1 | package packt;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.FileReader;
 5 | import java.io.IOException;
 6 | import java.util.ArrayList;
 7 | import java.util.Arrays;
 8 | import java.util.HashSet;
 9 | import java.util.Iterator;
10 | 
11 | public class StopWords {
12 | 
13 |     private String[] defaultStopWords = {"i", "a", "about", "an", "are", "as", "at", 
14 |         "be", "by", "com", "for", "from", "how", "in", "is", "it", "of", "on", 
15 |         "or", "that", "the", "this", "to", "was", "what", "when", "where", 
16 |         "who", "will", "with"};
17 |     
18 |     private static HashSet stopWords  = new HashSet();
19 | 
20 |     public StopWords() {
21 |         stopWords.addAll(Arrays.asList(defaultStopWords));
22 |     }
23 | 
24 |     public StopWords(String fileName) {
25 |         try {
26 |             BufferedReader bufferedreader = 
27 |                     new BufferedReader(new FileReader(fileName));
28 |             while (bufferedreader.ready()) {
29 |                 stopWords.add(bufferedreader.readLine());
30 |             }
31 |         } catch (IOException ex) {
32 |             ex.printStackTrace();
33 |         }
34 |     }
35 |     
36 |     public void addStopWord(String word) {
37 |         stopWords.add(word);
38 |     }
39 | 
40 |     public String[] removeStopWords(String[] words) {
41 |         ArrayList<String> tokens = new ArrayList<String>(Arrays.asList(words));
42 |         for (int i = 0; i < tokens.size(); i++) {
43 | //            System.out.println(stopWords.contains(tokens.get(i)) + " " + tokens.get(i));
44 |             if (stopWords.contains(tokens.get(i))) {
45 |                 tokens.remove(i);
46 |             }
47 |         }
48 |         return (String[]) tokens.toArray(new String[tokens.size()]);
49 |     }
50 | 
51 |     public void displayStopWords() {
52 |         Iterator<String> iterator = stopWords.iterator();
53 |         while(iterator.hasNext()) {
54 |             System.out.print("[" + iterator.next() + "]  ");
55 |         }
56 |     }
57 | }
58 | 


--------------------------------------------------------------------------------
/Chapter02/Test.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * To change this license header, choose License Headers in Project Properties.
  3 |  * To change this template file, choose Tools | Templates
  4 |  * and open the template in the editor.
  5 |  */
  6 | package chapter2;
  7 | 
  8 | import com.aliasi.sentences.IndoEuropeanSentenceModel;
  9 | import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
 10 | import com.aliasi.tokenizer.TokenizerFactory;
 11 | import edu.stanford.nlp.ling.CoreLabel;
 12 | import edu.stanford.nlp.ling.HasWord;
 13 | import edu.stanford.nlp.pipeline.Annotation;
 14 | import edu.stanford.nlp.pipeline.StanfordCoreNLP;
 15 | import edu.stanford.nlp.process.CoreLabelTokenFactory;
 16 | import edu.stanford.nlp.process.DocumentPreprocessor;
 17 | import edu.stanford.nlp.process.DocumentProcessor;
 18 | import edu.stanford.nlp.process.PTBTokenizer;
 19 | import edu.stanford.nlp.process.WordTokenFactory;
 20 | import java.io.BufferedOutputStream;
 21 | import java.io.File;
 22 | import java.io.FileInputStream;
 23 | import java.io.IOException;
 24 | import java.io.InputStream;
 25 | import java.io.Reader;
 26 | import java.io.StreamTokenizer;
 27 | import java.io.StringReader;
 28 | import java.io.UnsupportedEncodingException;
 29 | import java.text.BreakIterator;
 30 | import java.util.ArrayList;
 31 | import java.util.Iterator;
 32 | import java.util.List;
 33 | import java.util.Properties;
 34 | import java.util.Scanner;
 35 | import java.util.StringTokenizer;
 36 | import opennlp.tools.tokenize.SimpleTokenizer;
 37 | import opennlp.tools.tokenize.TokenSample;
 38 | import opennlp.tools.tokenize.TokenSampleStream;
 39 | import opennlp.tools.tokenize.Tokenizer;
 40 | //import opennlp.tools.tokenize.TokenizerFactory;
 41 | import opennlp.tools.tokenize.TokenizerME;
 42 | import opennlp.tools.tokenize.TokenizerModel;
 43 | import opennlp.tools.tokenize.WhitespaceTokenizer;
 44 | import opennlp.tools.util.InputStreamFactory;
 45 | import opennlp.tools.util.ObjectStream;
 46 | import opennlp.tools.util.PlainTextByLineStream;
 47 | 
 48 | /**
 49 |  *
 50 |  * @author ashish
 51 |  */
 52 | public class Test {
 53 |     private static String getResourcePath(){
 54 |         File currDir = new File(".");
 55 |         String path = currDir .getAbsolutePath();
 56 |         path = path.substring(0, path.length()-2);
 57 |         System.out.println(path);
 58 |             String resourcePath = path + File.separator  + "src/chapter2/";
 59 |         return resourcePath;
 60 |     }
 61 |     public static void main(String args[]){
 62 |         Scanner s = new Scanner("Let's pause, and then reflect");
 63 |         s.useDelimiter("[,.]");
 64 |         List<String> l = new ArrayList<>();
 65 |         while(s.hasNext()){
 66 |             String token = s.next();
 67 |             l.add(token);
 68 |         }
 69 |         for(String token : l){
 70 |             System.out.println(token);
 71 |         }
 72 |         String text = "Mr. Smith went to 123 Washington avenue";
 73 |         String tokens[] = text.split("\\s+");
 74 |         for(String token: tokens){
 75 |             System.out.println(token);
 76 |         }
 77 |         BreakIterator b = BreakIterator.getWordInstance();
 78 |         text = "Let's pause, and then reflect";
 79 |         b.setText(text);
 80 |         int boundary = b.first();
 81 |         while(boundary!=BreakIterator.DONE){
 82 |             int begin = boundary;
 83 |             System.out.println(boundary);
 84 |             boundary = b.next();
 85 |             int end = boundary;
 86 |             if(end==BreakIterator.DONE){
 87 |                 break;
 88 |             }
 89 |             System.out.println(boundary + "[" + text.substring(begin,end) + "]");
 90 |         }
 91 |         
 92 |         try{
 93 |             StreamTokenizer t = new StreamTokenizer(
 94 |                     new StringReader("Let's pause, and then reflect."));
 95 |             boolean isEOF = false;
 96 |             while(!isEOF){
 97 |                 int token = t.nextToken();
 98 |                 switch(token){
 99 |                     case StreamTokenizer.TT_EOF:
100 |                         isEOF = true;
101 |                         break;
102 |                     case StreamTokenizer.TT_EOL:
103 |                         break;
104 |                     case StreamTokenizer.TT_WORD:
105 |                         System.out.println(t.sval);
106 |                         break;
107 |                     case StreamTokenizer.TT_NUMBER:
108 |                         System.out.println(t.nval);
109 |                         break;
110 |                     default:
111 |                         System.out.println((char)token);
112 |                 }
113 |             }
114 |             
115 |         }
116 |         catch(IOException e){
117 |             e.printStackTrace();
118 |         }
119 |         catch(Exception e){
120 |             e.printStackTrace();
121 |         }
122 |         
123 |         // Using OpenNLP
124 |         
125 |         String paragraph = "Let's pause, \nand then reflect.";
126 |         SimpleTokenizer simpletokenizer = SimpleTokenizer.INSTANCE;
127 |         String simpletokens[] = simpletokenizer.tokenize(paragraph);
128 |         for(String token : simpletokens){
129 |             System.out.println(token);
130 |         }
131 |         
132 |         tokens = WhitespaceTokenizer.INSTANCE.tokenize(paragraph);
133 |         for (String token : tokens) {
134 |             System.out.println(token);
135 |         }
136 |         
137 |         try
138 |         {
139 |             InputStream modelis = new FileInputStream(new File(getResourcePath() + "en-token.bin"));
140 |             TokenizerModel model = new TokenizerModel(modelis);
141 |             Tokenizer tokenizer = new TokenizerME(model);
142 |             tokens= tokenizer.tokenize(paragraph);
143 |             for (String token : tokens){
144 |                 System.out.println(token);    
145 |             }
146 |         }
147 |         catch(IOException e){
148 |             e.printStackTrace();
149 |         }
150 |         
151 |         
152 |         
153 |         PTBTokenizer ptb = new PTBTokenizer(new StringReader(paragraph), new WordTokenFactory(),null);
154 |         while(ptb.hasNext()){
155 |             System.out.println(ptb.next());
156 |         }
157 |         
158 |         CoreLabelTokenFactory ctf = new CoreLabelTokenFactory();
159 |         ptb = new PTBTokenizer(new StringReader(paragraph), ctf, "invertible=true");
160 |         while(ptb.hasNext()){
161 |            CoreLabel cl = (CoreLabel)ptb.next();
162 |             System.out.println(cl.originalText() + "(" + cl.beginPosition() + "-" + cl.endPosition() + ")" );
163 |         }
164 |         
165 |         Reader reader = new StringReader(paragraph);
166 |         DocumentPreprocessor dp = new DocumentPreprocessor(reader);
167 |         Iterator<List<HasWord>> it = dp.iterator();
168 |         while(it.hasNext()){
169 |             List<HasWord> sentence = it.next();
170 |             for(HasWord token : sentence){
171 |                 System.out.println(token);
172 |             }
173 |         }
174 |         Properties prop = new Properties();
175 |         prop.put("annonators", "tokenize, ssplit");
176 |         StanfordCoreNLP pipeline = new StanfordCoreNLP(prop);
177 | //        Annotation ann = new Annotation(paragraph);
178 |         Annotation ann = new Annotation(paragraph);
179 |         pipeline.annotate(ann);
180 |         pipeline.prettyPrint(ann, System.out);
181 |         
182 |         // LingPipe Tokenizers
183 |         char texts[] = paragraph.toCharArray();
184 |         
185 |         TokenizerFactory tfac = IndoEuropeanTokenizerFactory.INSTANCE;
186 |         com.aliasi.tokenizer.Tokenizer tokens1 = tfac.tokenizer(texts, 0, texts.length);
187 |         for(String t : tokens1){
188 |             System.out.println(t);
189 |         }
190 |         
191 |         BufferedOutputStream bos = null;
192 |         try{
193 |             ObjectStream<String> linestream = new PlainTextByLineStream((InputStreamFactory) new FileInputStream("training.train"),"UTF-8");
194 |             ObjectStream<TokenSample> samplestream = new TokenSampleStream(linestream);
195 | //            TokenizerModel model = TokenizerME.train(samplestream, factory, mlParams)
196 |             
197 |         }
198 |         catch(UnsupportedEncodingException e){
199 |             e.printStackTrace();
200 |         }
201 |         catch(IOException e){
202 |             e.printStackTrace();
203 |         }
204 |         
205 |         
206 |     }
207 |     
208 | }
209 | 


--------------------------------------------------------------------------------
/Chapter03/SBDDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter3;
 7 | 
 8 | import java.util.regex.Matcher;
 9 | import java.util.regex.Pattern;
10 | 
11 | /**
12 |  *
13 |  * @author ashish
14 |  */
15 | public class SBDDemo {
16 |     private static String paragraph = "When determining the end of sentences "
17 |     + "we need to consider several factors. Sentences may end with "
18 |     + "exclamation marks! Or possibly questions marks? Within "
19 |     + "sentences we may find numbers like 3.14159, abbreviations "
20 |     + "such as found in Mr. Smith, and possibly ellipses either "
21 |     + "within a sentence …, or at the end of a sentence…";
22 |     
23 |     public static void main(String args[]){
24 |         System.out.println("--------- Simple regex ---------");
25 |         String simple = "[.?!]";
26 |         String[] splitString = (paragraph.split(simple));
27 |         for (String string : splitString) {
28 |             System.out.println(string);
29 |         }
30 |         System.out.println(">>>> Using Pattern and Matcher --------");
31 |         Pattern sentencePattern = Pattern.compile(
32 |             "# Match a sentence ending in punctuation or EOS.\n"
33 |             + "[^.!?\\s]    # First char is non-punct, non-ws\n"
34 |             + "[^.!?]*      # Greedily consume up to punctuation.\n"
35 |             + "(?:          # Group for unrolling the loop.\n"
36 |             + "  [.!?]      # (special) inner punctuation ok if\n"
37 |             + "  (?!['\"]?\\s|$)  # not followed by ws or EOS.\n"
38 |             + "  [^.!?]*    # Greedily consume up to punctuation.\n"
39 |             + ")*           # Zero or more (special normal*)\n"
40 |             + "[.!?]?       # Optional ending punctuation.\n"
41 |             + "['\"]?       # Optional closing quote.\n"
42 |             + "(?=\\s|$)",
43 |             Pattern.MULTILINE | Pattern.COMMENTS);
44 |         Matcher matcher = sentencePattern.matcher(paragraph);
45 |         while (matcher.find()) {
46 |             System.out.println(matcher.group());
47 |         }
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/Chapter03/XMLProcessingDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter3;
 7 | 
 8 | import edu.stanford.nlp.process.DocumentPreprocessor;
 9 | import edu.stanford.nlp.process.DocumentProcessor;
10 | import java.io.File;
11 | import java.io.FileNotFoundException;
12 | import java.io.FileReader;
13 | import java.io.Reader;
14 | import java.util.List;
15 | import java.util.ListIterator;
16 | import java.util.logging.Level;
17 | import java.util.logging.Logger;
18 | 
19 | /**
20 |  *
21 |  * @author ashish
22 |  */
23 | public class XMLProcessingDemo {
24 |     private static String getResourcePath(){
25 |         File currDir = new File(".");
26 |         String path = currDir .getAbsolutePath();
27 |         path = path.substring(0, path.length()-2);
28 |         System.out.println(path);
29 |             String resourcePath = path + File.separator  + "src/chapter3/XMLTest.xml";
30 |         return resourcePath;
31 |     }
32 |     
33 |     public static void main(String args[]){
34 |         try {
35 |             Reader reader = new FileReader(getResourcePath());
36 |             DocumentPreprocessor dp = new DocumentPreprocessor(reader, DocumentPreprocessor.DocType.XML);
37 |             dp.setElementDelimiter("sentence");
38 |             for(List sentence : dp){
39 |                 ListIterator list = sentence.listIterator();
40 |                 while (list.hasNext()) { 
41 |                     System.out.print(list.next() + " "); 
42 |                 } 
43 |                 System.out.println(); 
44 |                 
45 |             }
46 |         } catch (FileNotFoundException ex) {
47 |             Logger.getLogger(XMLProcessingDemo.class.getName()).log(Level.SEVERE, null, ex);
48 |         }
49 |         
50 |     }
51 | }
52 | 


--------------------------------------------------------------------------------
/Chapter03/XMLTest.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <?xml-stylesheet type="text/xsl"?>
 3 | <document>
 4 |  <sentences>
 5 |  <sentence id="1">
 6 |  <word>When</word>
 7 |  <word>the</word>
 8 |  <word>day</word>
 9 |  <word>is</word>
10 |  <word>done</word>
11 |  <word>we</word>
12 |  <word>can</word>
13 |  <word>sleep</word>
14 |  <word>.</word>
15 |  </sentence>
16 |  <sentence id="2">
17 |  <word>When</word>
18 |  <word>the</word>
19 |  <word>morning</word>
20 |  <word>comes</word>
21 |  <word>we</word>
22 |  <word>can</word>
23 |  <word>wake</word>
24 |  <word>.</word>
25 |  </sentence>
26 |  <sentence id="3">
27 |  <word>After</word>
28 |  <word>that</word>
29 |  <word>who</word>


--------------------------------------------------------------------------------
/Chapter04/NERDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter4;
 7 | 
 8 | 
 9 | import java.io.File;
10 | import java.io.FileInputStream;
11 | import java.io.InputStream;
12 | import opennlp.tools.namefind.NameFinderME;
13 | import opennlp.tools.namefind.TokenNameFinder;
14 | import opennlp.tools.namefind.TokenNameFinderModel;
15 | import opennlp.tools.tokenize.Tokenizer;
16 | import opennlp.tools.tokenize.TokenizerME;
17 | import opennlp.tools.tokenize.TokenizerModel;
18 | import opennlp.tools.util.Span;
19 | 
20 | /**
21 |  *
22 |  * @author ashish
23 |  */
24 | public class NERDemo {
25 |     private static String getResourcePath(){
26 |         File currDir = new File(".");
27 |         String path = currDir .getAbsolutePath();
28 |         path = path.substring(0, path.length()-2);
29 |         System.out.println(path);
30 |             String resourcePath = path + File.separator  + "src/chapter4/";
31 |         return resourcePath;
32 |     }
33 |     public static void main(String args[]){
34 |         String sentences[] = {"Joe was the last person to see Fred. ", 
35 |             "He saw him in Boston at McKenzie's pub at 3:00 where he " 
36 |             + " paid $2.45 for an ale. ", 
37 |             "Joe wanted to go to Vermont for the day to visit a cousin who " 
38 |             + "works at IBM, but Sally and he had to look for Fred"}; 
39 |         String sentence = "He was the last person to see Fred."; 
40 |         try
41 |         {
42 |             InputStream tokenStream = new FileInputStream(new File(getResourcePath()+ "en-token.bin"));
43 |             InputStream modelStream = new FileInputStream(new File(getResourcePath() + "en-ner-person.bin"));
44 |             TokenizerModel tokenModel = new TokenizerModel(tokenStream);
45 |             Tokenizer tokenizer = new TokenizerME(tokenModel);
46 |             TokenNameFinderModel entityModel = new TokenNameFinderModel(modelStream);
47 |             NameFinderME nameFinder = new NameFinderME(entityModel);
48 |             String tokens1[] = tokenizer.tokenize(sentence);
49 |             Span nameSpans1[] = nameFinder.find(tokens1);
50 |             for (int i = 0; i < nameSpans1.length; i++) { 
51 |                 System.out.println("Span: " + nameSpans1[i].toString()); 
52 |                 System.out.println("Entity: " 
53 |                     + tokens1[nameSpans1[i].getStart()]); 
54 |             } 
55 |             
56 |             System.out.println("---------- Multiple Sentences -----------");
57 |             for (String sentence1 : sentences) { 
58 |                 String tokens[] = tokenizer.tokenize(sentence1); 
59 |                 Span nameSpans[] = nameFinder.find(tokens); 
60 |                 for (int i = 0; i < nameSpans.length; i++) { 
61 |                     System.out.println("Span: " + nameSpans[i].toString()); 
62 |                     System.out.println("Entity: "  
63 |                         + tokens[nameSpans[i].getStart()]); 
64 |                 } 
65 |                 System.out.println(); 
66 |             } 
67 |             
68 |         }
69 |         catch(Exception e){
70 |             System.out.println(e);
71 |         }
72 |     }
73 |     
74 | }
75 | 


--------------------------------------------------------------------------------
/Chapter04/build.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!-- You may freely edit this file. See commented blocks below for -->
 3 | <!-- some examples of how to customize the build. -->
 4 | <!-- (If you delete it and reopen the project it will be recreated.) -->
 5 | <!-- By default, only the Clean and Build commands use this build script. -->
 6 | <!-- Commands such as Run, Debug, and Test only use this build script if -->
 7 | <!-- the Compile on Save feature is turned off for the project. -->
 8 | <!-- You can turn off the Compile on Save (or Deploy on Save) setting -->
 9 | <!-- in the project's Project Properties dialog box.-->
10 | <project name="Chapter_4" default="default" basedir=".">
11 |     <description>Builds, tests, and runs the project Chapter 4.</description>
12 |     <import file="nbproject/build-impl.xml"/>
13 |     <!--
14 | 
15 |     There exist several targets which are by default empty and which can be 
16 |     used for execution of your tasks. These targets are usually executed 
17 |     before and after some main targets. They are: 
18 | 
19 |       -pre-init:                 called before initialization of project properties
20 |       -post-init:                called after initialization of project properties
21 |       -pre-compile:              called before javac compilation
22 |       -post-compile:             called after javac compilation
23 |       -pre-compile-single:       called before javac compilation of single file
24 |       -post-compile-single:      called after javac compilation of single file
25 |       -pre-compile-test:         called before javac compilation of JUnit tests
26 |       -post-compile-test:        called after javac compilation of JUnit tests
27 |       -pre-compile-test-single:  called before javac compilation of single JUnit test
28 |       -post-compile-test-single: called after javac compilation of single JUunit test
29 |       -pre-jar:                  called before JAR building
30 |       -post-jar:                 called after JAR building
31 |       -post-clean:               called after cleaning build products
32 | 
33 |     (Targets beginning with '-' are not intended to be called on their own.)
34 | 
35 |     Example of inserting an obfuscator after compilation could look like this:
36 | 
37 |         <target name="-post-compile">
38 |             <obfuscate>
39 |                 <fileset dir="${build.classes.dir}"/>
40 |             </obfuscate>
41 |         </target>
42 | 
43 |     For list of available properties check the imported 
44 |     nbproject/build-impl.xml file. 
45 | 
46 | 
47 |     Another way to customize the build is by overriding existing main targets.
48 |     The targets of interest are: 
49 | 
50 |       -init-macrodef-javac:     defines macro for javac compilation
51 |       -init-macrodef-junit:     defines macro for junit execution
52 |       -init-macrodef-debug:     defines macro for class debugging
53 |       -init-macrodef-java:      defines macro for class execution
54 |       -do-jar:                  JAR building
55 |       run:                      execution of project 
56 |       -javadoc-build:           Javadoc generation
57 |       test-report:              JUnit report generation
58 | 
59 |     An example of overriding the target for project execution could look like this:
60 | 
61 |         <target name="run" depends="Chapter_4-impl.jar">
62 |             <exec dir="bin" executable="launcher.exe">
63 |                 <arg file="${dist.jar}"/>
64 |             </exec>
65 |         </target>
66 | 
67 |     Notice that the overridden target depends on the jar target and not only on 
68 |     the compile target as the regular run target does. Again, for a list of available 
69 |     properties which you can use, check the target you are overriding in the
70 |     nbproject/build-impl.xml file. 
71 | 
72 |     -->
73 | </project>
74 | 


--------------------------------------------------------------------------------
/Chapter04/build/classes/.netbeans_automatic_build:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/.netbeans_automatic_build


--------------------------------------------------------------------------------
/Chapter04/build/classes/.netbeans_update_resources:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/.netbeans_update_resources


--------------------------------------------------------------------------------
/Chapter04/build/classes/packt/Chapter4.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/packt/Chapter4.class


--------------------------------------------------------------------------------
/Chapter04/build/classes/packt/DictionaryChunker.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/packt/DictionaryChunker.class


--------------------------------------------------------------------------------
/Chapter04/build/classes/packt/EmailRegexChunker.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/packt/EmailRegexChunker.class


--------------------------------------------------------------------------------
/Chapter04/build/classes/packt/RunChunker.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/packt/RunChunker.class


--------------------------------------------------------------------------------
/Chapter04/build/classes/packt/TimeRegexChunker.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/packt/TimeRegexChunker.class


--------------------------------------------------------------------------------
/Chapter04/build/classes/packt/TrainEntities.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/build/classes/packt/TrainEntities.class


--------------------------------------------------------------------------------
/Chapter04/en-ner-all.train:
--------------------------------------------------------------------------------
1 | <START:person>Joe<END> was the last person to see <START:person>Fred<END>. He saw him in <START:location>Boston<END> at McKenzie's pub at 3:00 where he paid $2.45 for an ale. <START:person>Joe<END> wanted to go to <START:location>Vermont<END> for the day to visit a cousin who works at <START:organization>IBM<END>, but <START:person>Sally<END> and he had to look for <START:person>Fred<END>.
2 | 


--------------------------------------------------------------------------------
/Chapter04/en-ner-person.eval:
--------------------------------------------------------------------------------
1 | <START:person> Bill <END> went to the farm to see <START:person> Sally <END>. 
2 | Unable to find <START:person> Sally <END> he went to town.
3 | There he saw <START:person> Fred <END> who had seen <START:person> Sally <END> at the book store with <START:person> Mary <END>.
4 | 


--------------------------------------------------------------------------------
/Chapter04/en-ner-person.train:
--------------------------------------------------------------------------------
1 | <START:person> Joe <END> was the last person to see <START:person> Fred <END>. 
2 | He saw him in Boston at McKenzie's pub at 3:00 where he paid $2.45 for an ale. 
3 | <START:person> Joe <END> wanted to go to Vermont for the day to visit a cousin who works at IBM, but <START:person> Sally <END> and he had to look for <START:person> Fred <END>.
4 | 


--------------------------------------------------------------------------------
/Chapter04/manifest.mf:
--------------------------------------------------------------------------------
1 | Manifest-Version: 1.0
2 | X-COMMENT: Main-Class will be added automatically by build
3 | 
4 | 


--------------------------------------------------------------------------------
/Chapter04/modelFile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/modelFile


--------------------------------------------------------------------------------
/Chapter04/nbproject/genfiles.properties:
--------------------------------------------------------------------------------
1 | build.xml.data.CRC32=be185f10
2 | build.xml.script.CRC32=2181029d
3 | build.xml.stylesheet.CRC32=8064a381@1.75.1.48
4 | # This file is used by a NetBeans-based IDE to track changes in generated files such as build-impl.xml.
5 | # Do not edit this file. You may delete it but then the IDE will never regenerate such files for you.
6 | nbproject/build-impl.xml.data.CRC32=be185f10
7 | nbproject/build-impl.xml.script.CRC32=b30cc88e
8 | nbproject/build-impl.xml.stylesheet.CRC32=876e7a8f@1.75.1.48
9 | 


--------------------------------------------------------------------------------
/Chapter04/nbproject/private/config.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter04/nbproject/private/config.properties


--------------------------------------------------------------------------------
/Chapter04/nbproject/private/private.properties:
--------------------------------------------------------------------------------
1 | compile.on.save=true
2 | do.depend=false
3 | do.jar=true
4 | javac.debug=true
5 | javadoc.preview=true
6 | user.properties.file=C:\\Users\\Richard\\AppData\\Roaming\\NetBeans\\8.0.2\\build.properties
7 | 


--------------------------------------------------------------------------------
/Chapter04/nbproject/private/private.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project-private xmlns="http://www.netbeans.org/ns/project-private/1">
 3 |     <editor-bookmarks xmlns="http://www.netbeans.org/ns/editor-bookmarks/2" lastBookmarkId="0"/>
 4 |     <open-files xmlns="http://www.netbeans.org/ns/projectui-open-files/2">
 5 |         <group name="NLP Chapter 4">
 6 |             <file>file:/C:/Current%20Books/NLP%20and%20Java/Chapter%204/Chapter%204/src/packt/Chapter4.java</file>
 7 |         </group>
 8 |     </open-files>
 9 | </project-private>
10 | 


--------------------------------------------------------------------------------
/Chapter04/nbproject/project.properties:
--------------------------------------------------------------------------------
  1 | annotation.processing.enabled=true
  2 | annotation.processing.enabled.in.editor=false
  3 | annotation.processing.processors.list=
  4 | annotation.processing.run.all.processors=true
  5 | annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output
  6 | application.title=Chapter 4
  7 | application.vendor=Richard
  8 | build.classes.dir=${build.dir}/classes
  9 | build.classes.excludes=**/*.java,**/*.form
 10 | # This directory is removed when the project is cleaned:
 11 | build.dir=build
 12 | build.generated.dir=${build.dir}/generated
 13 | build.generated.sources.dir=${build.dir}/generated-sources
 14 | # Only compile against the classpath explicitly listed here:
 15 | build.sysclasspath=ignore
 16 | build.test.classes.dir=${build.dir}/test/classes
 17 | build.test.results.dir=${build.dir}/test/results
 18 | # Uncomment to specify the preferred debugger connection transport:
 19 | #debug.transport=dt_socket
 20 | debug.classpath=\
 21 |     ${run.classpath}
 22 | debug.test.classpath=\
 23 |     ${run.test.classpath}
 24 | # Files in build.classes.dir which should be excluded from distribution jar
 25 | dist.archive.excludes=
 26 | # This directory is removed when the project is cleaned:
 27 | dist.dir=dist
 28 | dist.jar=${dist.dir}/Chapter_4.jar
 29 | dist.javadoc.dir=${dist.dir}/javadoc
 30 | endorsed.classpath=
 31 | excludes=
 32 | file.reference.ejml-0.23.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\ejml-0.23.jar
 33 | file.reference.javax.json-api-1.0-sources.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\javax.json-api-1.0-sources.jar
 34 | file.reference.javax.json.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\javax.json.jar
 35 | file.reference.joda-time-2.1-sources.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\joda-time-2.1-sources.jar
 36 | file.reference.joda-time.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\joda-time.jar
 37 | file.reference.jollyday-0.4.7-sources.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\jollyday-0.4.7-sources.jar
 38 | file.reference.jollyday.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\jollyday.jar
 39 | file.reference.jwnl-1.3.3.jar=C:\\Downloads\\OpenNLP\\apache-opennlp-1.5.3\\lib\\jwnl-1.3.3.jar
 40 | file.reference.lingpipe-4.1.0.jar=C:\\Current Books\\NLP and Java\\Downloads\\lingpipe-4.1.0\\lingpipe-4.1.0.jar
 41 | file.reference.opennlp-maxent-3.0.3.jar=C:\\Downloads\\OpenNLP\\apache-opennlp-1.5.3\\lib\\opennlp-maxent-3.0.3.jar
 42 | file.reference.opennlp-maxent-3.0.3.jar-1=C:\\Current Books\\NLP and Java\\Downloads\\apache-opennlp-1.5.3\\lib\\opennlp-maxent-3.0.3.jar
 43 | file.reference.opennlp-tools-1.5.3.jar=C:\\Downloads\\OpenNLP\\apache-opennlp-1.5.3\\lib\\opennlp-tools-1.5.3.jar
 44 | file.reference.opennlp-tools-1.5.3.jar-1=C:\\Current Books\\NLP and Java\\Downloads\\apache-opennlp-1.5.3\\lib\\opennlp-tools-1.5.3.jar
 45 | file.reference.opennlp-uima-1.5.3.jar=C:\\Downloads\\OpenNLP\\apache-opennlp-1.5.3\\lib\\opennlp-uima-1.5.3.jar
 46 | file.reference.stanford-corenlp-3.4.1-javadoc.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\stanford-corenlp-3.4.1-javadoc.jar
 47 | file.reference.stanford-corenlp-3.4.1-models.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\stanford-corenlp-3.4.1-models.jar
 48 | file.reference.stanford-corenlp-3.4.1-sources.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\stanford-corenlp-3.4.1-sources.jar
 49 | file.reference.stanford-corenlp-3.4.1.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\stanford-corenlp-3.4.1.jar
 50 | file.reference.stanford-ner-3.5.0-javadoc.jar=C:\\Current Books\\NLP and Java\\Downloads\\stanford-ner-2014-10-26\\stanford-ner-3.5.0-javadoc.jar
 51 | file.reference.stanford-ner-3.5.0.jar=C:\\Current Books\\NLP and Java\\Downloads\\stanford-ner-2014-10-26\\stanford-ner-3.5.0.jar
 52 | file.reference.stanford-ner.jar=C:\\Current Books\\NLP and Java\\Downloads\\stanford-ner-2014-10-26\\stanford-ner.jar
 53 | file.reference.xom-1.2.10-src.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\xom-1.2.10-src.jar
 54 | file.reference.xom.jar=C:\\Current Books in Progress\\NLP and Java\\Downloads\\Stanford\\stanford-corenlp-full-2014-08-27\\xom.jar
 55 | includes=**
 56 | jar.compress=false
 57 | javac.classpath=\
 58 |     ${file.reference.jwnl-1.3.3.jar}:\
 59 |     ${file.reference.opennlp-maxent-3.0.3.jar}:\
 60 |     ${file.reference.opennlp-tools-1.5.3.jar}:\
 61 |     ${file.reference.opennlp-uima-1.5.3.jar}:\
 62 |     ${file.reference.ejml-0.23.jar}:\
 63 |     ${file.reference.javax.json-api-1.0-sources.jar}:\
 64 |     ${file.reference.javax.json.jar}:\
 65 |     ${file.reference.joda-time-2.1-sources.jar}:\
 66 |     ${file.reference.joda-time.jar}:\
 67 |     ${file.reference.jollyday-0.4.7-sources.jar}:\
 68 |     ${file.reference.jollyday.jar}:\
 69 |     ${file.reference.stanford-corenlp-3.4.1-javadoc.jar}:\
 70 |     ${file.reference.stanford-corenlp-3.4.1-models.jar}:\
 71 |     ${file.reference.stanford-corenlp-3.4.1-sources.jar}:\
 72 |     ${file.reference.stanford-corenlp-3.4.1.jar}:\
 73 |     ${file.reference.xom-1.2.10-src.jar}:\
 74 |     ${file.reference.xom.jar}:\
 75 |     ${file.reference.lingpipe-4.1.0.jar}:\
 76 |     ${file.reference.stanford-ner-3.5.0-javadoc.jar}:\
 77 |     ${file.reference.stanford-ner-3.5.0.jar}:\
 78 |     ${file.reference.stanford-ner.jar}:\
 79 |     ${file.reference.opennlp-maxent-3.0.3.jar-1}:\
 80 |     ${file.reference.opennlp-tools-1.5.3.jar-1}
 81 | # Space-separated list of extra javac options
 82 | javac.compilerargs=
 83 | javac.deprecation=false
 84 | javac.processorpath=\
 85 |     ${javac.classpath}
 86 | javac.source=1.8
 87 | javac.target=1.8
 88 | javac.test.classpath=\
 89 |     ${javac.classpath}:\
 90 |     ${build.classes.dir}
 91 | javac.test.processorpath=\
 92 |     ${javac.test.classpath}
 93 | javadoc.additionalparam=
 94 | javadoc.author=false
 95 | javadoc.encoding=${source.encoding}
 96 | javadoc.noindex=false
 97 | javadoc.nonavbar=false
 98 | javadoc.notree=false
 99 | javadoc.private=false
100 | javadoc.splitindex=true
101 | javadoc.use=true
102 | javadoc.version=false
103 | javadoc.windowtitle=
104 | main.class=packt.Chapter4
105 | manifest.file=manifest.mf
106 | meta.inf.dir=${src.dir}/META-INF
107 | mkdist.disabled=false
108 | platform.active=default_platform
109 | run.classpath=\
110 |     ${javac.classpath}:\
111 |     ${build.classes.dir}
112 | # Space-separated list of JVM arguments used when running the project.
113 | # You may also define separate properties like run-sys-prop.name=value instead of -Dname=value.
114 | # To set system properties for unit tests define test-sys-prop.name=value:
115 | run.jvmargs=
116 | run.test.classpath=\
117 |     ${javac.test.classpath}:\
118 |     ${build.test.classes.dir}
119 | source.encoding=UTF-8
120 | src.dir=src
121 | test.src.dir=test
122 | 


--------------------------------------------------------------------------------
/Chapter04/nbproject/project.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://www.netbeans.org/ns/project/1">
 3 |     <type>org.netbeans.modules.java.j2seproject</type>
 4 |     <configuration>
 5 |         <data xmlns="http://www.netbeans.org/ns/j2se-project/3">
 6 |             <name>Chapter 4</name>
 7 |             <source-roots>
 8 |                 <root id="src.dir"/>
 9 |             </source-roots>
10 |             <test-roots>
11 |                 <root id="test.src.dir"/>
12 |             </test-roots>
13 |         </data>
14 |     </configuration>
15 | </project>
16 | 


--------------------------------------------------------------------------------
/Chapter04/old/Chapter4.java:
--------------------------------------------------------------------------------
  1 | package packt;
  2 | 
  3 | import com.aliasi.chunk.Chunk;
  4 | import com.aliasi.chunk.Chunker;
  5 | import com.aliasi.chunk.Chunking;
  6 | import com.aliasi.dict.DictionaryEntry;
  7 | import com.aliasi.dict.ExactDictionaryChunker;
  8 | import com.aliasi.dict.MapDictionary;
  9 | import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
 10 | import com.aliasi.util.AbstractExternalizable;
 11 | import edu.stanford.nlp.ie.crf.CRFClassifier;
 12 | import edu.stanford.nlp.ling.CoreAnnotations;
 13 | import edu.stanford.nlp.ling.CoreLabel;
 14 | import java.io.BufferedOutputStream;
 15 | import java.io.File;
 16 | import java.io.FileInputStream;
 17 | import java.io.FileOutputStream;
 18 | import java.io.IOException;
 19 | import java.io.InputStream;
 20 | import java.io.OutputStream;
 21 | import java.util.ArrayList;
 22 | import java.util.List;
 23 | import java.util.Set;
 24 | import java.util.regex.Matcher;
 25 | import java.util.regex.Pattern;
 26 | import opennlp.tools.namefind.NameFinderME;
 27 | import opennlp.tools.namefind.NameSample;
 28 | import opennlp.tools.namefind.NameSampleDataStream;
 29 | import opennlp.tools.namefind.TokenNameFinderEvaluator;
 30 | import opennlp.tools.namefind.TokenNameFinderModel;
 31 | import opennlp.tools.tokenize.Tokenizer;
 32 | import opennlp.tools.tokenize.TokenizerME;
 33 | import opennlp.tools.tokenize.TokenizerModel;
 34 | import opennlp.tools.util.ObjectStream;
 35 | import opennlp.tools.util.PlainTextByLineStream;
 36 | import opennlp.tools.util.Span;
 37 | import opennlp.tools.util.eval.FMeasure;
 38 | 
 39 | public class Chapter4 {
 40 | 
 41 |     private static final String sentences[] = {"Joe was the last person to see Fred. ",
 42 |         "He saw him in Boston at McKenzie's pub at 3:00 where he paid "
 43 |         + "$2.45 for an ale. ",
 44 |         "Joe wanted to go to Vermont for the day to visit a cousin who "
 45 |         + "works at IBM, but Sally and he had to look for Fred"};
 46 | 
 47 |     private static String regularExpressionText
 48 |             = "He left his email address (rgb@colorworks.com) and his "
 49 |             + "phone number,800-555-1234. We believe his current address "
 50 |             + "is 100 Washington Place, Seattle, CO 12345-1234. I "
 51 |             + "understand you can also call at 123-555-1234 between "
 52 |             + "8:00 AM and 4:30 most days. His URL is http://example.com "
 53 |             + "and he was born on February 25, 1954 or 2/25/1954.";
 54 | 
 55 |     private static MapDictionary<String> dictionary;
 56 | 
 57 |     public static void main(String[] args) {
 58 |         usingRegularExpressions();
 59 | //        usingOpenNLP();
 60 | //        usingStanfordNER();
 61 | //        usingLingPipeNER();
 62 | //        trainingOpenNLPNERModel();
 63 |     }
 64 | 
 65 |     public static File getModelDir() {
 66 |         return new File("C:\\Current Books in Progress\\NLP and Java\\Models");
 67 |     }
 68 | 
 69 |     private static void usingRegularExpressions() {
 70 |         usingJavaRegularExpressions();
 71 | //        usingLingPipeRegExChunker();
 72 | //        usingLingPipeRegularExpressions();
 73 |     }
 74 | 
 75 |     private static void usingJavaRegularExpressions() {
 76 |         String phoneNumberRE = "\\d{3}-\\d{3}-\\d{4}";
 77 |         String urlRegex = "\\b(https?|ftp|file|ldap)://"
 78 |                 + "[-A-Za-z0-9+&@#/%?=~_|!:,.;]"
 79 |                 + "*[-A-Za-z0-9+&@#/%=~_|]";
 80 |         String zipCodeRegEx = "[0-9]{5}(\\-?[0-9]{4})?";
 81 |         String emailRegEx = "[a-zA-Z0-9'._%+-]+@"
 82 |                 + "(?:[a-zA-Z0-9-]+\\.)"
 83 |                 + "+[a-zA-Z]{2,4}";
 84 |         String timeRE = "([01]?[0-9]|2[0-3]):[0-5][0-9]";
 85 |         String dateRE = "(0?[1-9]|[12][0-9]|3[01])/(0?[1-9]|1[012])/((19|20)\\\\d\\\\d)";
 86 |         dateRE = "((0?[13578]|10|12)(-|\\/)(([1-9])|(0[1-9])|([12])([0-9]?)|(3[01]?))(-|\\/)((19)([2-9])(\\d{1})|(20)([01])(\\d{1})|([8901])(\\d{1}))|(0?[2469]|11)(-|\\/)(([1-9])|(0[1-9])|([12])([0-9]?)|(3[0]?))(-|\\/)((19)([2-9])(\\d{1})|(20)([01])(\\d{1})|([8901])(\\d{1})))";
 87 |         Pattern pattern = Pattern.compile(phoneNumberRE + "|" + timeRE + "|" + emailRegEx);
 88 | //        regularExpressionText = "(888)555-1234 888-SEL-HIGH 888-555-1234-J88-W3S";
 89 |         Matcher matcher = pattern.matcher(regularExpressionText);
 90 |         System.out.println("---Searching ...");
 91 |         while (matcher.find()) {
 92 |             System.out.println(matcher.group() + " [" + matcher.start()
 93 |                     + ":" + matcher.end() + "]");
 94 |         }
 95 |         System.out.println("---Done Searching ...");
 96 | 
 97 |     }
 98 | 
 99 |     private static void usingLingPipeRegExChunker() {
100 |         String timeRE = "(([0-1]?[0-9])|([2][0-3])):([0-5]?[0-9])(:([0-5]?[0-9]))?";
101 |         Chunker chunker = new TimeRegexChunker();
102 | //        chunker = new RegExChunker(timeRE,"time",1.0);
103 |         Chunking chunking = chunker.chunk(regularExpressionText);
104 |         Set<Chunk> chunkSet = chunking.chunkSet();
105 |         displayChunkSet(chunker, regularExpressionText);
106 |     }
107 | 
108 |     private static void usingLingPipeRegularExpressions() {
109 |         try {
110 |             File modelFile = new File(getModelDir(),
111 |                     "ne-en-news-muc6.AbstractCharLmRescoringChunker");
112 |             Chunker chunker = (Chunker) AbstractExternalizable.readObject(modelFile);
113 |             for (int i = 0; i < sentences.length; ++i) {
114 |                 Chunking chunking = chunker.chunk(sentences[i]);
115 |                 System.out.println("Chunking=" + chunking);
116 |             }
117 |             for (String sentence : sentences) {
118 |                 displayChunkSet(chunker, sentence);
119 |             }
120 | 
121 |         } catch (IOException | ClassNotFoundException ex) {
122 |             // Handle exception
123 |         }
124 | 
125 |     }
126 | 
127 |     // ------ OpenNLP-----------------------------------
128 |     private static void usingOpenNLP() {
129 |         System.out.println("OpenNLP Examples");
130 |         usingOpenNLPNameFinderME();
131 | //        usingMultipleNERModels();
132 |     }
133 | 
134 |     private static void usingOpenNLPNameFinderME() {
135 |         System.out.println("OpenNLP NameFinderME Examples");
136 |         try (InputStream tokenStream = new FileInputStream(
137 |                 new File(getModelDir(), "en-token.bin"));
138 |                 InputStream modelStream = new FileInputStream(
139 |                         new File(getModelDir(), "en-ner-person.bin"));) {
140 | 
141 |             TokenizerModel tokenModel = new TokenizerModel(tokenStream);
142 |             Tokenizer tokenizer = new TokenizerME(tokenModel);
143 | 
144 |             TokenNameFinderModel entityModel
145 |                     = new TokenNameFinderModel(modelStream);
146 |             NameFinderME nameFinder = new NameFinderME(entityModel);
147 | 
148 |             // Single sentence
149 |             {
150 |                 System.out.println("Single sentence");
151 |                 StringBuilder builder = new StringBuilder();
152 |                 String sentence = "He was the last person to see Fred.";
153 | 
154 |                 String tokens[] = tokenizer.tokenize(sentence);
155 |                 Span nameSpans[] = nameFinder.find(tokens);
156 | 
157 |                 for (int i = 0; i < nameSpans.length; i++) {
158 |                     System.out.println("Span: " + nameSpans[i].toString());
159 |                     System.out.println("Entity: "
160 |                             + tokens[nameSpans[i].getStart()]);
161 |                 }
162 |             }
163 |             System.out.println();
164 |             for (String sentence : sentences) {
165 |                 String tokens[] = tokenizer.tokenize(sentence);
166 |                 Span nameSpans[] = nameFinder.find(tokens);
167 |                 double[] spanProbs = nameFinder.probs(nameSpans);
168 | 
169 |                 for (int i = 0; i < nameSpans.length; i++) {
170 |                     System.out.println("Span: " + nameSpans[i].toString());
171 |                     System.out.println("Entity: "
172 |                             + tokens[nameSpans[i].getStart()]);
173 |                     System.out.println("Probability: " + spanProbs[i]);
174 |                 }
175 |                 System.out.println();
176 |             }
177 |         } catch (Exception ex) {
178 |             ex.printStackTrace();
179 |         }
180 |     }
181 | 
182 |     private static void usingMultipleNERModels() {
183 |         // Models - en-ner-person.bin en-ner-location.bin en-ner-money.bin 
184 |         // en-ner-organization.bin en-ner-time.bin
185 |         try {
186 |             InputStream tokenStream = new FileInputStream(
187 |                     new File(getModelDir(), "en-token.bin"));
188 | 
189 |             TokenizerModel tokenModel = new TokenizerModel(tokenStream);
190 |             Tokenizer tokenizer = new TokenizerME(tokenModel);
191 | 
192 |             String modelNames[] = {"en-ner-person.bin", "en-ner-location.bin",
193 |                 "en-ner-organization.bin"};
194 |             ArrayList<String> list = new ArrayList();
195 |             for (String name : modelNames) {
196 |                 TokenNameFinderModel entityModel = new TokenNameFinderModel(
197 |                         new FileInputStream(
198 |                                 new File(getModelDir(), name)));
199 |                 NameFinderME nameFinder = new NameFinderME(entityModel);
200 |                 for (int index = 0; index < sentences.length; index++) {
201 |                     String tokens[] = tokenizer.tokenize(sentences[index]);
202 |                     Span nameSpans[] = nameFinder.find(tokens);
203 |                     for (Span span : nameSpans) {
204 |                         list.add("Sentence: " + index
205 |                                 + " Span: " + span.toString() + " Entity: "
206 |                                 + tokens[span.getStart()]);
207 |                     }
208 |                 }
209 |             }
210 |             System.out.println("Multiple Entities");
211 |             for (String element : list) {
212 |                 System.out.println(element);
213 |             }
214 |         } catch (Exception ex) {
215 |             ex.printStackTrace();
216 |         }
217 |     }
218 | 
219 |     private static void usingStanfordNER() {
220 |         String model = getModelDir() + "\\english.conll.4class.distsim.crf.ser.gz";
221 |         CRFClassifier<CoreLabel> classifier = CRFClassifier.getClassifierNoExceptions(model);
222 | 
223 |         String sentence = "";
224 |         for (String element : sentences) {
225 |             sentence += element;
226 |         }
227 | 
228 |         List<List<CoreLabel>> entityList = classifier.classify(sentence);
229 | 
230 |         for (List<CoreLabel> internalList : entityList) {
231 |             for (CoreLabel coreLabel : internalList) {
232 |                 String word = coreLabel.word();
233 |                 String category = coreLabel.get(CoreAnnotations.AnswerAnnotation.class);
234 | //                System.out.println(word + ":" + category);
235 |                 if (!"O".equals(category)) {
236 |                     System.out.println(word + ":" + category);
237 |                 }
238 | 
239 |             }
240 | 
241 |         }
242 |     }
243 | 
244 |     private static void usingLingPipeNER() {
245 | //        usingLingPipeRexExChunker();
246 |         usingExactDictionaryChunker();
247 | 
248 |     }
249 | 
250 |     private static void usingLingPipeRexExChunker() {
251 |         try {
252 |             File modelFile = new File(getModelDir(),
253 |                     "ne-en-news-muc6.AbstractCharLmRescoringChunker");
254 |             Chunker chunker
255 |                     = (Chunker) AbstractExternalizable.readObject(modelFile);
256 | 
257 |             for (String sentence : sentences) {
258 |                 displayChunkSet(chunker, sentence);
259 |             }
260 |         } catch (IOException | ClassNotFoundException ex) {
261 |             ex.printStackTrace();
262 |         }
263 |     }
264 | 
265 |     private static void displayChunkSet(Chunker chunker, String text) {
266 |         Chunking chunking = chunker.chunk(text);
267 |         Set<Chunk> set = chunking.chunkSet();
268 |         for (Chunk chunk : set) {
269 |             System.out.println("Type: " + chunk.type() + " Entity: ["
270 |                     + text.substring(chunk.start(), chunk.end())
271 |                     + "] Score: " + chunk.score());
272 |         }
273 |     }
274 | 
275 |     private static void initializeDictionary() {
276 |         dictionary = new MapDictionary<String>();
277 |         dictionary.addEntry(
278 |                 new DictionaryEntry<String>("Joe", "PERSON", 1.0));
279 |         dictionary.addEntry(
280 |                 new DictionaryEntry<String>("Fred", "PERSON", 1.0));
281 |         dictionary.addEntry(
282 |                 new DictionaryEntry<String>("Boston", "PLACE", 1.0));
283 |         dictionary.addEntry(
284 |                 new DictionaryEntry<String>("pub", "PLACE", 1.0));
285 |         dictionary.addEntry(
286 |                 new DictionaryEntry<String>("Vermont", "PLACE", 1.0));
287 |         dictionary.addEntry(
288 |                 new DictionaryEntry<String>("IBM", "ORGANIZATION", 1.0));
289 |         dictionary.addEntry(
290 |                 new DictionaryEntry<String>("Sally", "PERSON", 1.0));
291 |     }
292 | 
293 |     private static void usingExactDictionaryChunker() {
294 |         initializeDictionary();
295 |         System.out.println("\nDICTIONARY\n" + dictionary);
296 | 
297 |         ExactDictionaryChunker dictionaryChunker
298 |                 = new ExactDictionaryChunker(dictionary,
299 |                         IndoEuropeanTokenizerFactory.INSTANCE, true, false);
300 | 
301 |         for (String sentence : sentences) {
302 |             System.out.println("\nTEXT=" + sentence);
303 |             displayChunkSet(dictionaryChunker, sentence);
304 |         }
305 |     }
306 | 
307 |     // Training Models
308 |     private static void trainingOpenNLPNERModel() {
309 |         try (OutputStream modelOutputStream = new BufferedOutputStream(
310 |                 new FileOutputStream(new File("modelFile")));) {
311 |             ObjectStream<String> lineStream = new PlainTextByLineStream(
312 |                     new FileInputStream("en-ner-person.train"), "UTF-8");
313 |             ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
314 | 
315 |             TokenNameFinderModel model = NameFinderME.train("en", "person", sampleStream,
316 |                     null, 100, 5);
317 | 
318 |             model.serialize(modelOutputStream);
319 |         } catch (IOException ex) {
320 |             ex.printStackTrace();
321 |         }
322 |     }
323 | }
324 | 


--------------------------------------------------------------------------------
/Chapter04/old/TimeRegexChunker.java:
--------------------------------------------------------------------------------
 1 | package packt;
 2 | 
 3 | import com.aliasi.chunk.RegExChunker;
 4 | 
 5 | public class TimeRegexChunker extends RegExChunker {
 6 |     private final static String TIME_RE = 
 7 |         "(([0-1]?[0-9])|([2][0-3])):([0-5]?[0-9])(:([0-5]?[0-9]))?";
 8 |     private final static String CHUNK_TYPE = "time";
 9 |     private final static double CHUNK_SCORE = 1.0;
10 |     
11 |     public TimeRegexChunker() {
12 |         super(TIME_RE,CHUNK_TYPE,CHUNK_SCORE);
13 |     }
14 | }
15 | 


--------------------------------------------------------------------------------
/Chapter04/src/packt/Chapter4.java:
--------------------------------------------------------------------------------
  1 | package packt;
  2 | 
  3 | import com.aliasi.chunk.Chunk;
  4 | import com.aliasi.chunk.Chunker;
  5 | import com.aliasi.chunk.Chunking;
  6 | import com.aliasi.dict.DictionaryEntry;
  7 | import com.aliasi.dict.ExactDictionaryChunker;
  8 | import com.aliasi.dict.MapDictionary;
  9 | import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
 10 | import com.aliasi.util.AbstractExternalizable;
 11 | import edu.stanford.nlp.ie.crf.CRFClassifier;
 12 | import edu.stanford.nlp.ling.CoreAnnotations;
 13 | import edu.stanford.nlp.ling.CoreLabel;
 14 | import java.io.BufferedOutputStream;
 15 | import java.io.File;
 16 | import java.io.FileInputStream;
 17 | import java.io.FileOutputStream;
 18 | import java.io.IOException;
 19 | import java.io.InputStream;
 20 | import java.io.OutputStream;
 21 | import java.util.ArrayList;
 22 | import java.util.List;
 23 | import java.util.Set;
 24 | import java.util.regex.Matcher;
 25 | import java.util.regex.Pattern;
 26 | import opennlp.tools.namefind.NameFinderME;
 27 | import opennlp.tools.namefind.NameSample;
 28 | import opennlp.tools.namefind.NameSampleDataStream;
 29 | import opennlp.tools.namefind.TokenNameFinderEvaluator;
 30 | import opennlp.tools.namefind.TokenNameFinderModel;
 31 | import opennlp.tools.sentdetect.SentenceDetectorEvaluator;
 32 | import opennlp.tools.tokenize.Tokenizer;
 33 | import opennlp.tools.tokenize.TokenizerME;
 34 | import opennlp.tools.tokenize.TokenizerModel;
 35 | import opennlp.tools.util.ObjectStream;
 36 | import opennlp.tools.util.PlainTextByLineStream;
 37 | import opennlp.tools.util.Span;
 38 | import opennlp.tools.util.eval.FMeasure;
 39 | 
 40 | public class Chapter4 {
 41 | 
 42 |     private static final String sentences[] = {"Joe was the last person to see Fred. ",
 43 |         "He saw him in Boston at McKenzie's pub at 3:00 where he paid "
 44 |         + "$2.45 for an ale. ",
 45 |         "Joe wanted to go to Vermont for the day to visit a cousin who "
 46 |         + "works at IBM, but Sally and he had to look for Fred"};
 47 | 
 48 |     private static String regularExpressionText
 49 |             = "He left his email address (rgb@colorworks.com) and his "
 50 |             + "phone number,800-555-1234. We believe his current address "
 51 |             + "is 100 Washington Place, Seattle, CO 12345-1234. I "
 52 |             + "understand you can also call at 123-555-1234 between "
 53 |             + "8:00 AM and 4:30 most days. His URL is http://example.com "
 54 |             + "and he was born on February 25, 1954 or 2/25/1954.";
 55 | 
 56 |     private static MapDictionary<String> dictionary;
 57 | 
 58 |     public static void main(String[] args) {
 59 |         usingRegularExpressions();
 60 | //        usingOpenNLP();
 61 | //        usingStanfordNER();
 62 | //        usingLingPipeNER();
 63 | //        trainingOpenNLPNERModel();
 64 |     }
 65 | 
 66 |     public static File getModelDir() {
 67 |         return new File("C:/Current Books/NLP and Java/Models");
 68 |     }
 69 | 
 70 |     private static void usingRegularExpressions() {
 71 |         usingJavaRegularExpressions();
 72 | //        usingLingPipeRegExChunker();
 73 | //        usingLingPipeRegularExpressions();
 74 |     }
 75 | 
 76 |     private static void usingJavaRegularExpressions() {
 77 |         String phoneNumberRE = "\\d{3}-\\d{3}-\\d{4}";
 78 |         String urlRegex = "\\b(https?|ftp|file|ldap)://"
 79 |                 + "[-A-Za-z0-9+&@#/%?=~_|!:,.;]"
 80 |                 + "*[-A-Za-z0-9+&@#/%=~_|]";
 81 |         String zipCodeRegEx = "[0-9]{5}(\\-?[0-9]{4})?";
 82 |         String emailRegEx = "[a-zA-Z0-9'._%+-]+@"
 83 |                 + "(?:[a-zA-Z0-9-]+\\.)"
 84 |                 + "+[a-zA-Z]{2,4}";
 85 |         String timeRE = "([01]?[0-9]|2[0-3]):[0-5][0-9]";
 86 |         String dateRE = "(0?[1-9]|[12][0-9]|3[01])/(0?[1-9]|1[012])/((19|20)\\\\d\\\\d)";
 87 |         dateRE = "((0?[13578]|10|12)(-|\\/)(([1-9])|(0[1-9])|([12])([0-9]?)|(3[01]?))(-|\\/)((19)([2-9])(\\d{1})|(20)([01])(\\d{1})|([8901])(\\d{1}))|(0?[2469]|11)(-|\\/)(([1-9])|(0[1-9])|([12])([0-9]?)|(3[0]?))(-|\\/)((19)([2-9])(\\d{1})|(20)([01])(\\d{1})|([8901])(\\d{1})))";
 88 |         Pattern pattern = Pattern.compile(phoneNumberRE + "|" + timeRE + "|" + emailRegEx);
 89 |         regularExpressionText = "(888)555-1111 888-SEL-HIGH 888-555-2222-J88-W3S";
 90 |         Matcher matcher = pattern.matcher(regularExpressionText);
 91 |         System.out.println("---Searching ...");
 92 |         while (matcher.find()) {
 93 |             System.out.println(matcher.group() + " [" + matcher.start()
 94 |                     + ":" + matcher.end() + "]");
 95 |         }
 96 |         System.out.println("---Done Searching ...");
 97 | 
 98 |     }
 99 | 
100 |     private static void usingLingPipeRegExChunker() {
101 |         String timeRE = "(([0-1]?[0-9])|([2][0-3])):([0-5]?[0-9])(:([0-5]?[0-9]))?";
102 |         Chunker chunker = new TimeRegexChunker();
103 | //        chunker = new RegExChunker(timeRE,"time",1.0);
104 |         Chunking chunking = chunker.chunk(regularExpressionText);
105 |         Set<Chunk> chunkSet = chunking.chunkSet();
106 |         displayChunkSet(chunker, regularExpressionText);
107 |     }
108 | 
109 |     private static void usingLingPipeRegularExpressions() {
110 |         try {
111 |             File modelFile = new File(getModelDir(),
112 |                     "ne-en-news-muc6.AbstractCharLmRescoringChunker");
113 |             Chunker chunker = (Chunker) AbstractExternalizable.readObject(modelFile);
114 |             for (int i = 0; i < sentences.length; ++i) {
115 |                 Chunking chunking = chunker.chunk(sentences[i]);
116 |                 System.out.println("Chunking=" + chunking);
117 |             }
118 |             for (String sentence : sentences) {
119 |                 displayChunkSet(chunker, sentence);
120 |             }
121 | 
122 |         } catch (IOException | ClassNotFoundException ex) {
123 |             // Handle exception
124 |         }
125 | 
126 |     }
127 | 
128 |     // ------ OpenNLP-----------------------------------
129 |     private static void usingOpenNLP() {
130 |         System.out.println("OpenNLP Examples");
131 |         usingOpenNLPNameFinderME();
132 | //        usingMultipleNERModels();
133 |     }
134 | 
135 |     private static void usingOpenNLPNameFinderME() {
136 |         System.out.println("OpenNLP NameFinderME Examples");
137 |         try (InputStream tokenStream = new FileInputStream(
138 |                 new File(getModelDir(), "en-token.bin"));
139 |                 InputStream modelStream = new FileInputStream(
140 |                         new File(getModelDir(), "en-ner-time.bin"));) {
141 | 
142 |             TokenizerModel tokenModel = new TokenizerModel(tokenStream);
143 |             Tokenizer tokenizer = new TokenizerME(tokenModel);
144 | 
145 |             TokenNameFinderModel entityModel
146 |                     = new TokenNameFinderModel(modelStream);
147 |             NameFinderME nameFinder = new NameFinderME(entityModel);
148 | 
149 |             // Single sentence
150 |             {
151 |                 System.out.println("Single sentence");
152 |                 StringBuilder builder = new StringBuilder();
153 |                 String sentence = "He was the last person to see Fred.";
154 | 
155 |                 String tokens[] = tokenizer.tokenize(sentence);
156 |                 Span nameSpans[] = nameFinder.find(tokens);
157 | 
158 |                 for (int i = 0; i < nameSpans.length; i++) {
159 |                     System.out.println("Span: " + nameSpans[i].toString());
160 |                     System.out.println("Entity: "
161 |                             + tokens[nameSpans[i].getStart()]);
162 |                 }
163 |             }
164 |             System.out.println();
165 |             System.out.println("Sentences");
166 |             for (String sentence : sentences) {
167 |                 String tokens[] = tokenizer.tokenize(sentence);
168 |                 Span nameSpans[] = nameFinder.find(tokens);
169 |                 double[] spanProbs = nameFinder.probs(nameSpans);
170 | 
171 |                 for (int i = 0; i < nameSpans.length; i++) {
172 |                     System.out.println("Span: " + nameSpans[i].toString());
173 |                     System.out.println("Entity: "
174 |                             + tokens[nameSpans[i].getStart()]);
175 |                     System.out.println("Probability: " + spanProbs[i]);
176 |                 }
177 |                 System.out.println();
178 |             }
179 |         } catch (Exception ex) {
180 |             ex.printStackTrace();
181 |         }
182 |     }
183 | 
184 |     private static void usingMultipleNERModels() {
185 |         // Models - en-ner-person.bin en-ner-location.bin en-ner-money.bin 
186 |         // en-ner-organization.bin en-ner-time.bin
187 |         try {
188 |             InputStream tokenStream = new FileInputStream(
189 |                     new File(getModelDir(), "en-token.bin"));
190 | 
191 |             TokenizerModel tokenModel = new TokenizerModel(tokenStream);
192 |             Tokenizer tokenizer = new TokenizerME(tokenModel);
193 | 
194 |             String modelNames[] = {"en-ner-person.bin", "en-ner-location.bin",
195 |                 "en-ner-organization.bin"};
196 |             ArrayList<String> list = new ArrayList();
197 |             for (String name : modelNames) {
198 |                 TokenNameFinderModel entityModel = new TokenNameFinderModel(
199 |                         new FileInputStream(
200 |                                 new File(getModelDir(), name)));
201 |                 NameFinderME nameFinder = new NameFinderME(entityModel);
202 |                 for (int index = 0; index < sentences.length; index++) {
203 |                     String tokens[] = tokenizer.tokenize(sentences[index]);
204 |                     Span nameSpans[] = nameFinder.find(tokens);
205 |                     for (Span span : nameSpans) {
206 |                         list.add("Sentence: " + index
207 |                                 + " Span: " + span.toString() + " Entity: "
208 |                                 + tokens[span.getStart()]);
209 |                     }
210 |                 }
211 |             }
212 |             System.out.println("Multiple Entities");
213 |             for (String element : list) {
214 |                 System.out.println(element);
215 |             }
216 |         } catch (Exception ex) {
217 |             ex.printStackTrace();
218 |         }
219 |     }
220 | 
221 |     private static void usingStanfordNER() {
222 |         String model = getModelDir() + "\\english.conll.4class.distsim.crf.ser.gz";
223 |         CRFClassifier<CoreLabel> classifier = CRFClassifier.getClassifierNoExceptions(model);
224 | 
225 |         String sentence = "";
226 |         for (String element : sentences) {
227 |             sentence += element;
228 |         }
229 | 
230 |         List<List<CoreLabel>> entityList = classifier.classify(sentence);
231 | 
232 |         for (List<CoreLabel> internalList : entityList) {
233 |             for (CoreLabel coreLabel : internalList) {
234 |                 String word = coreLabel.word();
235 |                 String category = coreLabel.get(CoreAnnotations.AnswerAnnotation.class);
236 | //                System.out.println(word + ":" + category);
237 |                 if (!"O".equals(category)) {
238 |                     System.out.println(word + ":" + category);
239 |                 }
240 | 
241 |             }
242 | 
243 |         }
244 |     }
245 | 
246 |     private static void usingLingPipeNER() {
247 | //        usingLingPipeRexExChunker();
248 |         usingExactDictionaryChunker();
249 | 
250 |     }
251 | 
252 |     private static void usingLingPipeRexExChunker() {
253 |         try {
254 |             File modelFile = new File(getModelDir(),
255 |                     "ne-en-news-muc6.AbstractCharLmRescoringChunker");
256 |             Chunker chunker
257 |                     = (Chunker) AbstractExternalizable.readObject(modelFile);
258 | 
259 |             for (String sentence : sentences) {
260 |                 displayChunkSet(chunker, sentence);
261 |             }
262 |         } catch (IOException | ClassNotFoundException ex) {
263 |             ex.printStackTrace();
264 |         }
265 |     }
266 | 
267 |     private static void displayChunkSet(Chunker chunker, String text) {
268 |         Chunking chunking = chunker.chunk(text);
269 |         Set<Chunk> set = chunking.chunkSet();
270 |         for (Chunk chunk : set) {
271 |             System.out.println("Type: " + chunk.type() + " Entity: ["
272 |                     + text.substring(chunk.start(), chunk.end())
273 |                     + "] Score: " + chunk.score());
274 |         }
275 |     }
276 | 
277 |     private static void initializeDictionary() {
278 |         dictionary = new MapDictionary<String>();
279 |         dictionary.addEntry(
280 |                 new DictionaryEntry<String>("Joe", "PERSON", 1.0));
281 |         dictionary.addEntry(
282 |                 new DictionaryEntry<String>("Fred", "PERSON", 1.0));
283 |         dictionary.addEntry(
284 |                 new DictionaryEntry<String>("Boston", "PLACE", 1.0));
285 |         dictionary.addEntry(
286 |                 new DictionaryEntry<String>("pub", "PLACE", 1.0));
287 |         dictionary.addEntry(
288 |                 new DictionaryEntry<String>("Vermont", "PLACE", 1.0));
289 |         dictionary.addEntry(
290 |                 new DictionaryEntry<String>("IBM", "ORGANIZATION", 1.0));
291 |         dictionary.addEntry(
292 |                 new DictionaryEntry<String>("Sally", "PERSON", 1.0));
293 |     }
294 | 
295 |     private static void usingExactDictionaryChunker() {
296 |         initializeDictionary();
297 |         System.out.println("\nDICTIONARY\n" + dictionary);
298 | 
299 |         ExactDictionaryChunker dictionaryChunker
300 |                 = new ExactDictionaryChunker(dictionary,
301 |                         IndoEuropeanTokenizerFactory.INSTANCE, true, false);
302 | 
303 |         for (String sentence : sentences) {
304 |             System.out.println("\nTEXT=" + sentence);
305 |             displayChunkSet(dictionaryChunker, sentence);
306 |         }
307 |     }
308 | 
309 |     // Training Models
310 |     private static void trainingOpenNLPNERModel() {
311 |         try (OutputStream modelOutputStream = new BufferedOutputStream(
312 |                 new FileOutputStream(new File("modelFile")));) {
313 |             ObjectStream<String> lineStream = new PlainTextByLineStream(
314 |                     new FileInputStream("en-ner-person.train"), "UTF-8");
315 |             ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
316 | 
317 |             TokenNameFinderModel model = NameFinderME.train("en", "person", sampleStream,
318 |                     null, 100, 5);
319 | 
320 |             model.serialize(modelOutputStream);
321 | 
322 |             System.out.println("TokenNameFinderEvaluator");
323 |             TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(new NameFinderME(model));
324 |             
325 |             lineStream = new PlainTextByLineStream(
326 |                     new FileInputStream("en-ner-person.eval"), "UTF-8");
327 |             sampleStream = new NameSampleDataStream(lineStream);
328 |             evaluator.evaluate(sampleStream);
329 | 
330 |             FMeasure result = evaluator.getFMeasure();
331 |             System.out.println(result.toString());
332 |         } catch (IOException ex) {
333 |             ex.printStackTrace();
334 |         }
335 |     }
336 | }
337 | 


--------------------------------------------------------------------------------
/Chapter04/src/packt/DictionaryChunker.java:
--------------------------------------------------------------------------------
 1 | package packt;
 2 | 
 3 | import com.aliasi.chunk.Chunk;
 4 | import com.aliasi.chunk.Chunker;
 5 | import com.aliasi.chunk.Chunking;
 6 | import com.aliasi.dict.DictionaryEntry;
 7 | import com.aliasi.dict.MapDictionary;
 8 | import com.aliasi.dict.TrieDictionary;
 9 | import com.aliasi.dict.Dictionary;
10 | import com.aliasi.dict.ExactDictionaryChunker;
11 | 
12 | import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
13 | 
14 | import java.util.Iterator;
15 | import java.util.Set;
16 | 
17 | public class DictionaryChunker {
18 |     private static final String sentences[] = {"Joe was the last person to see Fred. ",
19 |         "He saw him in Boston at McKenzie's pub at 3:00 where he paid "
20 |         + "$2.45 for an ale. ",
21 |         "Joe wanted to go to Vermont for the day to visit a cousin who "
22 |         + "works at IBM, but Sally and he had to look for Fred"};
23 |     static final double CHUNK_SCORE = 1.0;
24 | 
25 |     public static void main(String[] args) {
26 | 
27 |         MapDictionary<String> dictionary = new MapDictionary<String>();
28 |         dictionary.addEntry(new DictionaryEntry<String>("Joe","PERSON",CHUNK_SCORE));
29 |         dictionary.addEntry(new DictionaryEntry<String>("Fred","PERSON",CHUNK_SCORE));
30 |         dictionary.addEntry(new DictionaryEntry<String>("Boston","PLACE",CHUNK_SCORE));
31 |         dictionary.addEntry(new DictionaryEntry<String>("pub","PLACE",CHUNK_SCORE));
32 |         dictionary.addEntry(new DictionaryEntry<String>("Vermont","PLACE",CHUNK_SCORE));
33 |         dictionary.addEntry(new DictionaryEntry<String>("IBM","ORGANIZATION",CHUNK_SCORE));
34 |         dictionary.addEntry(new DictionaryEntry<String>("Sally","PERSON",CHUNK_SCORE));
35 | 
36 | 
37 |         ExactDictionaryChunker dictionaryChunkerTT
38 |             = new ExactDictionaryChunker(dictionary,
39 |                                          IndoEuropeanTokenizerFactory.INSTANCE,
40 |                                          true,true);
41 | 
42 |         ExactDictionaryChunker dictionaryChunkerTF
43 |             = new ExactDictionaryChunker(dictionary,
44 |                                          IndoEuropeanTokenizerFactory.INSTANCE,
45 |                                          true,false);
46 | 
47 |         ExactDictionaryChunker dictionaryChunkerFT
48 |             = new ExactDictionaryChunker(dictionary,
49 |                                          IndoEuropeanTokenizerFactory.INSTANCE,
50 |                                          false,true);
51 | 
52 |         ExactDictionaryChunker dictionaryChunkerFF
53 |             = new ExactDictionaryChunker(dictionary,
54 |                                          IndoEuropeanTokenizerFactory.INSTANCE,
55 |                                          false,false);
56 | 
57 | 
58 | 
59 |         System.out.println("\nDICTIONARY\n" + dictionary);
60 | 
61 |         for (int i = 0; i < sentences.length; ++i) {
62 |             String text = sentences[i];
63 |             System.out.println("\n\nTEXT=" + text);
64 | 
65 |             chunk(dictionaryChunkerTT,text);
66 |             chunk(dictionaryChunkerTF,text);
67 |             chunk(dictionaryChunkerFT,text);
68 |             chunk(dictionaryChunkerFF,text);
69 |         }
70 | 
71 |     }
72 | 
73 |     static void chunk(ExactDictionaryChunker chunker, String text) {
74 |         System.out.println("\nChunker."
75 |                            + " All matches=" + chunker.returnAllMatches()
76 |                            + " Case sensitive=" + chunker.caseSensitive());
77 |         Chunking chunking = chunker.chunk(text);
78 |         for (Chunk chunk : chunking.chunkSet()) {
79 |             int start = chunk.start();
80 |             int end = chunk.end();
81 |             String type = chunk.type();
82 |             double score = chunk.score();
83 |             String phrase = text.substring(start,end);
84 |             System.out.println("     phrase=|" + phrase + "|"
85 |                                + " start=" + start
86 |                                + " end=" + end
87 |                                + " type=" + type
88 |                                + " score=" + score);
89 |         }
90 |     }
91 | 
92 | }
93 | 
94 | 


--------------------------------------------------------------------------------
/Chapter04/src/packt/EmailRegexChunker.java:
--------------------------------------------------------------------------------
 1 | package packt;
 2 | 
 3 | import com.aliasi.chunk.RegExChunker;
 4 | 
 5 | public class EmailRegexChunker extends RegExChunker {
 6 |     // From: http://alias-i.com/lingpipe/demos/tutorial/ne/read-me.html
 7 |     public EmailRegexChunker() {
 8 |         super(EMAIL_REGEX,CHUNK_TYPE,CHUNK_SCORE);
 9 |     }
10 | 
11 |     private final static String EMAIL_REGEX
12 |         = "[A-Za-z0-9](([_\\.\\-]?[a-zA-Z0-9]+)*)@([A-Za-z0-9]+)(([\\.\\-]?[a-zA-Z0-9]+)*)\\.([A-Za-z]{2,})";
13 | 
14 |     private final static String CHUNK_TYPE = "email";
15 | 
16 |     private final static double CHUNK_SCORE = 0.0;
17 | 
18 | }
19 | 


--------------------------------------------------------------------------------
/Chapter04/src/packt/RunChunker.java:
--------------------------------------------------------------------------------
 1 |  package packt;
 2 | 
 3 | import com.aliasi.chunk.Chunker;
 4 | import com.aliasi.chunk.Chunking;
 5 | 
 6 | import com.aliasi.util.AbstractExternalizable;
 7 | 
 8 | import java.io.File;
 9 | 
10 | public class RunChunker {
11 |     private static final String sentences[] = {"Joe was the last person to see Fred. ",
12 |         "He saw him in Boston at McKenzie's pub at 3:00 where he paid "
13 |         + "$2.45 for an ale. ",
14 |         "Joe wanted to go to Vermont for the day to visit a cousin who "
15 |         + "works at IBM, but Sally and he had to look for Fred"};
16 |     
17 |     public static void main(String[] args) throws Exception {
18 | 	File modelFile = new File(getModelDir(), "ne-en-news-muc6.AbstractCharLmRescoringChunker");
19 | 
20 | 	System.out.println("Reading chunker from file=" + modelFile);
21 | 	Chunker chunker 
22 | 	    = (Chunker) AbstractExternalizable.readObject(modelFile);
23 | 
24 | 	for (int i = 0; i < sentences.length; ++i) {
25 | 	    Chunking chunking = chunker.chunk(sentences[i]);
26 | 	    System.out.println("Chunking=" + chunking);
27 | 	}
28 | 
29 |     }
30 |     
31 |     public static File getModelDir() {
32 |         return new File("C:\\Current Books in Progress\\NLP and Java\\Models");
33 |     }
34 | }
35 | 
36 | 


--------------------------------------------------------------------------------
/Chapter04/src/packt/TimeRegexChunker.java:
--------------------------------------------------------------------------------
 1 | package packt;
 2 | 
 3 | import com.aliasi.chunk.RegExChunker;
 4 | 
 5 | public class TimeRegexChunker extends RegExChunker {
 6 |     private final static String TIME_RE = 
 7 |         "(([0-1]?[0-9])|([2][0-3])):([0-5]?[0-9])(:([0-5]?[0-9]))?";
 8 |     private final static String CHUNK_TYPE = "time";
 9 |     private final static double CHUNK_SCORE = 1.0;
10 |     
11 |     public TimeRegexChunker() {
12 |         super(TIME_RE,CHUNK_TYPE,CHUNK_SCORE);
13 |     }
14 | }
15 | 


--------------------------------------------------------------------------------
/Chapter04/src/packt/TrainEntities.java:
--------------------------------------------------------------------------------
 1 | package packt;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | 
 6 | import com.aliasi.chunk.CharLmHmmChunker;
 7 | //import com.aliasi.corpus.parsers.Muc6ChunkParser;
 8 | import com.aliasi.hmm.HmmCharLmEstimator;
 9 | import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
10 | import com.aliasi.tokenizer.TokenizerFactory;
11 | import com.aliasi.util.AbstractExternalizable;
12 | 
13 | @SuppressWarnings("deprecation")
14 | public class TrainEntities {
15 | 
16 |     static final int MAX_N_GRAM = 50;
17 |     static final int NUM_CHARS = 300;
18 |     static final double LM_INTERPOLATION = MAX_N_GRAM; // default behavior
19 | 
20 |     public static void main(String[] args) throws IOException {
21 |         File corpusFile = new File("inputfile.txt");// my annotated file
22 |         File modelFile = new File("outputmodelfile.model"); 
23 | 
24 |         System.out.println("Setting up Chunker Estimator");
25 |         TokenizerFactory factory
26 |             = IndoEuropeanTokenizerFactory.INSTANCE;
27 |         HmmCharLmEstimator hmmEstimator
28 |             = new HmmCharLmEstimator(MAX_N_GRAM,NUM_CHARS,LM_INTERPOLATION);
29 |         CharLmHmmChunker chunkerEstimator
30 |             = new CharLmHmmChunker(factory,hmmEstimator);
31 | 
32 |         System.out.println("Setting up Data Parser");
33 | //        Muc6ChunkParser parser = new Muc6ChunkParser();  
34 | //        parser.setHandler( chunkerEstimator);
35 | 
36 |         System.out.println("Training with Data from File=" + corpusFile);
37 | //        parser.parse(corpusFile);
38 | 
39 |         System.out.println("Compiling and Writing Model to File=" + modelFile);
40 |         AbstractExternalizable.compileTo(chunkerEstimator,modelFile);
41 |     }
42 | 
43 | }


--------------------------------------------------------------------------------
/Chapter06/Chapter6.java:
--------------------------------------------------------------------------------
  1 | package packt;
  2 | 
  3 | import com.aliasi.classify.Classification;
  4 | import com.aliasi.classify.Classified;
  5 | import com.aliasi.classify.DynamicLMClassifier;
  6 | import com.aliasi.classify.JointClassification;
  7 | import com.aliasi.classify.JointClassifier;
  8 | import com.aliasi.classify.LMClassifier;
  9 | import com.aliasi.lm.NGramProcessLM;
 10 | import com.aliasi.util.AbstractExternalizable;
 11 | import com.aliasi.util.Compilable;
 12 | import com.aliasi.util.Files;
 13 | import edu.stanford.nlp.classify.Classifier;
 14 | import edu.stanford.nlp.classify.ColumnDataClassifier;
 15 | import edu.stanford.nlp.ie.crf.CRFClassifier;
 16 | import edu.stanford.nlp.ling.CoreAnnotations;
 17 | import edu.stanford.nlp.ling.Datum;
 18 | import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
 19 | import edu.stanford.nlp.objectbank.ObjectBank;
 20 | import edu.stanford.nlp.pipeline.Annotation;
 21 | import edu.stanford.nlp.pipeline.StanfordCoreNLP;
 22 | import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;
 23 | import edu.stanford.nlp.stats.Counter;
 24 | import edu.stanford.nlp.trees.Tree;
 25 | import edu.stanford.nlp.util.CoreMap;
 26 | import java.io.BufferedOutputStream;
 27 | import java.io.File;
 28 | import java.io.FileInputStream;
 29 | import java.io.FileOutputStream;
 30 | import java.io.IOException;
 31 | import java.io.InputStream;
 32 | import java.io.OutputStream;
 33 | import java.util.Properties;
 34 | import java.util.Set;
 35 | import opennlp.tools.doccat.DoccatModel;
 36 | import opennlp.tools.doccat.DocumentCategorizerME;
 37 | import opennlp.tools.doccat.DocumentSample;
 38 | import opennlp.tools.doccat.DocumentSampleStream;
 39 | import opennlp.tools.util.ObjectStream;
 40 | import opennlp.tools.util.PlainTextByLineStream;
 41 | 
 42 | public class Chapter6 {
 43 | 
 44 |     private static String inputText = null;
 45 |     private static String toto = "Toto belongs to Dorothy Gale, the heroine of "
 46 |             + "the first and many subsequent books. In the first "
 47 |             + "book, he never spoke, although other animals, native "
 48 |             + "to Oz, did. In subsequent books, other animals "
 49 |             + "gained the ability to speak upon reaching Oz or "
 50 |             + "similar lands, but Toto remained speechless.";
 51 |     private static String garfield = "Garfield is a comic strip created by Jim "
 52 |             + "Davis. Published since June 19, 1978, it chronicles "
 53 |             + "the life of the title character, the cat Garfield "
 54 |             + "(named after the grandfather of Davis); his owner, "
 55 |             + "Jon Arbuckle; and Jon's dog, Odie.";
 56 | 
 57 |     private static String calico = "This cat is also known as a calimanco cat or "
 58 |             + "clouded tiger cat, and by the abbreviation 'tortie'. "
 59 |             + "In the cat fancy, a tortoiseshell cat is patched "
 60 |             + "over with red (or its dilute form, cream) and black "
 61 |             + "(or its dilute blue) mottled throughout the coat.";
 62 | 
 63 |     public static void main(String[] args) {
 64 |         inputText = toto;
 65 | //        inputText = garfield;
 66 | //        inputText = calico;
 67 | //        trainingOpenNLPModel();
 68 | //        usingOpenNLP();
 69 | //        usingStandfordClassifier();
 70 | //        usingStanfordSentimentAnalysis();
 71 |         usingLingPipe();
 72 |     }
 73 | 
 74 |     private static void trainingOpenNLPModel() {
 75 |         DoccatModel model = null;
 76 |         try (InputStream dataIn = new FileInputStream("en-animal.train");
 77 |                 OutputStream dataOut = new FileOutputStream("en-animal.model");) {
 78 |             ObjectStream<String> lineStream
 79 |                     = new PlainTextByLineStream(dataIn, "UTF-8");
 80 |             ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
 81 |             model = DocumentCategorizerME.train("en", sampleStream);
 82 | 
 83 |             // Save the model
 84 |             OutputStream modelOut = null;
 85 |             modelOut = new BufferedOutputStream(dataOut);
 86 |             model.serialize(modelOut);
 87 |         } catch (IOException e) {
 88 |             e.printStackTrace();
 89 |         }
 90 |     }
 91 | 
 92 |     private static void usingOpenNLP() {
 93 |         try (InputStream modelIn = new FileInputStream(
 94 |                 new File("en-animal.model"));) {
 95 |             DoccatModel model = new DoccatModel(modelIn);
 96 |             DocumentCategorizerME categorizer = new DocumentCategorizerME(model);
 97 |             double[] outcomes = categorizer.categorize(inputText);
 98 |             for (int i = 0; i < categorizer.getNumberOfCategories(); i++) {
 99 |                 String category = categorizer.getCategory(i);
100 |                 System.out.println(category + " - " + outcomes[i]);
101 |             }
102 |             System.out.println(categorizer.getBestCategory(outcomes));
103 |             System.out.println(categorizer.getAllResults(outcomes));
104 |         } catch (IOException ex) {
105 |             ex.printStackTrace();
106 |         }
107 |     }
108 | 
109 |     public static File getModelDir() {
110 |         return new File("C:\\Current Books in Progress\\NLP and Java\\Models");
111 |     }
112 | 
113 |     private static void usingStandfordClassifier() {
114 | //        String dir = "C:/Current Books in Progress/NLP and Java/Downloads/Stanford/stanford-classifier-2014-10-26/";
115 |         ColumnDataClassifier cdc = new ColumnDataClassifier("box.prop");
116 |         Classifier<String, String> classifier
117 |                 = cdc.makeClassifier(cdc.readTrainingExamples("box.train"));
118 |         for (String line : ObjectBank.getLineIterator("box.test", "utf-8")) {
119 |             // instead of the method in the line below, if you have the individual elements
120 |             // already you can use cdc.makeDatumFromStrings(String[])
121 |             Datum<String, String> datum = cdc.makeDatumFromLine(line);
122 |             System.out.println("Datum: {" + line + "]\tPredicted Category: " + classifier.classOf(datum));
123 | //            System.out.println(" Scores: " + classifier.scoresOf(datum));
124 | //            Counter<String> counter = classifier.scoresOf(datum);
125 | //            Set<String> set = counter.keySet();
126 | //            for (String element : set) {
127 | //                System.out.printf("Scores - %-6s: %5.2f ", element, counter.getCount(element));
128 | //            }
129 | //            System.out.println();
130 |         }
131 | 
132 |  
133 |         System.out.println();
134 |         String sample[] = {"", "6.90", "9.8", "15.69"};
135 |         Datum<String, String> datum = cdc.makeDatumFromStrings(sample);
136 |         System.out.println("Category: " + classifier.classOf(datum));
137 |     }
138 | 
139 |     private static void usingStanfordSentimentAnalysis() {
140 |         String review = "An overly sentimental film with a somewhat "
141 |                 + "problematic message, but its sweetness and charm "
142 |                 + "are occasionally enough to approximate true depth "
143 |                 + "and grace. ";
144 | 
145 |         String sam = "Sam was an odd sort of fellow. Not prone to angry and "
146 |                 + "not prone to merriment. Overall, an odd fellow.";
147 |         String mary = "Mary thought that custard pie was the best pie in the "
148 |                 + "world. However, she loathed chocolate pie.";
149 |         Properties props = new Properties();
150 |         props.put("annotators", "tokenize, ssplit, parse, sentiment");
151 |         StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
152 | 
153 |         Annotation annotation = new Annotation(review);
154 |         pipeline.annotate(annotation);
155 | 
156 |         System.out.println("---sentimentText");
157 |         String[] sentimentText = {"Very Negative", "Negative", "Neutral",
158 |             "Positive", "Very Positive"};
159 |         for (CoreMap sentence : annotation.get(
160 |                 CoreAnnotations.SentencesAnnotation.class)) {
161 |             Tree tree = sentence.get(
162 |                     SentimentCoreAnnotations.AnnotatedTree.class);
163 |             System.out.println("---Number of children: " + tree.numChildren());
164 |             System.out.println("[" + tree.getChild(0) + "][" + tree.getChild(1) + "]");
165 |             tree.printLocalTree();
166 |             int score = RNNCoreAnnotations.getPredictedClass(tree);
167 |             System.out.println(sentimentText[score]);
168 |         }
169 | 
170 |         // Classifer
171 |         CRFClassifier crf
172 |                 = CRFClassifier.getClassifierNoExceptions(
173 |                         "C:/Current Books in Progress/NLP and Java/Models"
174 |                         + "/english.all.3class.distsim.crf.ser.gz");
175 |         String S1 = "Good afternoon Rajat Raina, how are you today?";
176 |         String S2 = "I go to school at Stanford University, which is located in California.";
177 |         System.out.println(crf.classifyToString(S1));
178 |         System.out.println(crf.classifyWithInlineXML(S2));
179 |         System.out.println(crf.classifyToString(S2, "xml", true));
180 | 
181 |         Object classification[] = crf.classify(S2).toArray();
182 |         for (int i = 0; i < classification.length; i++) {
183 |             System.out.println(classification[i]);
184 |         }
185 |     }
186 | 
187 |     //----------------------------------------------------------------------------
188 |     private static String[] categories
189 |             = {"soc.religion.christian",
190 |                 "talk.religion.misc",
191 |                 "alt.atheism",
192 |                 "misc.forsale"};
193 | 
194 |     private static JointClassifier<CharSequence> compiledClassifier;
195 | 
196 |     private static void usingLingPipe() {
197 | //        trainingLingPipeClassificationModels();
198 | //        usingLingPipeModels();
199 |         usingLingPipeSentimentAnalysis();
200 | //        classifyLingPipeSLanguageAnalysis();
201 |     }
202 | 
203 |     private static int nGramSize = 6;
204 |     private static DynamicLMClassifier<NGramProcessLM> classifier
205 |             = DynamicLMClassifier.createNGramProcess(categories, nGramSize);
206 | 
207 |     private static void trainingLingPipeClassificationModels() {
208 |         final String directory = "C:/Current Books/NLP and Java/Downloads/lingpipe-4.1.0/demos";
209 |         final File trainingDirectory
210 |                 = new File(directory + "/data/fourNewsGroups/4news-train");
211 | 
212 |         for (int i = 0; i < categories.length; ++i) {
213 |             final File classDir = new File(trainingDirectory, categories[i]);
214 | 
215 |             String[] trainingFiles = classDir.list();
216 |             for (int j = 0; j < trainingFiles.length; ++j) {
217 |                 try {
218 |                     File file = new File(classDir, trainingFiles[j]);
219 |                     String text = Files.readFromFile(file, "ISO-8859-1");
220 |                     Classification classification
221 |                             = new Classification(categories[i]);
222 |                     Classified<CharSequence> classified
223 |                             = new Classified<>((CharSequence)text, classification);
224 |                     classifier.handle(classified);
225 | 
226 |                 } catch (IOException ex) {
227 |                     ex.printStackTrace();
228 |                 }
229 |             }
230 |         }
231 | 
232 |         try {
233 |             AbstractExternalizable.compileTo(
234 |                     (Compilable) classifier,
235 |                     new File("classifier.model"));
236 |         } catch (IOException ex) {
237 |             ex.printStackTrace();
238 |         }
239 | 
240 |     }
241 | 
242 |     private static void usingLingPipeModels() {
243 |         String text = "Finding a home for sale has never been "
244 |                 + "easier. With Homes.com, you can search new "
245 |                 + "homes, foreclosures, multi-family homes, "
246 |                 + "as well as condos and townhouses for sale. "
247 |                 + "You can even search our real estate agent "
248 |                 + "directory to work with a professional "
249 |                 + "Realtor and find your perfect home.";
250 | //        text = "Luther taught that salvation and subsequently "
251 | //                + "eternity in heaven is not earned by good deeds "
252 | //                + "but is received only as a free gift of God's "
253 | //                + "grace through faith in Jesus Christ as redeemer "
254 | //                + "from sin and subsequently eternity in Hell.";
255 |         LMClassifier classifier = null;
256 |         try {
257 |             //        text =
258 | //                "Homeowners may employ the services of marketing or online listing companies or market their own property but do not pay a commission and represent themselves with the help of a lawyer or Solicitor (mostly in Commonwealth) throughout the sale.";
259 |             classifier = (LMClassifier) AbstractExternalizable.readObject(new File("classifier.model"));
260 |         } catch (IOException | ClassNotFoundException ex) {
261 |             ex.printStackTrace();
262 |         }
263 |         JointClassification classification
264 |                 = classifier.classify(text);
265 | 
266 |         System.out.println("---------------");
267 |         System.out.println("Text: " + text);
268 |         String bestCategory = classification.bestCategory();
269 |         System.out.println("Best Category: " + bestCategory);
270 |         for (int i = 0; i < categories.length; i++) {
271 |             double score = classification.score(i);
272 |             double probability = classification.jointLog2Probability(i);
273 |             String category = classification.category(i);
274 |             System.out.printf(" %3d  - Category: %-24s  Score: %6.2f  jointLog2Probability: %6.2f%n",
275 |                     i, category, score, probability);
276 |         }
277 | 
278 | //            }
279 | //        }        
280 |     }
281 | 
282 |     private static void usingLingPipeSentimentAnalysis() {
283 |         categories = new String[2];
284 |         categories[0] = "neg";
285 |         categories[1] = "pos";
286 |         nGramSize = 8;
287 |         classifier = DynamicLMClassifier.createNGramProcess(categories, nGramSize);
288 | 
289 |         trainingLingPipeSentimentAnalysis();
290 |         classifyLingPipeSentimentAnalysis();
291 |     }
292 | 
293 |     private static void trainingLingPipeSentimentAnalysis() {
294 |         String directory = "C:/Current Books/NLP and Java/Downloads/Sentiment Data";
295 |         File trainingDirectory = new File(directory, "txt_sentoken");
296 |         System.out.println("\nTraining.");
297 |         for (int i = 0; i < categories.length; ++i) {
298 |             Classification classification
299 |                     = new Classification(categories[i]);
300 |             File file = new File(trainingDirectory, categories[i]);
301 |             File[] trainingFiles = file.listFiles();
302 |             for (int j = 0; j < trainingFiles.length; ++j) {
303 |                 try {
304 |                     String review = Files.readFromFile(trainingFiles[j], "ISO-8859-1");
305 |                     Classified<CharSequence> classified;
306 |                     classified = new Classified<>((CharSequence)review, classification);
307 |                     classifier.handle(classified);
308 |                 } catch (IOException ex) {
309 |                     ex.printStackTrace();
310 |                 }
311 |             }
312 |         }
313 |     }
314 | 
315 |     private static void classifyLingPipeSentimentAnalysis() {
316 |         System.out.println("---------------");
317 |         //http://www.rottentomatoes.com/m/forrest_gump/
318 |         String review = "An overly sentimental film with a somewhat "
319 |                 + "problematic message, but its sweetness and charm "
320 |                 + "are occasionally enough to approximate true depth "
321 |                 + "and grace. ";
322 |         System.out.println("Text: " + review);
323 |         Classification classification
324 |                 = classifier.classify(review);
325 |         String bestCategory = classification.bestCategory();
326 |         System.out.println("Best Category: " + bestCategory);
327 | 
328 |         for (String category : classifier.categories()) {
329 |             System.out.println(category);
330 |         }
331 |     }
332 | 
333 |     private static void classifyLingPipeSLanguageAnalysis() {
334 |         System.out.println("---------------");
335 |         //http://www.rottentomatoes.com/m/forrest_gump/
336 |         String text = "An overly sentimental film with a somewhat "
337 |                 + "problematic message, but its sweetness and charm "
338 |                 + "are occasionally enough to approximate true depth "
339 |                 + "and grace. ";
340 |         text = "Svenska är ett östnordiskt språk som talas av cirka "
341 |                 + "tio miljoner personer[1], främst i Finland "
342 |                 + "och Sverige.";
343 | //        text = "¡Buenos días, clase! Good morning, class! Hola, ¿Cómo están hoy? Hello, how are you today? Adiós, ¡hasta luego! Bye, see you soon!";
344 |         System.out.println("Text: " + text);
345 |         LMClassifier classifier = null;
346 |         try {
347 |             classifier = (LMClassifier) AbstractExternalizable.readObject(
348 |                     new File("C:/Current Books/NLP and Java/Models/langid-leipzig.classifier"));
349 |         } catch (IOException | ClassNotFoundException ex) {
350 |             ex.printStackTrace();
351 |         }
352 | 
353 |         Classification classification
354 |                 = classifier.classify(text);
355 |         String bestCategory = classification.bestCategory();
356 |         System.out.println("Best Language: " + bestCategory);
357 | 
358 |         for (String category : classifier.categories()) {
359 |             System.out.println(category);
360 |         }
361 |     }
362 | }
363 | 


--------------------------------------------------------------------------------
/Chapter06/GloveExample.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter6;
 7 | 
 8 | import glove.GloVe;
 9 | import glove.objects.Cooccurrence;
10 | import glove.objects.Vocabulary;
11 | import glove.utils.Methods;
12 | import glove.utils.Options;
13 | import java.io.File;
14 | import java.util.List;
15 | import org.jblas.DoubleMatrix;
16 | 
17 | /**
18 |  *
19 |  * @author ashish
20 |  */
21 | public class GloveExample {
22 |     private static String getResourcePath(){
23 |         File currDir = new File(".");
24 |         String path = currDir .getAbsolutePath();
25 |         path = path.substring(0, path.length()-2);
26 |         System.out.println(path);
27 |             String resourcePath = path + File.separator  + "src/chapter6/";
28 |         return resourcePath;
29 |     }
30 |     public static void main(String args[]){
31 |         String file = getResourcePath() + "test.txt";
32 |         
33 |         Options options = new Options(); 
34 |         options.debug = true;
35 |         
36 |         Vocabulary vocab = GloVe.build_vocabulary(file, options);
37 |         
38 |         options.window_size = 3;
39 |         List<Cooccurrence> c =  GloVe.build_cooccurrence(vocab, file, options);
40 |         
41 |         options.iterations = 10;
42 |         options.vector_size = 10;
43 |         options.debug = true;
44 |         DoubleMatrix W = GloVe.train(vocab, c, options);  
45 | 
46 |         List<String> similars = Methods.most_similar(W, vocab, "graph", 15);
47 |         for(String similar : similars) {
48 |             System.out.println("@" + similar);
49 |         }
50 |         
51 |     }
52 |     
53 | }
54 | 


--------------------------------------------------------------------------------
/Chapter06/NGramTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter6;
 7 | 
 8 | import opennlp.tools.ngram.NGramModel;
 9 | import opennlp.tools.tokenize.WhitespaceTokenizer;
10 | import opennlp.tools.util.StringList;
11 | 
12 | /**
13 |  *
14 |  * @author ashish
15 |  */
16 | public class NGramTest {
17 |     public static void main(String args[]){
18 |         String sampletext = "This is n-gram model";
19 |         System.out.println(sampletext);
20 |         
21 |         StringList tokens = new StringList(WhitespaceTokenizer.INSTANCE.tokenize(sampletext));
22 |         System.out.println("Tokens " + tokens);
23 |         
24 |         NGramModel nGramModel = new NGramModel();
25 |         nGramModel.add(tokens,2,4); // minlength and maxlength
26 |         
27 |         System.out.println("Total ngrams: " + nGramModel.numberOfGrams());
28 |         for (StringList ngram : nGramModel) {
29 |             System.out.println(nGramModel.getCount(ngram) + " - " + ngram);
30 |         }
31 |     }
32 |     
33 | }
34 | 


--------------------------------------------------------------------------------
/Chapter06/box.prop:
--------------------------------------------------------------------------------
1 | useClassFeature=true
2 | 1.realValued=true
3 | 2.realValued=true
4 | 3.realValued=true
5 | trainFile=.box.train
6 | testFile=.box.test
7 | 


--------------------------------------------------------------------------------
/Chapter06/box.test:
--------------------------------------------------------------------------------
 1 | small	1.33	3.50	5.43
 2 | small	1.18	1.73	3.14
 3 | small	2.29	2.69	4.18
 4 | small	2.94	2.74	1.71
 5 | small	1.41	2.72	5.21
 6 | small	1.27	2.97	5.93
 7 | small	1.69	1.41	5.42
 8 | small	2.80	2.64	2.79
 9 | small	2.01	2.42	4.46
10 | small	2.15	2.66	4.55
11 | medium	4.10	6.31	8.71
12 | medium	3.15	4.85	8.23
13 | medium	3.99	4.17	6.76
14 | medium	3.29	4.56	9.31
15 | medium	4.45	5.33	8.46
16 | medium	3.60	4.77	7.74
17 | medium	3.01	6.98	10.57
18 | medium	4.10	4.92	10.40
19 | medium	4.42	4.20	8.85
20 | medium	3.15	6.53	9.26
21 | large	6.90	9.82	15.69
22 | large	7.57	10.83	15.55
23 | large	7.78	9.16	16.26
24 | large	7.81	10.80	15.86
25 | large	6.62	10.44	12.50
26 | large	7.82	8.31	15.09
27 | large	6.21	9.96	12.75
28 | large	7.57	8.46	15.25
29 | large	6.01	9.35	16.64
30 | large	6.76	9.66	15.44


--------------------------------------------------------------------------------
/Chapter06/box.train:
--------------------------------------------------------------------------------
 1 | small	2.34	1.60	1.50
 2 | small	2.28	1.19	4.26
 3 | small	1.94	1.99	3.79
 4 | small	1.41	1.89	3.10
 5 | small	1.36	1.99	4.98
 6 | small	1.71	2.60	5.09
 7 | small	1.92	3.91	1.20
 8 | small	1.17	3.14	5.69
 9 | small	2.68	3.05	2.30
10 | small	2.22	1.03	2.99
11 | small	1.44	2.73	5.73
12 | small	2.96	1.88	4.43
13 | small	2.90	1.09	1.17
14 | small	1.01	1.53	5.95
15 | small	2.47	2.06	5.79
16 | small	1.46	1.81	2.64
17 | small	2.58	3.47	1.18
18 | small	1.06	1.89	3.83
19 | small	2.51	1.18	2.83
20 | small	2.24	2.42	3.92
21 | medium	3.43	6.78	7.69
22 | medium	3.05	4.96	8.65
23 | medium	4.38	5.86	10.27
24 | medium	4.74	6.23	6.23
25 | medium	3.71	5.85	6.24
26 | medium	4.10	4.22	6.33
27 | medium	4.21	6.08	7.09
28 | medium	5.00	6.53	7.84
29 | medium	4.70	5.72	8.47
30 | medium	3.54	6.00	10.70
31 | medium	3.63	4.42	7.29
32 | medium	3.92	6.48	7.12
33 | medium	4.32	6.61	9.01
34 | medium	3.45	5.02	9.67
35 | medium	4.88	6.90	10.38
36 | medium	3.17	6.00	7.15
37 | medium	3.36	5.64	9.38
38 | medium	4.78	5.35	8.83
39 | medium	4.48	5.32	7.39
40 | medium	3.68	4.63	11.00
41 | large	6.52	8.74	14.16
42 | large	6.68	10.64	12.38
43 | large	6.19	8.48	15.12
44 | large	6.44	9.63	13.36
45 | large	6.71	9.71	16.72
46 | large	6.84	8.64	14.70
47 | large	6.21	8.50	15.70
48 | large	7.60	8.90	13.25
49 | large	6.12	9.92	12.66
50 | large	6.58	9.91	13.19
51 | large	6.03	10.63	12.25
52 | large	6.12	8.74	16.00
53 | large	6.36	9.13	14.17
54 | large	7.93	10.94	15.48
55 | large	6.40	10.21	14.36
56 | large	6.41	9.94	12.70
57 | large	6.46	10.37	12.80
58 | large	7.25	10.81	15.67
59 | large	6.04	10.22	14.28
60 | large	7.63	9.51	14.96


--------------------------------------------------------------------------------
/Chapter06/en-animal.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter06/en-animal.model


--------------------------------------------------------------------------------
/Chapter06/en-animal.train:
--------------------------------------------------------------------------------
 1 | dog The most widespread form of interspecies bonding occurs between humans and dogs and the keeping of dogs as companions, particularly by elites, has a long history. (As a possible example, at the Natufian culture site of Ain Mallaha in Israel, dated to 12,000 BC, the remains of an elderly human and a four-to-five-month-old puppy were found buried together). However, pet dog populations grew significantly after World War II as suburbanization increased. In the 1950s and 1960s, dogs were kept outside more often than they tend to be today (using the expression "in the doghouse" to describe exclusion from the group signifies the distance between the doghouse and the home) and were still primarily functional, acting as a guard, children's playmate, or walking companion. From the 1980s, there have been changes in the role of the pet dog, such as the increased role of dogs in the emotional support of their human guardians. People and dogs have become increasingly integrated and implicated in each other's lives, to the point where pet dogs actively shape the way a family and home are experienced.
 2 | dog There have been two major trends in the changing status of pet dogs. The first has been the 'commodification' of the dog, shaping it to conform to human expectations of personality and behaviour. The second has been the broadening of the concept of the family and the home to include dogs-as-dogs within everyday routines and practices.
 3 | dog There are a vast range of commodity forms available to transform a pet dog into an ideal companion. The list of goods, services and places available is enormous: from dog perfumes, couture, furniture and housing, to dog groomers, therapists, trainers and caretakers, dog cafes, spas, parks and beaches, and dog hotels, airlines and cemeteries. While dog training as an organized activity can be traced back to the 18th century, in the last decades of the 20th century it became a high profile issue as many normal dog behaviors such as barking, jumping up, digging, rolling in dung, fighting, and urine marking[further explanation needed] became increasingly incompatible with the new role of a pet dog. Dog training books, classes and television programs proliferated as the process of commodifying the pet dog continued.
 4 | dog An Australian Cattle Dog in reindeer antlers sits on Santa's lap
 5 | dog A pet dog taking part in Christmas traditions
 6 | dog The majority of contemporary people with dogs describe their pet as part of the family, although some ambivalence about the relationship is evident in the popular reconceptualization of the dog–human family as a pack. A dominance model of dog–human relationships has been promoted by some dog trainers, such as on the television program Dog Whisperer. However it has been disputed that "trying to achieve status" is characteristic of dog–human interactions. Pet dogs play an active role in family life; for example, a study of conversations in dog–human families showed how family members use the dog as a resource, talking to the dog, or talking through the dog, to mediate their interactions with each other.
 7 | dog Another study of dogs' roles in families showed many dogs have set tasks or routines undertaken as family members, the most common of which was helping with the washing-up by licking the plates in the dishwasher, and bringing in the newspaper from the lawn. Increasingly, human family members are engaging in activities centered on the perceived needs and interests of the dog, or in which the dog is an integral partner, such as Dog Dancing and Doga.
 8 | dog According to statistics published by the American Pet Products Manufacturers Association in the National Pet Owner Survey in 2009–2010, it is estimated there are 77.5 million people with pet dogs in the United States.[49] The same survey shows nearly 40% of American households own at least one dog, of which 67% own just one dog, 25% two dogs and nearly 9% more than two dogs. There does not seem to be any gender preference among dogs as pets, as the statistical data reveal an equal number of female and male dog pets. Yet, although several programs are undergoing to promote pet adoption, less than a fifth of the owned dogs come from a shelter.
 9 | dog The latest study using Magnetic resonance imaging (MRI) to humans and dogs together proved that dogs have same response to voices and use the same parts of the brain as humans to do so. This gives dogs the ability to recognize emotional human sounds, making them friendly social pets to humans.
10 | cat Cats are common pets in Europe and North America, and their worldwide population is difficult to ascertain, with estimates ranging from anywhere between 200 million to 600 million. In 1998 there were around 76 million cats in Europe, 7 million in Japan and 3 million in Australia.:4 A 2007 report stated that about 37 million US households owned cats, with an average of 2.2 cats per household giving a total population of around 82 million; in contrast, there are about 72 million pet dogs in that country. Cats exceeded dogs in number as pets in the United States in 1985 for the first time, in part because the development of kitty litter in the mid-20th century eliminated the unpleasantly powerful smell of cat urine.
11 | cat Although cat ownership has commonly been associated with women, a 2007 Gallup poll reported that men and women were equally likely to own a cat. The ratio of pedigree/purebred cats to random-bred cats varies from country to country. However, generally speaking, purebreds are less than 10% of the total population.
12 | cat The concept of a cat breed appeared in Britain during the late 19th century. The current list of cat breeds is quite large: with the Cat Fanciers' Association recognizing 41 breeds, of which 16 are "natural breeds" that probably emerged before humans began breeding pedigree cats, while the others were developed over the latter half of the 20th century. The owners and breeders of show cats compete to see whose animal bears the closest resemblance to the "ideal" definition and standard of the breed (see selective breeding). Because of common crossbreeding in populated areas, many cats are simply identified as belonging to the homogeneous breeds of domestic longhair and domestic shorthair, depending on their type of fur. In the United Kingdom and Australasia, non-purebred cats are referred in slang as moggies (derived from "Maggie", short for Margaret, reputed to have been a common name for cows and calves in 18th century England and latter applied to housecats during the Victorian era). In the United States, a non-purebred cat is sometimes referred to as a barn or alley cat, even if it is not a stray.
13 | cat Cats come in a variety of colors and patterns. These are physical properties and should not be confused with a breed of cat. Furthermore, cats may show the color and/or pattern particular to a certain breed without actually being of that breed. For example, cats may have point coloration, but not be Siamese.
14 | cat A natural behavior in cats is to hook their front claws periodically into suitable surfaces and pull backwards. Cats, like humans, keep their muscles trim by stretching. However, a cat cannot keep his claw muscles in trim by this method.[citation needed] Cats, therefore, have found another method, as described above. Additionally, such periodic scratching serves to clean and sharpen their claws. Indoor cats may benefit from being provided with a scratching post so that they are less likely to use carpet or furniture, which they can easily ruin. However, cats may simply ignore such a device. Commercial scratching posts typically are covered in carpeting or upholstery, but some authorities[who?] advise against this practice, as not making it clear to the cat which surfaces are permissible and which are not; they suggest using a plain wooden surface, or reversing the carpeting on the posts so that the rougher texture of the carpet backing is a more attractive alternative to the cat than the floor covering. However, see the comment above about claw muscles. Scratching posts made of sisal rope or corrugated cardboard are also common.
15 | cat Although scratching can serve cats to keep their claws from growing excessively long, their nails can be trimmed if necessary. Another response to indoor scratching is onychectomy, commonly known as declawing. This is a surgical procedure to remove the claw and first bone of each digit of a cat's paws. Declawing is most commonly only performed on the front feet. A related procedure is tendonectomy, which involves cutting a tendon needed for cats to extend their claws. Declawing is a major surgical procedure and can produce pain, and infections.


--------------------------------------------------------------------------------
/Chapter07/Chapter7.java:
--------------------------------------------------------------------------------
  1 | package packt;
  2 | 
  3 | import edu.stanford.nlp.dcoref.CorefChain;
  4 | import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
  5 | import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation;
  6 | import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
  7 | import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
  8 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
  9 | import edu.stanford.nlp.ling.CoreLabel;
 10 | import edu.stanford.nlp.ling.Sentence;
 11 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
 12 | import edu.stanford.nlp.pipeline.Annotation;
 13 | import edu.stanford.nlp.pipeline.StanfordCoreNLP;
 14 | import edu.stanford.nlp.process.CoreLabelTokenFactory;
 15 | import edu.stanford.nlp.process.PTBTokenizer;
 16 | import edu.stanford.nlp.process.Tokenizer;
 17 | import edu.stanford.nlp.process.TokenizerFactory;
 18 | import edu.stanford.nlp.trees.GrammaticalStructure;
 19 | import edu.stanford.nlp.trees.GrammaticalStructureFactory;
 20 | import edu.stanford.nlp.trees.Tree;
 21 | import edu.stanford.nlp.trees.TreebankLanguagePack;
 22 | import edu.stanford.nlp.trees.TypedDependency;
 23 | import edu.stanford.nlp.util.CoreMap;
 24 | import java.io.BufferedReader;
 25 | import java.io.FileInputStream;
 26 | import java.io.FileReader;
 27 | import java.io.IOException;
 28 | import java.io.InputStream;
 29 | import java.io.StringReader;
 30 | import java.util.ArrayList;
 31 | import java.util.HashSet;
 32 | import java.util.Iterator;
 33 | import java.util.List;
 34 | import java.util.Map;
 35 | import java.util.Properties;
 36 | import java.util.Set;
 37 | import opennlp.tools.cmdline.parser.ParserTool;
 38 | import opennlp.tools.parser.Parse;
 39 | import opennlp.tools.parser.ParserFactory;
 40 | import opennlp.tools.parser.ParserModel;
 41 | import opennlp.tools.parser.Parser;
 42 | import opennlp.tools.tokenize.SimpleTokenizer;
 43 | 
 44 | public class Chapter7 {
 45 | 
 46 |     public static void main(String[] args) {
 47 |         usingOpenNLP();
 48 | //        usingStanford();
 49 |     }
 50 | 
 51 |     static Set<String> nounPhrases = new HashSet<>();
 52 | 
 53 |     private static void usingOpenNLP() {
 54 |         String fileLocation = getModelDir() + "/en-parser-chunking.bin";
 55 |         System.out.println(fileLocation);
 56 |         try (InputStream modelInputStream = new FileInputStream(fileLocation);) {
 57 |             ParserModel model = new ParserModel(modelInputStream);
 58 |             Parser parser = ParserFactory.create(model);
 59 |             String sentence = "The cow jumped over the moon";
 60 |             // Used to demonstrate difference between NER and Parser
 61 |             sentence = "He was the last person to see Fred.";
 62 | 
 63 |             Parse parses[] = ParserTool.parseLine(sentence, parser, 3);
 64 |             for (Parse parse : parses) {
 65 |                 // First display
 66 |                 parse.show();
 67 |                 // Second display
 68 | //                parse.showCodeTree();
 69 |                 // Third display
 70 | //                System.out.println("Children");
 71 | //                Parse children[] = parse.getChildren();
 72 | //                for (Parse parseElement : children) {
 73 | //                    System.out.println(parseElement);
 74 | //                    System.out.println(parseElement.getText());
 75 | //                    System.out.println(parseElement.getType());
 76 | //                    Parse tags[] = parseElement.getTagNodes();
 77 | //                    System.out.println("Tags");
 78 | //                    for (Parse tag : tags) {
 79 | //                        System.out.println("[" + tag + "]" + " type: " + tag.getType()
 80 | //                                + "  Probability: " + tag.getProb()
 81 | //                                + "  Label: " + tag.getLabel());
 82 | //                    }
 83 | //                }
 84 |             }
 85 |         } catch (IOException ex) {
 86 |             ex.printStackTrace();
 87 |         }
 88 |     }
 89 | 
 90 |     public static String getModelDir() {
 91 |         return "C:/Current Books/NLP and Java/Models";
 92 |     }
 93 | 
 94 |     private static void usingStanfordParsers() {
 95 |         usingStanford();
 96 | //        usingStanfordLexicalizedParser();
 97 |     }
 98 | 
 99 |     private static void usingStanford() {
100 |         usingStanfordLexicalizedParser();
101 | //        usingStanfordRelationExtraction();
102 | //        usingStanfordCoreferenceResolution();
103 | //        extractingRelations();
104 |     }
105 | 
106 |     private static void usingStanfordLexicalizedParser() {
107 |         String parserModel = "C:/Current Books in Progress/NLP and Java/Models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
108 |         LexicalizedParser lexicalizedParser = LexicalizedParser.loadModel(parserModel);
109 | 
110 |         // This option shows parsing a list of correctly tokenized words
111 |         System.out.println("---First option");
112 |         String[] senetenceArray = {"The", "cow", "jumped", "over", "the", "moon", "."};
113 |         List<CoreLabel> words = Sentence.toCoreLabelList(senetenceArray);
114 | 
115 |         Tree parseTree = lexicalizedParser.apply(words);
116 |         parseTree.pennPrint();
117 |         System.out.println();
118 | 
119 |         // This option shows loading and using an explicit tokenizer
120 |         System.out.println("---Second option");
121 |         String sentence = "The cow jumped over the moon.";
122 |         TokenizerFactory<CoreLabel> tokenizerFactory
123 |                 = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
124 |         Tokenizer<CoreLabel> tokenizer
125 |                 = tokenizerFactory.getTokenizer(new StringReader(sentence));
126 |         List<CoreLabel> wordList = tokenizer.tokenize();
127 |         parseTree = lexicalizedParser.apply(wordList);
128 | 
129 |         TreebankLanguagePack tlp = lexicalizedParser.treebankLanguagePack(); // PennTreebankLanguagePack for English
130 |         GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
131 |         GrammaticalStructure gs = gsf.newGrammaticalStructure(parseTree);
132 |         List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
133 |         System.out.println(tdl);
134 |         for (TypedDependency dependency : tdl) {
135 |             System.out.println("Governor Word: [" + dependency.gov() + "] Relation: [" + dependency.reln().getLongName()
136 |                     + "] Dependent Word: [" + dependency.dep() + "]");
137 |         }
138 |         System.out.println();
139 | 
140 |         // You can also use a TreePrint object to print trees and dependencies
141 | //        System.out.println("---Using TreePrint");
142 | //        TreePrint treePrint = new TreePrint("penn,typedDependenciesCollapsed");
143 | //        treePrint.printTree(parseTree);
144 | //        System.out.println("TreePrint Formats");
145 | //        for (String format : TreePrint.outputTreeFormats) {
146 | //            System.out.println(format);
147 | //        }
148 | //        System.out.println();
149 |     }
150 | 
151 |     private static void usingStanfordCoreferenceResolution() {
152 |         System.out.println("StanfordCoreferenceResolution");
153 |         String sentence = "He took his cash and she took her change " 
154 |                 + "and together they bought their lunch.";
155 |         Properties props = new Properties();
156 |         props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
157 |         StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
158 | 
159 |         Annotation annotation = new Annotation(sentence);
160 |         pipeline.annotate(annotation);
161 |         System.out.println("Sentence: " + sentence);
162 |         Map<Integer, CorefChain> corefChainMap = annotation.get(CorefChainAnnotation.class);
163 | 
164 |         Set<Integer> set = corefChainMap.keySet();
165 |         Iterator<Integer> setIterator = set.iterator();
166 |         while(setIterator.hasNext()) {
167 |             CorefChain corefChain = corefChainMap.get(setIterator.next());
168 |             System.out.println("CorefChain: " + corefChain);
169 |             System.out.print("ClusterId: " + corefChain.getChainID());
170 |             CorefMention mention = corefChain.getRepresentativeMention();
171 |             System.out.println(" CorefMention: " + mention + " Span: [" + mention.mentionSpan + "]");
172 | 
173 |             List<CorefMention> mentionList = corefChain.getMentionsInTextualOrder();
174 |             Iterator<CorefMention> mentionIterator = mentionList.iterator();
175 |             while(mentionIterator.hasNext()) {
176 |                 CorefMention cfm = mentionIterator.next();
177 |                 System.out.println("\tMention: " + cfm + " Span: [" + mention.mentionSpan + "]");
178 |                 System.out.print("\tMention Type: " + cfm.mentionType + " Gender: " + cfm.gender);
179 |                 System.out.println(" Start: " + cfm.startIndex + " End: " + cfm.endIndex);
180 |             }
181 |             System.out.println();
182 |         }
183 |     }
184 | 
185 |     private static void extractingRelations() {
186 |         String question = "Who is the 32rd president of the United States?";
187 | //        question = "Who was the 32rd president of the United States?";
188 | //        question = "The 32rd president of the United States was who?";
189 | //        question = "The 32rd president is who of the United States?";
190 | //        question = "What was the 3rd President's party?";
191 | //        question = "When was the 12th president inaugurated";
192 | //        question = "Where is the 30th president's home town?";
193 | 
194 |         String parserModel = "C:/Current Books/NLP and Java/Models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
195 |         LexicalizedParser lexicalizedParser = LexicalizedParser.loadModel(parserModel);
196 | 
197 |         TokenizerFactory<CoreLabel> tokenizerFactory
198 |                 = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
199 |         Tokenizer<CoreLabel> tokenizer
200 |                 = tokenizerFactory.getTokenizer(new StringReader(question));
201 |         List<CoreLabel> wordList = tokenizer.tokenize();
202 |         Tree parseTree = lexicalizedParser.apply(wordList);
203 | 
204 |         TreebankLanguagePack tlp = lexicalizedParser.treebankLanguagePack();
205 |         GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
206 |         GrammaticalStructure gs = gsf.newGrammaticalStructure(parseTree);
207 |         List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
208 |         System.out.println(tdl);
209 |         for (TypedDependency dependency : tdl) {
210 |             System.out.println("Governor Word: [" + dependency.gov() + "] Relation: [" + dependency.reln().getLongName()
211 |                     + "] Dependent Word: [" + dependency.dep() + "]");
212 |         }
213 | 
214 |         System.out.println();
215 |         System.out.println("You asked: " + question);
216 |         for (TypedDependency dependency : tdl) {
217 |             if ("nominal subject".equals(dependency.reln().getLongName())
218 |                     && "who".equalsIgnoreCase(dependency.gov().originalText())) {
219 |                 System.out.println("Found Who question --- Governor Word: [" + dependency.gov() + "] Relation: [" + dependency.reln().getLongName()
220 |                         + "] Dependent Word: [" + dependency.dep() + "]");
221 |                 processWhoQuestion(tdl);
222 |             } else if ("nominal subject".equals(dependency.reln().getLongName())
223 |                     && "what".equalsIgnoreCase(dependency.gov().originalText())) {
224 |                 System.out.println("Found What question --- Governor Word: [" + dependency.gov() + "] Relation: [" + dependency.reln().getLongName()
225 |                         + "] Dependent Word: [" + dependency.dep() + "]");
226 |             } else if ("adverbial modifier".equals(dependency.reln().getLongName())
227 |                     && "when".equalsIgnoreCase(dependency.dep().originalText())) {
228 |                 System.out.println("Found When question --- Governor Word: [" + dependency.gov() + "] Relation: [" + dependency.reln().getLongName()
229 |                         + "] Dependent Word: [" + dependency.dep() + "]");
230 |             } else if ("adverbial modifier".equals(dependency.reln().getLongName())
231 |                     && "where".equalsIgnoreCase(dependency.dep().originalText())) {
232 |                 System.out.println("Found Where question --- Governor Word: [" + dependency.gov() + "] Relation: [" + dependency.reln().getLongName()
233 |                         + "] Dependent Word: [" + dependency.dep() + "]");
234 |             }
235 |         }
236 |     }
237 | 
238 |     private static void processWhoQuestion(List<TypedDependency> tdl) {
239 |         System.out.println("Processing Who Question");
240 |         List<President> list = createPresidentList();
241 |         for (TypedDependency dependency : tdl) {
242 |             if ("president".equalsIgnoreCase(dependency.gov().originalText())
243 |                     && "adjectival modifier".equals(dependency.reln().getLongName())) {
244 |                 String positionText = dependency.dep().originalText();
245 |                 int position = getOrder(positionText) - 1;
246 |                 System.out.println("The president is " + list.get(position).getName());
247 |             }
248 |         }
249 |     }
250 | 
251 |     private static int getOrder(String position) {
252 |         String tmp = "";
253 |         int i = 0;
254 |         while (Character.isDigit(position.charAt(i))) {
255 |             tmp += position.charAt(i++);
256 |         }
257 |         return Integer.parseInt(tmp);
258 |     }
259 | 
260 |     private static List<President> createPresidentList() {
261 |         ArrayList<President> list = new ArrayList<>();
262 |         String line = null;
263 |         try (FileReader reader = new FileReader("PresidentList");
264 |                 BufferedReader br = new BufferedReader(reader)) {
265 |             while ((line = br.readLine()) != null) {
266 |                 SimpleTokenizer simpleTokenizer = SimpleTokenizer.INSTANCE;
267 |                 String tokens[] = simpleTokenizer.tokenize(line);
268 |                 String name = "";
269 |                 String start = "";
270 |                 String end = "";
271 |                 int i = 0;
272 |                 while (!"(".equals(tokens[i])) {
273 |                     name += tokens[i] + " ";
274 |                     i++;
275 |                 }
276 |                 start = tokens[i + 1];
277 |                 end = tokens[i + 3];
278 |                 if (end.equalsIgnoreCase("present")) {
279 |                     end = start;
280 |                 }
281 |                 list.add(new President(name, Integer.parseInt(start),
282 |                         Integer.parseInt(end)));
283 |             }
284 |         } catch (IOException ex) {
285 |             ex.printStackTrace();
286 |         }
287 |         return list;
288 |     }
289 | }
290 | 


--------------------------------------------------------------------------------
/Chapter07/President.java:
--------------------------------------------------------------------------------
 1 | package packt;
 2 | 
 3 | public class President {
 4 |     private String name;
 5 |     private int start;
 6 |     private int end;
 7 | 
 8 |     public President(String name, int start, int end) {
 9 |         this.name = name;
10 |         this.start = start;
11 |         this.end = end;
12 |     }
13 | 
14 |     public int getStart() {
15 |         return start;
16 |     }
17 | 
18 |     public int getEnd() {
19 |         return end;
20 |     }
21 | 
22 |     public String getName() {
23 |         return name;
24 |     }
25 |     
26 |     
27 | }
28 | 


--------------------------------------------------------------------------------
/Chapter08/Chapter8.java:
--------------------------------------------------------------------------------
  1 | package packt;
  2 | 
  3 | import de.l3s.boilerpipe.BoilerpipeProcessingException;
  4 | import de.l3s.boilerpipe.document.TextBlock;
  5 | import de.l3s.boilerpipe.document.TextDocument;
  6 | import de.l3s.boilerpipe.sax.BoilerpipeSAXInput;
  7 | import de.l3s.boilerpipe.sax.HTMLDocument;
  8 | import de.l3s.boilerpipe.sax.HTMLFetcher;
  9 | import edu.stanford.nlp.dcoref.CorefChain;
 10 | import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation;
 11 | import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
 12 | import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
 13 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
 14 | import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
 15 | import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
 16 | import edu.stanford.nlp.ling.CoreLabel;
 17 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
 18 | import edu.stanford.nlp.pipeline.Annotation;
 19 | import edu.stanford.nlp.pipeline.StanfordCoreNLP;
 20 | import edu.stanford.nlp.semgraph.SemanticGraph;
 21 | import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation;
 22 | import edu.stanford.nlp.trees.Tree;
 23 | import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
 24 | import edu.stanford.nlp.util.CoreMap;
 25 | import java.io.BufferedReader;
 26 | import java.io.File;
 27 | import java.io.FileInputStream;
 28 | import java.io.FileNotFoundException;
 29 | import java.io.FileReader;
 30 | import java.io.IOException;
 31 | import java.io.InputStream;
 32 | import java.net.MalformedURLException;
 33 | import java.net.URL;
 34 | import java.util.ArrayList;
 35 | import java.util.HashMap;
 36 | import java.util.List;
 37 | import java.util.Map;
 38 | import java.util.Properties;
 39 | import java.util.Set;
 40 | import opennlp.tools.sentdetect.SentenceDetectorME;
 41 | import opennlp.tools.sentdetect.SentenceModel;
 42 | import opennlp.tools.tokenize.WhitespaceTokenizer;
 43 | import org.apache.pdfbox.pdmodel.PDDocument;
 44 | import org.apache.pdfbox.util.PDFTextStripper;
 45 | import org.apache.poi.POITextExtractor;
 46 | import org.apache.poi.POIXMLProperties.CoreProperties;
 47 | import org.apache.poi.POIXMLProperties.CustomProperties;
 48 | import org.apache.poi.POIXMLProperties.ExtendedProperties;
 49 | import org.apache.poi.POIXMLPropertiesTextExtractor;
 50 | import org.apache.poi.extractor.ExtractorFactory;
 51 | import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 52 | import org.apache.poi.xwpf.usermodel.XWPFDocument;
 53 | import org.apache.xmlbeans.XmlException;
 54 | import org.xml.sax.InputSource;
 55 | import org.xml.sax.SAXException;
 56 | 
 57 | public class Chapter8 {
 58 | 
 59 |     public static void main(String[] args) {
 60 |         extractingText();
 61 |         searches();
 62 |         usingStanfordPipeline();
 63 |         usingStanfordPipelineParallel();
 64 |     }
 65 | 
 66 |     private static void usingStanfordPipeline() {
 67 |         String text = "The robber took the cash and ran.";
 68 |         Properties props = new Properties();
 69 |         props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
 70 | //        String path = "C:\\Current Books\\NLP and Java\\Downloads\\stanford-ner-2014-10-26\\classifiers";
 71 | //        props.put("ner.model",path+"/english.muc.7class.distsim.crf.ser.gz");
 72 |         StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
 73 | 
 74 |         Annotation annotation = new Annotation(text);
 75 |         System.out.println("Before annotate method executed ");
 76 |         Set<Class<?>> annotationSet = annotation.keySet();
 77 |         for (Class c : annotationSet) {
 78 |             System.out.println("\tClass: " + c.getCanonicalName());
 79 |         }
 80 |         pipeline.annotate(annotation);
 81 |         System.out.println("After annotate method executed ");
 82 |         annotationSet = annotation.keySet();
 83 |         for (Class c : annotationSet) {
 84 |             System.out.println("\tClass: " + c.getCanonicalName());
 85 |         }
 86 | 
 87 |         System.out.println("Total time: " + pipeline.timingInformation());
 88 |         List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
 89 | 
 90 |         for (CoreMap sentence : sentences) {
 91 |             for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
 92 |                 String word = token.get(TextAnnotation.class);
 93 |                 System.out.println("text of the token: " + word);
 94 |                 String pos = token.get(PartOfSpeechAnnotation.class);
 95 |                 System.out.println("POS Tag: " + pos);
 96 |                 String ne = token.get(NamedEntityTagAnnotation.class);
 97 |                 System.out.println("ne: " + ne);
 98 |                 Map<Integer, CorefChain> graph
 99 |                         = token.get(CorefChainAnnotation.class);
100 |                 System.out.println("graph: " + graph);
101 |             }
102 |             Tree tree = sentence.get(TreeAnnotation.class);
103 |             System.out.println("tree: " + tree);
104 | 
105 |             SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
106 |             System.out.println("dependencies: " + dependencies);
107 | 
108 |             Map<Integer, CorefChain> graph
109 |                     = annotation.get(CorefChainAnnotation.class);
110 |             System.out.println("graph: " + graph);
111 |         }
112 |     }
113 | 
114 |     private static void usingStanfordPipelineParallel() {
115 |         Properties props = new Properties();
116 |         props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
117 |         String path = "C:\\Current Books\\NLP and Java\\Downloads\\stanford-ner-2014-10-26\\classifiers";
118 |         props.put("ner.model", path + "/english.muc.7class.distsim.crf.ser.gz");
119 |         StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
120 | 
121 |         Annotation annotation1 = new Annotation("The robber took the cash and ran.");
122 |         Annotation annotation2 = new Annotation("The policeman chased him down the street.");
123 |         Annotation annotation3 = new Annotation("A passerby, watching the action, tripped the thief as he passed by.");
124 |         Annotation annotation4 = new Annotation("They all lived happily everafter, except for the thief of course.");
125 |         ArrayList<Annotation> list = new ArrayList();
126 |         list.add(annotation1);
127 |         list.add(annotation2);
128 |         list.add(annotation3);
129 |         list.add(annotation4);
130 |         Iterable<Annotation> iterable = list;
131 | 
132 |         pipeline.annotate(iterable);
133 | 
134 |         System.out.println("Total time: " + pipeline.timingInformation());
135 |         List<CoreMap> sentences = annotation2.get(SentencesAnnotation.class);
136 | 
137 |         for (CoreMap sentence : sentences) {
138 |             for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
139 |                 String word = token.get(TextAnnotation.class);
140 |                 String pos = token.get(PartOfSpeechAnnotation.class);
141 |                 System.out.println("Word: " + word + " POS Tag: " + pos);
142 |             }
143 |         }
144 |     }
145 | 
146 |     private static void searches() {
147 |         try (InputStream is = new FileInputStream(
148 |                 new File("C:/Current Books/NLP and Java/Models/en-sent.bin"));
149 |                 FileReader fr = new FileReader("Twenty Thousands.txt");
150 |                 BufferedReader br = new BufferedReader(fr)) {
151 |             SentenceModel model = new SentenceModel(is);
152 |             SentenceDetectorME detector = new SentenceDetectorME(model);
153 |             String line;
154 |             StringBuilder sb = new StringBuilder();
155 |             while ((line = br.readLine()) != null) {
156 |                 sb.append(line + " ");
157 |             }
158 |             String sentences[] = detector.sentDetect(sb.toString());
159 |             System.out.println(sentences.length);
160 |             // Convert each character to lowercase
161 |             for (int i = 0; i < sentences.length; i++) {
162 |                 sentences[i] = sentences[i].toLowerCase();
163 |             }
164 | 
165 |             // Remove stopwords
166 |             StopWords stopWords = new StopWords("stop-words_english_2_en.txt");
167 |             for (int i = 0; i < sentences.length; i++) {
168 |                 sentences[i] = stopWords.removeStopWords(sentences[i]);
169 |             }
170 | 
171 |             // Create map
172 |             HashMap<String, Word> wordMap = new HashMap();
173 |             for (int sentenceIndex = 0; sentenceIndex < sentences.length; sentenceIndex++) {
174 |                 String words[] = WhitespaceTokenizer.INSTANCE.tokenize(sentences[sentenceIndex]);
175 |                 Word word;
176 |                 for (int wordIndex = 0; wordIndex < words.length; wordIndex++) {
177 |                     String newWord = words[wordIndex];
178 |                     if (wordMap.containsKey(newWord)) {
179 |                         word = wordMap.remove(newWord);
180 |                     } else {
181 |                         word = new Word();
182 |                     }
183 |                     word.addWord(newWord, sentenceIndex, wordIndex);
184 |                     wordMap.put(newWord, word);
185 |                 }
186 |             }
187 |             System.out.println(wordMap.size());
188 | 
189 |             // Locate word in document
190 |             Word word = wordMap.get("reef");
191 |             ArrayList<Positions> positions = word.getPositions();
192 |             for (Positions position : positions) {
193 |                 System.out.println(word.getWord() + " is found at line " 
194 |                         + position.sentence + ", word " + position.position);
195 |             }
196 |         } catch (FileNotFoundException ex) {
197 |             ex.printStackTrace();
198 |         } catch (IOException ex) {
199 |             ex.printStackTrace();
200 |         }
201 | 
202 |     }
203 | 
204 |     private static void extractingText() {
205 |         usingBoilerpipe();
206 |         usingPOI();
207 |         usingPDFBox();
208 |     }
209 | 
210 |     private static void usingBoilerpipe() {
211 |         try {
212 |             URL url = new URL("http://en.wikipedia.org/wiki/Berlin");
213 |             HTMLDocument htmlDoc = HTMLFetcher.fetch(url);
214 |             InputSource is = htmlDoc.toInputSource();
215 |             TextDocument document
216 |                     = new BoilerpipeSAXInput(is).getTextDocument();
217 | 
218 |             System.out.println(document.getText(true, true));
219 | 
220 |             System.out.println("--------------------------------");
221 |             List<TextBlock> blocks = document.getTextBlocks();
222 |             for (TextBlock block : blocks) {
223 |                 System.out.println(block.isContent());
224 |                 System.out.println(block.getText());
225 |                 System.out.println(block.getNumWords());
226 |                 System.out.println("------");
227 |             }
228 |         } catch (MalformedURLException ex) {
229 |             ex.printStackTrace();
230 |         } catch (BoilerpipeProcessingException | SAXException | IOException ex) {
231 |             ex.printStackTrace();
232 |         }
233 |     }
234 | 
235 |     private static void usingPDFBox() {
236 |         try {
237 |             File file = new File("TestDocument.pdf");
238 |             PDDocument pdDocument = PDDocument.load(file);
239 |             PDFTextStripper stripper = new PDFTextStripper();
240 |             String text = stripper.getText(pdDocument);
241 |             System.out.println(text);
242 |             pdDocument.close();
243 |         } catch (IOException ex) {
244 |             ex.printStackTrace();
245 |         }
246 |     }
247 | 
248 |     private static void usingPOI() {
249 |         try {
250 |             FileInputStream fis = new FileInputStream("TestDocument.docx");
251 |             POITextExtractor textExtractor = ExtractorFactory.createExtractor(fis);
252 |             System.out.println(textExtractor.getText());
253 |             
254 |             POITextExtractor metaExtractor = textExtractor.getMetadataTextExtractor();
255 |             System.out.println(metaExtractor.getText());
256 |             System.out.println();
257 |             
258 |             fis = new FileInputStream("TestDocument.docx");
259 |             POIXMLPropertiesTextExtractor properties = new POIXMLPropertiesTextExtractor(new XWPFDocument(fis));
260 |             System.out.println(properties.getText());
261 |             System.out.println();
262 |             
263 |             CoreProperties coreProperties = properties.getCoreProperties();
264 |             System.out.println("Core Properties");
265 |             System.out.println(properties.getCorePropertiesText());
266 |             
267 |             System.out.println();
268 |             System.out.println("Creator: " + coreProperties.getCreator());
269 |             System.out.println("Date Created: " + coreProperties.getCreated());
270 |             System.out.println("Date Last Modified: " + coreProperties.getModified());
271 |             
272 |             System.out.println();
273 |             System.out.println("Extended Properties");
274 |             ExtendedProperties extendedProperties = properties.getExtendedProperties();
275 |             System.out.println(properties.getExtendedPropertiesText());
276 |             System.out.println();
277 |             System.out.println("Application: " + extendedProperties.getApplication());
278 |             System.out.println("Application Version: " + extendedProperties.getAppVersion());
279 |             System.out.println("Pages: " + extendedProperties.getPages());
280 |             
281 |             System.out.println();    
282 |             System.out.println("Custom Properties: " );
283 |             System.out.println(properties.getCustomPropertiesText());
284 |         } catch (IOException ex) {
285 |             ex.printStackTrace();
286 |         } catch (OpenXML4JException | XmlException ex) {
287 |             ex.printStackTrace();
288 |         }
289 |     }
290 | }
291 | 


--------------------------------------------------------------------------------
/Chapter08/Positions.java:
--------------------------------------------------------------------------------
 1 | package packt;
 2 | 
 3 | class Positions {
 4 |     int sentence;
 5 |     int position;
 6 | 
 7 |     Positions(int sentence, int position) {
 8 |         this.sentence = sentence;
 9 |         this.position = position;
10 |     }
11 | }
12 | 


--------------------------------------------------------------------------------
/Chapter08/StopWords.java:
--------------------------------------------------------------------------------
 1 | package packt;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.FileReader;
 5 | import java.io.IOException;
 6 | import java.util.ArrayList;
 7 | import java.util.Arrays;
 8 | import java.util.HashSet;
 9 | import java.util.Iterator;
10 | import opennlp.tools.tokenize.WhitespaceTokenizer;
11 | 
12 | public class StopWords {
13 | 
14 |     private String[] defaultStopWords = {"i", "a", "about", "an", "are", "as", "at",
15 |         "be", "by", "com", "for", "from", "how", "in", "is", "it", "of", "on",
16 |         "or", "that", "the", "this", "to", "was", "what", "when", "where",
17 |         "who", "will", "with"};
18 | 
19 |     private static HashSet stopWords = new HashSet();
20 | 
21 |     public StopWords() {
22 |         stopWords.addAll(Arrays.asList(defaultStopWords));
23 |     }
24 | 
25 |     public StopWords(String fileName) {
26 |         try {
27 |             BufferedReader bufferedreader
28 |                     = new BufferedReader(new FileReader(fileName));
29 |             String line = null;
30 |             while ((line = bufferedreader.readLine()) != null) {
31 | //                line = bufferedreader.readLine();
32 |                 System.out.println("---Adding: [" + line + "]" + (int)line.charAt(0));
33 |                 stopWords.add(line);
34 |             }
35 |         } catch (IOException ex) {
36 |             ex.printStackTrace();
37 |         }
38 |     }
39 | 
40 |     public void addStopWord(String word) {
41 |         stopWords.add(word);
42 |     }
43 | 
44 |     public String[] removeStopWords(String[] words) {
45 |         ArrayList<String> tokens = new ArrayList<String>(Arrays.asList(words));
46 |         for (int i = 0; i < tokens.size(); i++) {
47 | //            System.out.println(stopWords.contains(tokens.get(i)) + " " + tokens.get(i));
48 |             if (stopWords.contains(tokens.get(i))) {
49 |                 tokens.remove(i);
50 |             }
51 |         }
52 |         return (String[]) tokens.toArray(new String[tokens.size()]);
53 |     }
54 | 
55 |     public String removeStopWords(String words) {
56 |         String arr[] = WhitespaceTokenizer.INSTANCE.tokenize(words);
57 |         StringBuilder sb = new StringBuilder();
58 | //        ArrayList<String> tokens = new ArrayList<String>(Arrays.asList(arr));
59 |         for (int i = 0; i < arr.length; i++) {
60 | //            System.out.println(tokens.get(i) + "-");
61 |             if (stopWords.contains(arr[i])) {
62 | //                tokens.remove(i);
63 | //                System.out.println("Removing: [" + arr[i] + "]");
64 |             } else {
65 |                 sb.append(arr[i]+" ");
66 |             }
67 |         }
68 |         return sb.toString();
69 |     }
70 | 
71 |     public void displayStopWords() {
72 |         Iterator<String> iterator = stopWords.iterator();
73 |         while (iterator.hasNext()) {
74 |             System.out.print("[" + iterator.next() + "]  ");
75 |         }
76 |     }
77 | }
78 | 


--------------------------------------------------------------------------------
/Chapter08/Word.java:
--------------------------------------------------------------------------------
 1 | package packt;
 2 | 
 3 | import java.util.ArrayList;
 4 | 
 5 | public class Word {
 6 |     private String word;
 7 |     private final ArrayList<Positions> positions;
 8 | 
 9 |     public Word() {
10 |         this.positions = new ArrayList();
11 |     }
12 |     
13 |     public void addWord(String word, int sentence, int position) {
14 |         this.word = word;
15 |         Positions counts = new Positions(sentence, position);
16 |         positions.add(counts);
17 |     }
18 | 
19 |     public ArrayList<Positions> getPositions() {
20 |         return positions;
21 |     }
22 | 
23 |     public String getWord() {
24 |         return word;
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/Chapter09/TestMallet.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter9;
 7 | 
 8 | /**
 9 |  *
10 |  * @author ashish
11 |  */
12 | 
13 | class MyData{
14 |     
15 | }
16 | public class TestMallet {
17 |     public static void main(String args[]){
18 |         
19 |     }
20 |     
21 | }
22 | 


--------------------------------------------------------------------------------
/Chapter10/CoreferenceDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter10;
 7 | 
 8 | import edu.stanford.nlp.coref.CorefCoreAnnotations.CorefChainAnnotation;
 9 | import edu.stanford.nlp.pipeline.Annotation;
10 | import edu.stanford.nlp.coref.data.CorefChain;
11 | import edu.stanford.nlp.coref.data.CorefChain.CorefMention;
12 | import edu.stanford.nlp.pipeline.StanfordCoreNLP;
13 | import java.io.File;
14 | import java.util.Iterator;
15 | import java.util.List;
16 | import java.util.Map;
17 | import java.util.Properties;
18 | import java.util.Set;
19 | import static org.jdom2.filter.Filters.document;
20 | 
21 | /**
22 |  *
23 |  * @author ashish
24 |  */
25 | public class CoreferenceDemo {
26 |     private static String getResourcePath(){
27 |         File currDir = new File(".");
28 |         String path = currDir .getAbsolutePath();
29 |         path = path.substring(0, path.length()-2);
30 |         System.out.println(path);
31 |             String resourcePath = path + File.separator  + "src/chapter10/";
32 |         return resourcePath;
33 |     }
34 |     public static void main(String args[]){
35 |         String sentence = "He took his cash and she took her change "  
36 |             + "and together they bought their lunch."; 
37 |         Properties props = new Properties();
38 |         props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
39 |         StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
40 |         Annotation annotation = new Annotation(sentence);
41 |         pipeline.annotate(annotation);
42 | //        Map<Integer, CorefChain> corefChainMap =  annotation.get(CorefChainAnnotation.class); 
43 | 
44 |         Map<Integer, CorefChain> corefChainMap = annotation.get(CorefChainAnnotation.class);
45 |         Set<Integer> set = corefChainMap.keySet(); 
46 |         Iterator<Integer> setIterator = set.iterator(); 
47 |         while(setIterator.hasNext()) { 
48 |             CorefChain corefChain = corefChainMap.get(setIterator.next()); 
49 |             System.out.println("CorefChain: " + corefChain); 
50 |             System.out.print("ClusterId: " + corefChain.getChainID()); 
51 |             CorefMention mention = corefChain.getRepresentativeMention(); 
52 |             System.out.println(" CorefMention: " + mention  
53 |                 + " Span: [" + mention.mentionSpan + "]"); 
54 | 
55 |             List<CorefMention> mentionList =  
56 |                 corefChain.getMentionsInTextualOrder(); 
57 |             Iterator<CorefMention> mentionIterator =  
58 |                 mentionList.iterator(); 
59 |             while(mentionIterator.hasNext()) { 
60 |                 CorefMention cfm = mentionIterator.next(); 
61 |                 System.out.println("tMention: " + cfm  
62 |                     + " Span: [" + mention.mentionSpan + "]"); 
63 |                 System.out.print("tMention Mention Type: "  
64 |                     + cfm.mentionType + " Gender: " + cfm.gender); 
65 |                 System.out.println(" Start: " + cfm.startIndex  
66 |                     + " End: " + cfm.endIndex); 
67 |             } 
68 |             System.out.println(); 
69 |         } 
70 |         
71 |         
72 |         
73 |     }
74 | }
75 | 


--------------------------------------------------------------------------------
/Chapter10/DemoParsing.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter10;
 7 | 
 8 | import java.io.File;
 9 | import java.io.FileInputStream;
10 | import java.io.FileNotFoundException;
11 | import java.io.IOException;
12 | import java.io.InputStream;
13 | import java.util.logging.Level;
14 | import java.util.logging.Logger;
15 | import opennlp.tools.cmdline.parser.ParserTool;
16 | import opennlp.tools.parser.Parse;
17 | import opennlp.tools.parser.Parser;
18 | import opennlp.tools.parser.ParserFactory;
19 | import opennlp.tools.parser.ParserModel;
20 | 
21 | /**
22 |  *
23 |  * @author ashish
24 |  */
25 | public class DemoParsing {
26 |     private static String getResourcePath(){
27 |         File currDir = new File(".");
28 |         String path = currDir .getAbsolutePath();
29 |         path = path.substring(0, path.length()-2);
30 |         System.out.println(path);
31 |             String resourcePath = path + File.separator  + "src/chapter10/";
32 |         return resourcePath;
33 |     }
34 |     public static void main(String args[]){
35 |         String fileLocation = getResourcePath() + "en-parser-chunking.bin";
36 |         try {
37 |             InputStream modelInputStream = new FileInputStream(fileLocation);
38 |             ParserModel model = new ParserModel(modelInputStream);
39 |             Parser parser = ParserFactory.create(model);
40 |             String sentence = "The cow jumped over the moon"; 
41 |             Parse parses[] = ParserTool.parseLine(sentence, parser, 3); 
42 |             for(Parse parse : parses) { 
43 |                 parse.show(); 
44 |                 System.out.println("Probability: " + parse.getProb()); 
45 |                 parse.showCodeTree();
46 |             
47 |                 Parse children[] = parse.getChildren(); 
48 |                 for (Parse parseElement : children) { 
49 |                     System.out.println(parseElement.getText()); 
50 |                     System.out.println(parseElement.getType()); 
51 |                     Parse tags[] = parseElement.getTagNodes(); 
52 |                     System.out.println("Tags"); 
53 |                     for (Parse tag : tags) { 
54 |                         System.out.println("[" + tag + "]"  
55 |                             + " type: " + tag.getType()  
56 |                             + "  Probability: " + tag.getProb()  
57 |                             + "  Label: " + tag.getLabel()); 
58 |                     } 
59 |                 }
60 |             } 
61 |             
62 |         } catch (FileNotFoundException ex) {
63 |             ex.printStackTrace();
64 |         } catch (IOException ex) {
65 |             ex.printStackTrace();
66 |         }
67 |     }
68 |     
69 | }
70 | 


--------------------------------------------------------------------------------
/Chapter10/President.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * To change this license header, choose License Headers in Project Properties.
  3 |  * To change this template file, choose Tools | Templates
  4 |  * and open the template in the editor.
  5 |  */
  6 | package chapter10;
  7 | 
  8 | import edu.stanford.nlp.trees.TypedDependency;
  9 | import java.io.BufferedReader;
 10 | import java.io.File;
 11 | import java.io.FileReader;
 12 | import java.io.IOException;
 13 | import java.util.ArrayList;
 14 | import java.util.List;
 15 | import opennlp.tools.tokenize.SimpleTokenizer;
 16 | 
 17 | /**
 18 |  *
 19 |  * @author ashish
 20 |  */
 21 | public class President {
 22 |     private String name; 
 23 |     private int start; 
 24 |     private int end; 
 25 |  
 26 |     public President(){
 27 |         
 28 |     }
 29 |     public President(String name, int start, int end) { 
 30 |         this.name = name; 
 31 |         this.start = start; 
 32 |         this.end = end; 
 33 |     } 
 34 | 
 35 |     public String getName() {
 36 |         return name;
 37 |     }
 38 | 
 39 |     public int getStart() {
 40 |         return start;
 41 |     }
 42 | 
 43 |     public int getEnd() {
 44 |         return end;
 45 |     }
 46 |     
 47 |     
 48 |     private static int getOrder(String position) { 
 49 |         String tmp = ""; 
 50 |         int i = 0; 
 51 |         while (Character.isDigit(position.charAt(i))) { 
 52 |             tmp += position.charAt(i++); 
 53 |         } 
 54 |         return Integer.parseInt(tmp); 
 55 |     } 
 56 |     
 57 |     public void processWhoQuestion(List<TypedDependency> tdl) { 
 58 |     List<President> list = createPresidentList(); 
 59 |     for (TypedDependency dependency : tdl) { 
 60 |         if ("president".equalsIgnoreCase( 
 61 |                 dependency.gov().originalText()) 
 62 |                 && "adjectival modifier".equals( 
 63 |                   dependency.reln().getLongName())) { 
 64 |             String positionText =  
 65 |                 dependency.dep().originalText(); 
 66 |             int position = getOrder(positionText)-1; 
 67 |             System.out.println("The president is "  
 68 |                 + list.get(position).getName()); 
 69 |          } 
 70 |         } 
 71 |     } 
 72 |     private static String getResourcePath(){
 73 |         File currDir = new File(".");
 74 |         String path = currDir .getAbsolutePath();
 75 |         path = path.substring(0, path.length()-2);
 76 |         System.out.println(path);
 77 |             String resourcePath = path + File.separator  + "src/chapter10/";
 78 |         return resourcePath;
 79 |     }
 80 |     public List<President> createPresidentList() { 
 81 |     ArrayList<President> list = new ArrayList<>(); 
 82 |     String line = null; 
 83 |     try (FileReader reader = new FileReader(getResourcePath() + "PresidentList"); 
 84 |             BufferedReader br = new BufferedReader(reader)) { 
 85 |         while ((line = br.readLine()) != null) { 
 86 |             System.out.println(">>>>>>>>>>>>." + line);
 87 |             SimpleTokenizer simpleTokenizer =  
 88 |                 SimpleTokenizer.INSTANCE; 
 89 |             String tokens[] = simpleTokenizer.tokenize(line); 
 90 |             for(int i=0;i<tokens.length;i++)
 91 |                 System.out.println(tokens[i]);
 92 |             String name = ""; 
 93 |             String start = ""; 
 94 |             String end = ""; 
 95 |             int i = 0; 
 96 |             while (!tokens[i].equals("(")) { 
 97 |                 name += tokens[i] + " "; 
 98 |                 i++; 
 99 |             } 
100 |             start = tokens[i + 1]; 
101 |             end = tokens[i + 3]; 
102 |             if (end.equalsIgnoreCase("present")) { 
103 |                 end = start; 
104 |             } 
105 |             list.add(new President(name,  
106 |                 Integer.parseInt(start), 
107 |                 Integer.parseInt(end))); 
108 |         } 
109 |      } catch (IOException ex) { 
110 |         // Handle exceptions 
111 |     } 
112 |     return list; 
113 | } 
114 | }
115 | 


--------------------------------------------------------------------------------
/Chapter10/StanfordLexicalDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter10;
 7 | 
 8 | import edu.stanford.nlp.ling.CoreLabel;
 9 | import edu.stanford.nlp.ling.SentenceUtils;
10 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
11 | import edu.stanford.nlp.process.CoreLabelTokenFactory;
12 | import edu.stanford.nlp.process.PTBTokenizer;
13 | import edu.stanford.nlp.process.Tokenizer;
14 | import edu.stanford.nlp.process.TokenizerFactory;
15 | import edu.stanford.nlp.trees.GrammaticalStructure;
16 | import edu.stanford.nlp.trees.GrammaticalStructureFactory;
17 | import edu.stanford.nlp.trees.Tree;
18 | import edu.stanford.nlp.trees.TreePrint;
19 | import edu.stanford.nlp.trees.TreebankLanguagePack;
20 | import edu.stanford.nlp.trees.TypedDependency;
21 | import java.io.File;
22 | import java.io.StringReader;
23 | import java.util.List;
24 | 
25 | /**
26 |  *
27 |  * @author ashish
28 |  */
29 | public class StanfordLexicalDemo {
30 |     private static String getResourcePath(){
31 |         File currDir = new File(".");
32 |         String path = currDir .getAbsolutePath();
33 |         path = path.substring(0, path.length()-2);
34 |         System.out.println(path);
35 |             String resourcePath = path + File.separator  + "src/chapter10/";
36 |         return resourcePath;
37 |     }
38 |     
39 |     public static void main(String args[]){
40 |         String parseModel = getResourcePath() + "englishPCFG.ser.gz";
41 |         LexicalizedParser lexicalizedParser = LexicalizedParser.loadModel(parseModel);
42 |         String [] sentenceArray = {"The", "cow" ,"jumped", "over", "the", "moon", "."};
43 |         List<CoreLabel> words = SentenceUtils.toCoreLabelList(sentenceArray);
44 |         Tree parseTree = lexicalizedParser.apply(words); 
45 |         parseTree.pennPrint(); 
46 |         
47 |         TreePrint treePrint =  new TreePrint("typedDependenciesCollapsed"); 
48 |         treePrint.printTree(parseTree); 
49 |         
50 |         
51 |         String sentence = "The cow jumped over the moon."; 
52 |         TokenizerFactory<CoreLabel> tokenizerFactory =  PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); 
53 |         Tokenizer<CoreLabel> tokenizer =  tokenizerFactory.getTokenizer(new StringReader(sentence)); 
54 |         List<CoreLabel> wordList = tokenizer.tokenize(); 
55 |         parseTree = lexicalizedParser.apply(wordList); 
56 |         TreebankLanguagePack tlp =  lexicalizedParser.treebankLanguagePack(); 
57 |         GrammaticalStructureFactory gsf =  tlp.grammaticalStructureFactory(); 
58 |         GrammaticalStructure gs =  gsf.newGrammaticalStructure(parseTree); 
59 |         List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); 
60 |         System.out.println(tdl); 
61 |         
62 |         for(TypedDependency dependency : tdl) { 
63 |             System.out.println("Governor Word: [" + dependency.gov()  
64 |                 + "] Relation: [" + dependency.reln().getLongName() 
65 |                 + "] Dependent Word: [" + dependency.dep() + "]"); 
66 |         } 
67 |         
68 |     }
69 | }
70 | 


--------------------------------------------------------------------------------
/Chapter10/WordDependencyDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter10;
 7 | 
 8 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
 9 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
10 | import edu.stanford.nlp.process.CoreLabelTokenFactory;
11 | import edu.stanford.nlp.process.PTBTokenizer;
12 | import edu.stanford.nlp.process.Tokenizer;
13 | import edu.stanford.nlp.process.TokenizerFactory;
14 | import edu.stanford.nlp.ling.CoreLabel;
15 | import edu.stanford.nlp.trees.GrammaticalStructure;
16 | import edu.stanford.nlp.trees.GrammaticalStructureFactory;
17 | import edu.stanford.nlp.trees.Tree;
18 | import edu.stanford.nlp.trees.TreebankLanguagePack;
19 | import edu.stanford.nlp.trees.TypedDependency;
20 | import java.io.File;
21 | import java.io.StringReader;
22 | import java.util.List;
23 | 
24 | /**
25 |  *
26 |  * @author ashish
27 |  */
28 | public class WordDependencyDemo {
29 |     private static String getResourcePath(){
30 |         File currDir = new File(".");
31 |         String path = currDir .getAbsolutePath();
32 |         path = path.substring(0, path.length()-2);
33 |         System.out.println(path);
34 |             String resourcePath = path + File.separator  + "src/chapter10/";
35 |         return resourcePath;
36 |     }
37 |     
38 |     public static void main(String args[]){
39 |         String question = "Who is the 32nd president of the United States?"; 
40 |         String parseModel = getResourcePath() + "englishPCFG.ser.gz";
41 |         
42 |         LexicalizedParser lexicalizedParser = LexicalizedParser.loadModel(parseModel);
43 |         TokenizerFactory<CoreLabel> tokenizerFactory =  PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); 
44 |         Tokenizer<CoreLabel> tokenizer =  tokenizerFactory.getTokenizer(new StringReader(question)); 
45 |         List<CoreLabel> wordList = tokenizer.tokenize(); 
46 |         Tree parseTree = lexicalizedParser.apply(wordList); 
47 | 
48 |         TreebankLanguagePack tlp =  lexicalizedParser.treebankLanguagePack(); 
49 |         GrammaticalStructureFactory gsf =  tlp.grammaticalStructureFactory(); 
50 |         GrammaticalStructure gs =  gsf.newGrammaticalStructure(parseTree); 
51 |         List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); 
52 |         System.out.println(tdl); 
53 |         for (TypedDependency dependency : tdl) { 
54 |             System.out.println("Governor Word: [" + dependency.gov()  
55 |                 + "] Relation: [" + dependency.reln().getLongName() 
56 |                 + "] Dependent Word: [" + dependency.dep() + "]"); 
57 |         } 
58 |         
59 |         for (TypedDependency dependency : tdl) { 
60 |             if ("nominal subject".equals( dependency.reln().getLongName()) 
61 |                 && "who".equalsIgnoreCase( dependency.gov().originalText())) { 
62 |                 President p = new President();
63 |                 p.processWhoQuestion(tdl); 
64 |             } 
65 |         } 
66 |         
67 |     }
68 | }
69 | 


--------------------------------------------------------------------------------
/Chapter11/HTMLExtractorDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter11;
 7 | 
 8 | import de.l3s.boilerpipe.BoilerpipeProcessingException;
 9 | import de.l3s.boilerpipe.document.TextDocument;
10 | import de.l3s.boilerpipe.sax.BoilerpipeSAXInput;
11 | import de.l3s.boilerpipe.sax.HTMLFetcher;
12 | import java.net.MalformedURLException;
13 | import java.net.URL;
14 | import java.util.logging.Level;
15 | import java.util.logging.Logger;
16 | import de.l3s.boilerpipe.sax.HTMLDocument;
17 | import java.io.IOException;
18 | import org.xml.sax.InputSource;
19 | import org.xml.sax.SAXException;
20 | 
21 | /**
22 |  *
23 |  * @author ashish
24 |  */
25 | public class HTMLExtractorDemo {
26 |     public static void main(String args[]){
27 |         try{
28 |             URL url = new URL("https://en.wikipedia.org/wiki/Berlin");
29 |             HTMLDocument htmldoc = HTMLFetcher.fetch(url);
30 |             InputSource is = htmldoc.toInputSource();
31 |             TextDocument document = new BoilerpipeSAXInput(is).getTextDocument();
32 |             System.out.println(document.getText(true, true));
33 |         } catch (MalformedURLException ex) {
34 |             System.out.println(ex);
35 |             Logger.getLogger(HTMLExtractorDemo.class.getName()).log(Level.SEVERE, null, ex);
36 |         } catch (IOException ex) {
37 |             System.out.println(ex);
38 |             Logger.getLogger(HTMLExtractorDemo.class.getName()).log(Level.SEVERE, null, ex);
39 |         } catch (SAXException | BoilerpipeProcessingException ex) {
40 |             System.out.println(ex);
41 |             Logger.getLogger(HTMLExtractorDemo.class.getName()).log(Level.SEVERE, null, ex);
42 |         }
43 |     }
44 |     
45 | }
46 | 


--------------------------------------------------------------------------------
/Chapter11/PDFExtractor.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter11;
 7 | 
 8 | import java.io.File;
 9 | import java.io.IOException;
10 | import org.apache.pdfbox.pdmodel.PDDocument;
11 | import org.apache.pdfbox.text.PDFTextStripper;
12 | 
13 | /**
14 |  *
15 |  * @author ashish
16 |  */
17 | public class PDFExtractor {
18 |     private static String getResourcePath(){
19 |         File currDir = new File(".");
20 |         String path = currDir .getAbsolutePath();
21 |         path = path.substring(0, path.length()-2);
22 |         String resourcePath = path + File.separator  + "src/chapter11/TestDocument.pdf";
23 |         return resourcePath;
24 |     }
25 |     public static void main(String args[]){
26 |         try{
27 |         File file = new File(getResourcePath());
28 |         PDDocument pd = PDDocument.load(file);
29 |         PDFTextStripper stripper = new PDFTextStripper();
30 |         String text= stripper.getText(pd);
31 |         System.out.println(text);
32 |         }
33 |         catch(IOException ex){
34 |             System.out.println(ex);
35 |         }
36 |     }
37 | }
38 | 


--------------------------------------------------------------------------------
/Chapter11/PipelineDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter11;
 7 | 
 8 | import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
 9 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
10 | import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
11 | import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
12 | import edu.stanford.nlp.ling.CoreLabel;
13 | import edu.stanford.nlp.pipeline.Annotation;
14 | import edu.stanford.nlp.pipeline.StanfordCoreNLP;
15 | import edu.stanford.nlp.util.CoreMap;
16 | import java.util.ArrayList;
17 | import java.util.List;
18 | import java.util.Properties;
19 | import java.util.Set;
20 | 
21 | /**
22 |  *
23 |  * @author ashish
24 |  */
25 | public class PipelineDemo {
26 |     public static void main(String args[]){
27 |         String text = "The robber took the cash and ran";
28 |         Properties props = new Properties();
29 |         props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
30 |         StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
31 |         Annotation annotation = new Annotation(text);
32 |         
33 |         System.out.println("Before annotate method executed ");
34 |         Set<Class<?>> annotationSet = annotation.keySet();
35 |         for(Class c : annotationSet) {
36 |             System.out.println("\tClass: " + c.getName());
37 |         }
38 | 
39 |         pipeline.annotate(annotation);
40 | 
41 |         System.out.println("After annotate method executed ");
42 |         annotationSet = annotation.keySet();
43 |         for(Class c : annotationSet) {
44 |             System.out.println("\tClass: " + c.getName());
45 |         }
46 |         List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
47 |         for (CoreMap sentence : sentences) {
48 |             for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
49 |                 String word = token.get(TextAnnotation.class); 
50 |                 String pos = token.get(PartOfSpeechAnnotation.class); 
51 |                 System.out.println(word);
52 |                 System.out.println(pos);
53 |             }
54 |         }
55 |         
56 |         
57 |         
58 |         Annotation annotation1 = new Annotation("The robber took the cash and ran.");
59 |         Annotation annotation2 = new Annotation("The policeman chased him down the street.");
60 |         Annotation annotation3 = new Annotation("A passerby, watching the action, tripped the thief "
61 |             + "as he passed by.");
62 |         Annotation annotation4 = new Annotation("They all lived happily ever after, except for the thief "
63 |             + "of course.");
64 |         
65 |         ArrayList<Annotation> list = new ArrayList();
66 |         list.add(annotation1);
67 |         list.add(annotation2);
68 |         list.add(annotation3);
69 |         list.add(annotation4);
70 |         Iterable<Annotation> iterable = list;
71 |         pipeline.annotate(iterable);
72 |         List<CoreMap> sentences1 = annotation2.get(SentencesAnnotation.class);
73 | 
74 |         for (CoreMap sentence : sentences1) {
75 |             for (CoreLabel token : 
76 |                     sentence.get(TokensAnnotation.class)) {
77 |                 String word = token.get(TextAnnotation.class);
78 |                 String pos = token.get(PartOfSpeechAnnotation.class);
79 |                 System.out.println("Word: " + word + " POS Tag: " + pos);
80 |             }
81 |         }
82 |     }
83 | }
84 | 


--------------------------------------------------------------------------------
/Chapter11/SearchText.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * To change this license header, choose License Headers in Project Properties.
  3 |  * To change this template file, choose Tools | Templates
  4 |  * and open the template in the editor.
  5 |  */
  6 | package chapter11;
  7 | 
  8 | import java.io.BufferedReader;
  9 | import java.io.File;
 10 | import java.io.FileInputStream;
 11 | import java.io.FileNotFoundException;
 12 | import java.io.FileReader;
 13 | import java.io.IOException;
 14 | import java.io.InputStream;
 15 | import java.util.ArrayList;
 16 | import java.util.HashMap;
 17 | import java.util.logging.Level;
 18 | import java.util.logging.Logger;
 19 | import opennlp.tools.sentdetect.SentenceDetectorME;
 20 | import opennlp.tools.sentdetect.SentenceModel;
 21 | import opennlp.tools.tokenize.WhitespaceTokenizer;
 22 | 
 23 | /**
 24 |  *
 25 |  * @author ashish
 26 |  */
 27 | class StopWords
 28 | {
 29 |     
 30 |     public String removeStopWords(String words) {
 31 |         String arr[] = WhitespaceTokenizer.INSTANCE.tokenize(words);
 32 |         StringBuilder sb = new StringBuilder();
 33 |         for (int i = 0; i < arr.length; i++) {
 34 |             if (words.contains(arr[i])) {
 35 |                 // Do nothing
 36 |             } else {
 37 |                 sb.append(arr[i]+" ");
 38 |             }
 39 |         }
 40 |         return sb.toString();
 41 |     }
 42 | }
 43 | public class SearchText {
 44 |     private static String getResourcePath(){
 45 |         File currDir = new File(".");
 46 |         String path = currDir .getAbsolutePath();
 47 |         path = path.substring(0, path.length()-2);
 48 |         String resourcePath = path + File.separator  + "src/chapter11/";
 49 |         return resourcePath;
 50 |     }
 51 |     
 52 |     public static void main(String args[]){
 53 |         try {
 54 |             InputStream is = new FileInputStream(new File(getResourcePath() + "en-sent.bin"));
 55 |             FileReader fr = new FileReader(getResourcePath() + "pg164.txt");
 56 |             BufferedReader br = new BufferedReader(fr);
 57 |             System.out.println(getResourcePath() + "en-sent.bin");
 58 |             SentenceModel model = new SentenceModel(is);
 59 |             SentenceDetectorME detector = new SentenceDetectorME(model);
 60 |             
 61 |             String line;
 62 |             StringBuilder sb = new StringBuilder();
 63 |             while((line = br.readLine())!=null){
 64 |                 sb.append(line + " ");
 65 |             }
 66 |             String sentences[] = detector.sentDetect(sb.toString());
 67 |             for (int i = 0; i < sentences.length; i++) {
 68 |                 sentences[i] = sentences[i].toLowerCase();
 69 |             }
 70 |             
 71 | //            StopWords stopWords = new StopWords("stop-words_english_2_en.txt");
 72 | //            for (int i = 0; i < sentences.length; i++) {
 73 | //                sentences[i] = stopWords.removeStopWords(sentences[i]);
 74 | //            }
 75 |             
 76 |             HashMap<String, Word> wordMap = new HashMap();
 77 |             for (int sentenceIndex = 0; sentenceIndex < sentences.length; sentenceIndex++) {
 78 |             String words[] = WhitespaceTokenizer.INSTANCE.tokenize(sentences[sentenceIndex]);
 79 |             Word word;
 80 |             for (int wordIndex = 0; 
 81 |                     wordIndex < words.length; wordIndex++) {
 82 |                 String newWord = words[wordIndex];
 83 |                 if (wordMap.containsKey(newWord)) {
 84 |                      word = wordMap.remove(newWord);
 85 |                 } else {
 86 |                     word = new Word();
 87 |                 }
 88 |                 word.addWord(newWord, sentenceIndex, wordIndex);
 89 |                 wordMap.put(newWord, word);
 90 |             }
 91 | //            for(String k : wordMap.keySet()){
 92 | //                System.out.println(k);
 93 | //            }
 94 |             Word sword = wordMap.get("sea");
 95 |             ArrayList<Positions> positions = sword.getPositions();
 96 |             for (Positions position : positions) {
 97 |                 System.out.println(sword.getWord() + " is found at line " 
 98 |                     + position.sentence + ", word " 
 99 |                     + position.position);
100 |             }
101 |         }
102 | 
103 |         } catch (FileNotFoundException ex) {
104 |             Logger.getLogger(SearchText.class.getName()).log(Level.SEVERE, null, ex);
105 |         } catch (IOException ex) {
106 |             Logger.getLogger(SearchText.class.getName()).log(Level.SEVERE, null, ex);
107 |         }
108 |     }
109 | }
110 | 


--------------------------------------------------------------------------------
/Chapter11/TikaDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter11;
 7 | 
 8 | import java.io.File;
 9 | import java.io.IOException;
10 | import java.io.InputStream;
11 | import java.util.Arrays;
12 | import java.util.logging.Level;
13 | import java.util.logging.Logger;
14 | import org.apache.tika.Tika;
15 | import org.apache.tika.exception.TikaException;
16 | import org.apache.tika.metadata.Metadata;
17 | 
18 | /**
19 |  *
20 |  * @author ashish
21 |  */
22 | public class TikaDemo {
23 |     private static String getResourcePath(){
24 |         File currDir = new File(".");
25 |         String path = currDir .getAbsolutePath();
26 |         path = path.substring(0, path.length()-2);
27 |         String resourcePath = path + File.separator  + "src/chapter11/TestDocument.pdf";
28 |         return resourcePath;
29 |     }
30 |     public static void main(String args[]){
31 |         Tika tika = new Tika();
32 |         try{
33 |             File file = new File(getResourcePath());            
34 |             String filetype = tika.detect(file);
35 |             
36 |             System.out.println(filetype);
37 |             System.out.println(tika.parseToString(file));
38 |             
39 |             
40 |         } catch (IOException ex) {
41 |             Logger.getLogger(TikaDemo.class.getName()).log(Level.SEVERE, null, ex);
42 |         } catch (TikaException ex) {
43 |             Logger.getLogger(TikaDemo.class.getName()).log(Level.SEVERE, null, ex);
44 |         }
45 |     }
46 | }
47 | 


--------------------------------------------------------------------------------
/Chapter11/Word.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter11;
 7 | 
 8 | import java.util.ArrayList;
 9 | 
10 | /**
11 |  *
12 |  * @author ashish
13 |  */
14 | class Positions {
15 |     int sentence;
16 |     int position;
17 | 
18 |     Positions(int sentence, int position) {
19 |         this.sentence = sentence;
20 |         this.position = position;
21 |     }
22 | }
23 | 
24 | 
25 | public class Word {
26 |     private String word;
27 |     private final ArrayList<Positions> positions;
28 | 
29 |     public Word() {
30 |         this.positions = new ArrayList();
31 |     }
32 | 
33 |     public void addWord(String word, int sentence, 
34 |             int position) {
35 |         this.word = word;
36 |         Positions counts = new Positions(sentence, position);
37 |         positions.add(counts);
38 |     }
39 | 
40 |     public ArrayList<Positions> getPositions() {
41 |         return positions;
42 |     }
43 | 
44 |     public String getWord() {
45 |         return word;
46 |     }
47 | }
48 | 


--------------------------------------------------------------------------------
/Chapter11/WordDocExtractor.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter11;
 7 | 
 8 | import java.io.File;
 9 | import java.io.FileInputStream;
10 | import java.io.FileNotFoundException;
11 | import java.io.IOException;
12 | import java.util.logging.Level;
13 | import java.util.logging.Logger;
14 | import org.apache.poi.POITextExtractor;
15 | import org.apache.poi.POIXMLProperties.CoreProperties;
16 | import org.apache.poi.POIXMLProperties.ExtendedProperties;
17 | import org.apache.poi.POIXMLPropertiesTextExtractor;
18 | import org.apache.poi.extractor.ExtractorFactory;
19 | import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
20 | import org.apache.poi.xwpf.usermodel.XWPFDocument;
21 | import org.apache.xmlbeans.XmlException;
22 | 
23 | /**
24 |  *
25 |  * @author ashish
26 |  */
27 | public class WordDocExtractor {
28 |     private static String getResourcePath(){
29 |         File currDir = new File(".");
30 |         String path = currDir .getAbsolutePath();
31 |         path = path.substring(0, path.length()-2);
32 |         String resourcePath = path + File.separator  + "src/chapter11/TestDocument.docx";
33 |         return resourcePath;
34 |     }
35 |     public static void main(String args[]){
36 |         try {
37 |             FileInputStream fis = new FileInputStream(getResourcePath());
38 |             POITextExtractor textExtractor = ExtractorFactory.createExtractor(fis);
39 |             System.out.println(textExtractor.getText());
40 |             
41 |             POITextExtractor metaExtractor = textExtractor.getMetadataTextExtractor();
42 |             System.out.println(metaExtractor.getText());
43 |             fis = new FileInputStream(getResourcePath());
44 |             POIXMLPropertiesTextExtractor properties = new POIXMLPropertiesTextExtractor(new XWPFDocument(fis));
45 |             CoreProperties coreProperties = properties.getCoreProperties();
46 |             System.out.println(properties.getCorePropertiesText());
47 | 
48 |             ExtendedProperties extendedProperties = properties.getExtendedProperties();
49 |             System.out.println(properties.getExtendedPropertiesText());
50 |             
51 |         } catch (FileNotFoundException ex) {
52 |             Logger.getLogger(WordDocExtractor.class.getName()).log(Level.SEVERE, null, ex);
53 |         } catch (IOException ex) {
54 |             Logger.getLogger(WordDocExtractor.class.getName()).log(Level.SEVERE, null, ex);
55 |         } catch (OpenXML4JException | XmlException ex) {
56 |             Logger.getLogger(WordDocExtractor.class.getName()).log(Level.SEVERE, null, ex);
57 |         }
58 |         
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/Chapter12/GenerateAIML.java:
--------------------------------------------------------------------------------
 1 | package chapter12;
 2 | 
 3 | /*
 4 |  * To change this license header, choose License Headers in Project Properties.
 5 |  * To change this template file, choose Tools | Templates
 6 |  * and open the template in the editor.
 7 |  */
 8 | 
 9 | 
10 | import java.io.File;
11 | import org.alicebot.ab.Bot;
12 | import org.alicebot.ab.MagicBooleans;
13 | 
14 | /**
15 |  *
16 |  * @author ashish
17 |  */
18 | public class GenerateAIML {
19 |     
20 |         private static final boolean TRACE_MODE = false;
21 |         static String botName = "appointment";
22 |  
23 |     public static void main(String[] args) {
24 |         try {
25 |  
26 |             String resourcesPath = getResourcesPath();
27 |             System.out.println(resourcesPath);
28 |             MagicBooleans.trace_mode = TRACE_MODE;
29 |             Bot bot = new Bot("appointment", resourcesPath);
30 |              
31 |             bot.writeAIMLFiles();
32 |  
33 |         } catch (Exception e) {
34 |             e.printStackTrace();
35 |         }
36 |     }
37 |  
38 |     private static String getResourcesPath(){
39 |         File currDir = new File(".");
40 |         String path = currDir .getAbsolutePath();
41 |         path = path.substring(0, path.length()-2);
42 |         System.out.println(path);
43 |             String resourcePath = path + File.separator  + "src/chapter12/mybot";
44 |         return resourcePath;
45 |     }
46 | }
47 |     
48 |     
49 | 
50 | 


--------------------------------------------------------------------------------
/Chapter12/Mychatbotdemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter12;
 7 | 
 8 | import java.io.File;
 9 | import org.alicebot.ab.Bot;
10 | import org.alicebot.ab.Chat;
11 | import org.alicebot.ab.History;
12 | import org.alicebot.ab.MagicBooleans;
13 | import org.alicebot.ab.MagicStrings;
14 | import org.alicebot.ab.utils.IOUtils;
15 | 
16 | /**
17 |  *
18 |  * @author ashish
19 |  */
20 | class MyChat{
21 |     
22 | }
23 | 
24 | public class Mychatbotdemo {
25 |     private static final boolean TRACE_MODE = false;
26 |     static String botName = "appointment";
27 |     private static String getResourcePath(){
28 |         File currDir = new File(".");
29 |         String path = currDir .getAbsolutePath();
30 |         path = path.substring(0, path.length()-2);
31 |         System.out.println(path);
32 |             String resourcePath = path + File.separator  + "src/chapter12/mybot";
33 |         return resourcePath;
34 |     }
35 |     public static void main(String args[]){
36 |         try
37 |         {
38 |             String resourcePath = getResourcePath();
39 |             System.out.println(resourcePath);
40 |             MagicBooleans.trace_mode = TRACE_MODE;
41 |             Bot bot = new Bot(botName, resourcePath);
42 |             Chat chatSession = new Chat(bot);
43 |             bot.brain.nodeStats();
44 |             String textLine = "";
45 |             System.out.println("Robot : Hello, I am your appointment scheduler May i know your name");
46 |             while(true){
47 |                 
48 |                 System.out.println("Human : ");
49 |                 textLine = IOUtils.readInputTextLine();
50 |                 if ((textLine==null) || (textLine.length()<1)){
51 |                     textLine = MagicStrings.null_input;
52 |                 }
53 |                 if(textLine.equals("q")){
54 |                     System.exit(0);
55 |                 } else if (textLine.equals("wq")){
56 |                     bot.writeQuit();
57 |                 } else {
58 |                     String request = textLine;
59 |                     if(MagicBooleans.trace_mode)
60 |                         System.out.println("STATE=" + request + ":THAT" + ((History)chatSession.thatHistory.get(0)).get(0) + ": Topic" + chatSession.predicates.get("topic"));
61 |                     String response = chatSession.multisentenceRespond(request);
62 |                     while(response.contains("&lt;"))
63 |                         response = response.replace("&lt;", "<");
64 |                     while(response.contains("&gt"))
65 |                         response = response.replace("&gt;", ">");
66 |                     System.out.println("Robot : " + response);
67 |                 }
68 |             }
69 |         }
70 |         catch(Exception e){
71 |             e.printStackTrace();
72 |         }
73 |         
74 |     }
75 | }
76 | 


--------------------------------------------------------------------------------
/Chapter12/Test.java:
--------------------------------------------------------------------------------
  1 | ///*
  2 | // * To change this license header, choose License Headers in Project Properties.
  3 | // * To change this template file, choose Tools | Templates
  4 | // * and open the template in the editor.
  5 | // */
  6 | //package chapter12;
  7 | //
  8 | ///**
  9 | // *
 10 | // * @author ashish
 11 | // */
 12 | //import org.alicebot.ab.*;
 13 | //
 14 | //import java.io.*;
 15 | //import java.util.HashMap;
 16 | //
 17 | //
 18 | public class Test {
 19 | //
 20 |     public static void main (String[] args) {
 21 | //
 22 | //
 23 | //
 24 | ////        MagicStrings.setRootPath();
 25 | //
 26 | //        AIMLProcessor.extension =  new PCAIMLProcessorExtension();
 27 | //        mainFunction(args);
 28 | //    }
 29 | //    public static void mainFunction (String[] args) {
 30 | //        String botName = "alice2";
 31 | //        MagicBooleans.jp_tokenize = false;
 32 | //        MagicBooleans.trace_mode = true;
 33 | //        String action="chat";
 34 | //        System.out.println(MagicStrings.program_name_version);
 35 | //        for (String s : args) {
 36 | //            //System.out.println(s);
 37 | //            String[] splitArg = s.split("=");
 38 | //            if (splitArg.length >= 2) {
 39 | //                String option = splitArg[0];
 40 | //                String value = splitArg[1];
 41 | //                //if (MagicBooleans.trace_mode) System.out.println(option+"='"+value+"'");
 42 | //                if (option.equals("bot")) botName = value;
 43 | //                if (option.equals("action")) action = value;
 44 | //                if (option.equals("trace")) {
 45 | //                    if (value.equals("true")) MagicBooleans.trace_mode = true;
 46 | //                    else MagicBooleans.trace_mode = false;
 47 | //                }
 48 | //                if (option.equals("morph")) {
 49 | //                    if (value.equals("true")) MagicBooleans.jp_tokenize = true;
 50 | //                    else {
 51 | //                        MagicBooleans.jp_tokenize = false;
 52 | //                    }
 53 | //                }
 54 | //             }
 55 | //        }
 56 | //        if (MagicBooleans.trace_mode) System.out.println("Working Directory = " + MagicStrings.root_path);
 57 | //        Graphmaster.enableShortCuts = true;
 58 | //        //Timer timer = new Timer();
 59 | //        Bot bot = new Bot(botName, MagicStrings.root_path, action); //
 60 | //        //EnglishNumberToWords.makeSetMap(bot);
 61 | //        //getGloss(bot, "c:/ab/data/wn30-lfs/wne-2006-12-06.xml");
 62 | //        if (MagicBooleans.make_verbs_sets_maps) Verbs.makeVerbSetsMaps(bot);
 63 | //        //bot.preProcessor.normalizeFile("c:/ab/data/log2.txt", "c:/ab/data/log2normal.txt");
 64 | //        //System.exit(0);
 65 | //        if (bot.brain.getCategories().size() < MagicNumbers.brain_print_size) bot.brain.printgraph();
 66 | //        if (MagicBooleans.trace_mode) System.out.println("Action = '"+action+"'");
 67 | //        if (action.equals("chat") || action.equals("chat-app")) {
 68 | //			boolean doWrites = ! action.equals("chat-app");
 69 | //			TestAB.testChat(bot, doWrites, MagicBooleans.trace_mode);
 70 | //		}
 71 | //        //else if (action.equals("test")) testSuite(bot, MagicStrings.root_path+"/data/find.txt");
 72 | //        else if (action.equals("ab")) TestAB.testAB(bot, TestAB.sample_file);
 73 | //        else if (action.equals("aiml2csv") || action.equals("csv2aiml")) convert(bot, action);
 74 | //        else if (action.equals("abwq")){AB ab = new AB(bot, TestAB.sample_file);  ab.abwq();}
 75 | //		else if (action.equals("test")) { TestAB.runTests(bot, MagicBooleans.trace_mode);     }
 76 | //        else if (action.equals("shadow")) { MagicBooleans.trace_mode = false; bot.shadowChecker();}
 77 | //        else if (action.equals("iqtest")) { ChatTest ct = new ChatTest(bot);
 78 | //                try {
 79 | //                    ct.testMultisentenceRespond();
 80 | //                }
 81 | //            catch (Exception ex) { ex.printStackTrace(); }
 82 | //            }
 83 | //        else System.out.println("Unrecognized action "+action);
 84 | //    }
 85 | //    public static void convert(Bot bot, String action) {
 86 | //        if (action.equals("aiml2csv")) bot.writeAIMLIFFiles();
 87 | //        else if (action.equals("csv2aiml")) bot.writeAIMLFiles();
 88 | //    }
 89 | //
 90 | //
 91 | //    public static void getGloss (Bot bot, String filename) {
 92 | //        System.out.println("getGloss");
 93 | //        try{
 94 | //            // Open the file that is the first
 95 | //            // command line parameter
 96 | //            File file = new File(filename);
 97 | //            if (file.exists()) {
 98 | //                FileInputStream fstream = new FileInputStream(filename);
 99 | //                // Get the object
100 | //                getGlossFromInputStream(bot, fstream);
101 | //                fstream.close();
102 | //            }
103 | //        }catch (Exception e){//Catch exception if any
104 | //            System.err.println("Error: " + e.getMessage());
105 | //        }
106 | //    }
107 | //    public static void getGlossFromInputStream (Bot bot, InputStream in)  {
108 | //        System.out.println("getGlossFromInputStream");
109 | //        BufferedReader br = new BufferedReader(new InputStreamReader(in));
110 | //        String strLine;
111 | //        int cnt = 0;
112 | //        int filecnt = 0;
113 | //        HashMap<String, String> def = new HashMap<String, String>();
114 | //        try {
115 | //            //Read File Line By Line
116 | //            String word; String gloss;
117 | //            word = null;
118 | //            gloss = null;
119 | //            while ((strLine = br.readLine()) != null)   {
120 | //
121 | //                if (strLine.contains("<entry word")) {
122 | //                    int start = strLine.indexOf("<entry word=\"")+"<entry word=\"".length();
123 | //                    //int end = strLine.indexOf(" status=");
124 | //                    int end = strLine.indexOf("#");
125 | //
126 | //                    word = strLine.substring(start, end);
127 | //                    word = word.replaceAll("_"," ");
128 | //                    System.out.println(word);
129 | //
130 | //                }
131 | //                else  if (strLine.contains("<gloss>")) {
132 | //                    gloss = strLine.replaceAll("<gloss>","");
133 | //                    gloss = gloss.replaceAll("</gloss>","");
134 | //                    gloss = gloss.trim();
135 | //                    System.out.println(gloss);
136 | //
137 | //                }
138 | //
139 | //
140 | //                if (word != null && gloss != null) {
141 | //                    word = word.toLowerCase().trim();
142 | //                    if (gloss.length() > 2) gloss = gloss.substring(0, 1).toUpperCase()+gloss.substring(1, gloss.length());
143 | //                    String definition;
144 | //                    if (def.keySet().contains(word))  {
145 | //                        definition = def.get(word);
146 | //                        definition = definition+"; "+gloss;
147 | //                    }
148 | //                    else definition = gloss;
149 | //                    def.put(word, definition);
150 | //                    word = null;
151 | //                    gloss = null;
152 | //                }
153 | //            }
154 | //            Category d = new Category(0,"WNDEF *","*","*","unknown","wndefs"+filecnt+".aiml");
155 | //            bot.brain.addCategory(d);
156 | //            for (String x : def.keySet()) {
157 | //                word = x;
158 | //                gloss = def.get(word)+".";
159 | //                cnt++;
160 | //                if (cnt%5000==0) filecnt++;
161 | //
162 | //                Category c = new Category(0,"WNDEF "+word,"*","*",gloss,"wndefs"+filecnt+".aiml");
163 | //                System.out.println(cnt+" "+filecnt+" "+c.inputThatTopic()+":"+c.getTemplate()+":"+c.getFilename());
164 | //                Nodemapper node;
165 | //                if ((node = bot.brain.findNode(c)) != null) node.category.setTemplate(node.category.getTemplate()+","+gloss);
166 | //                bot.brain.addCategory(c);
167 | //
168 | //
169 | //            }
170 | //        } catch (Exception ex) {
171 | //            ex.printStackTrace();
172 | //        }
173 | //    }
174 | //
175 | //    public static void sraixCache (String filename, Chat chatSession) {
176 | //        int limit = 1000;
177 | //        try {
178 | //            FileInputStream fstream = new FileInputStream(filename);
179 | //            // Get the object
180 | //            BufferedReader br = new BufferedReader(new InputStreamReader(fstream));
181 | //            String strLine;
182 | //            //Read File Line By Line
183 | //            int count = 0;
184 | //            while ((strLine = br.readLine()) != null && count++ < limit) {
185 | //                System.out.println("\n\nHuman: " + strLine);
186 | //
187 | //                String response = chatSession.multisentenceRespond(strLine);
188 | //                System.out.println("\nVasudev : " + response);
189 | //            }
190 | //        } catch (Exception ex) {
191 | //            ex.printStackTrace();
192 | //        }
193 |     }
194 | 
195 | 
196 | }
197 | 


--------------------------------------------------------------------------------
/Chapter12/TestClass.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package chapter12;
 7 | 
 8 | 
 9 | import java.io.File;
10 | import org.alicebot.ab.MagicBooleans;
11 | import org.alicebot.ab.Bot;
12 | import org.alicebot.ab.Chat;
13 | import org.alicebot.ab.History;
14 | import org.alicebot.ab.MagicStrings;
15 | import org.alicebot.ab.utils.IOUtils;
16 | 
17 | /**
18 |  *
19 |  * @author ashish
20 |  */
21 | public class TestClass {
22 |     private static final boolean TRACE_MODE = false;
23 |     static String botName = "super";
24 |     
25 |     private static String getResourcePath(){
26 |         File currDir = new File(".");
27 |         String path = currDir .getAbsolutePath();
28 |         path = path.substring(0, path.length()-2);
29 |         System.out.println(path);
30 |         String resourcePath = path + File.separator  + "src/chapter12" +  File.separator + "resources";
31 |         return resourcePath;
32 |     }
33 |     public static void main(String args[])
34 |     {
35 |         try
36 |         {
37 |             String resourcePath = getResourcePath();
38 |             System.out.println(resourcePath);
39 |             MagicBooleans.trace_mode = TRACE_MODE;
40 |             Bot bot = new Bot("super", resourcePath);
41 |             Chat chatSession = new Chat(bot);
42 |             bot.brain.nodeStats();
43 |             String textLine = "";
44 |            
45 |             while(true){
46 |                 System.out.println("Human : ");
47 |                 textLine = IOUtils.readInputTextLine();
48 |                 if ((textLine==null) || (textLine.length()<1)){
49 |                     textLine = MagicStrings.null_input;
50 |                 }
51 |                 if(textLine.equals("q")){
52 |                     System.exit(0);
53 |                 } else if (textLine.equals("wq")){
54 |                     bot.writeQuit();
55 |                 } else {
56 |                     String request = textLine;
57 |                     if(MagicBooleans.trace_mode)
58 |                         System.out.println("STATE=" + request + ":THAT" + ((History)chatSession.thatHistory.get(0)).get(0) + ": Topic" + chatSession.predicates.get("topic"));
59 |                     String response = chatSession.multisentenceRespond(request);
60 |                     while(response.contains("&lt;"))
61 |                         response = response.replace("&lt;", "<");
62 |                     while(response.contains("&gt"))
63 |                         response = response.replace("&gt;", ">");
64 |                     System.out.println("Robot : " + response);
65 |                 }
66 |             }
67 |         }
68 |         catch(Exception e){
69 |             e.printStackTrace();
70 |         }
71 |     }
72 |     
73 | }
74 | 


--------------------------------------------------------------------------------
/Chapter12/mybot.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-with-Java-Second-Edition/f235777262c48f4bbb4e52e45c4936896471080c/Chapter12/mybot.zip


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Packt
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Natural Language Processing with Java Second Edition
 2 | 
 3 | <a href="https://www.packtpub.com/big-data-and-business-intelligence/natural-language-processing-java-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781789347999"><img src="https://dz13w8afd47il.cloudfront.net/sites/default/files/imagecache/ppv4_main_book_cover/B10786.png" alt="Book Name" height="256px" align="right"></a>
 4 | 
 5 | This is the code repository for [Natural Language Processing with Java Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/natural-language-processing-java-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781789347999), published by Packt.
 6 | 
 7 | **Techniques for building machine learning and neural network models for NLP**
 8 | 
 9 | ## What is this book about?
10 | Natural Language Processing (NLP) allows you to take any sentence and identify patterns, special names, company names, and more. The second edition of Natural Language Processing with Java teaches you how to perform language analysis with the help of Java libraries, while constantly gaining insights from the outcomes.
11 | 
12 | This book covers the following exciting features:
13 | * Understand basic NLP tasks and how they relate to one another
14 | * Discover and use the available tokenization engines
15 | * Apply search techniques to find people, as well as things, within a document
16 | * Construct solutions to identify parts of speech within sentences
17 | * Use parsers to extract relationships between elements of a document
18 | 
19 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1788993497) today!
20 | 
21 | <a href="https://www.packtpub.com/?utm_source=github&utm_medium=banner&utm_campaign=GitHubBanner"><img src="https://raw.githubusercontent.com/PacktPublishing/GitHub/master/GitHub.png" 
22 | alt="https://www.packtpub.com/" border="5" /></a>
23 | 
24 | 
25 | ## Instructions and Navigations
26 | All of the code is organized into folders. For example, Chapter02.
27 | 
28 | The code will look like the following:
29 | ```
30 | System.out.println(tagger.tagString("AFAIK she H8 cth!"));
31 | System.out.println(tagger.tagString(
32 |  "BTW had a GR8 tym at the party BBIAM."));
33 | ```
34 | 
35 | **Following is what you need for this book:**
36 | Natural Language Processing with Java is for you if you are a data analyst, data scientist, or machine learning engineer who wants to extract information from a language using Java. Knowledge of Java programming is needed, while a basic understanding of statistics will be useful but not mandatory.
37 | 
38 | With the following software and hardware list you can run all code files present in the book (Chapter 1-12).
39 | 
40 | ### Software and Hardware List
41 | 
42 | | Chapter  | Software required                   | OS required                        |
43 | | -------- | ------------------------------------| -----------------------------------|
44 | | 1        | OpenNLP                             |Windows, Mac OS X, and Linux (Any)  |
45 | |          | Stanford CoreNLP                    |                                    |
46 | |          | LingPipe                            |                                    |
47 | |          | Standford Tagger                    |                                    |
48 | |          |                                     |                                    |
49 | | 2        | OpenNLP Models                      | Windows, Mac OS X, and Linux (Any) |
50 | | 3        | LingPipe Models                     | Windows, Mac OS X, and Linux (Any) |
51 | | 4        | OpenNLPModels                       | Windows, Mac OS X, and Linux (Any) |
52 | | 5        | Gate Twitter Model                  |                                    |
53 | |          |   LingPipe POS Models               | Windows, Mac OS X, and Linux (Any) |
54 | | 6        | Stanford Classifier                 | Windows, Mac OS X, and Linux (Any) |
55 | | 8-12     | Boilerpipe                          |                                    |
56 | |          |   POI                               |                                    |
57 | |          |  PDFBox                             | Windows, Mac OS X, and Linux (Any) |
58 | 
59 | 
60 | 
61 | 
62 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](http://www.packtpub.com/sites/default/files/downloads/NaturalLanguageProcessingwithJavaSecondEdition_ColorImages.pdf).
63 | 
64 | ### Related products <Paste books from the Other books you may enjoy section>
65 | * Java Deep Learning Projects [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/java-deep-learning-projects?utm_source=github&utm_medium=repository&utm_campaign=9781788997454) [[Amazon]](https://www.amazon.com/dp/178899745X)
66 | 
67 | * Hands-On Natural Language Processing with Python [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/hands-natural-language-processing-python?utm_source=github&utm_medium=repository&utm_campaign=9781789139495) [[Amazon]](https://www.amazon.com/dp/178913949X)
68 | 
69 | ## Get to Know the Authors
70 | **Richard M. Reese**
71 | has worked in both industry and academia. For 17 years, he worked in the telephone and aerospace industries, serving in several capacities, including research and development, software development, supervision, and training. He currently teaches at Tarleton State University. Richard has written several Java books and a C Pointer book. He uses a concise and easy-to-follow approach to teaching about topics. His Java books have addressed EJB 3.1, updates to Java 7 and 8, certification, functional programming, jMonkeyEngine, and natural language processing.
72 | 
73 | **AshishSingh Bhatia**
74 | is a learner, reader, seeker, and developer at core. He has over 10 years of IT experience in different domains, including banking, ERP, and education. He is persistently passionate about Python, Java, R, and web and mobile development. He is always ready to explore new technologies.
75 | 
76 | 
77 | ## Other books by the authors
78 | * [Java for Data Science](https://www.packtpub.com/big-data-and-business-intelligence/java-data-science?utm_source=github&utm_medium=repository&utm_campaign=9781785280115)
79 | * [Machine Learning with R Cookbook, Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-r-cookbook-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781785280115)
80 | 
81 | ### Suggestions and Feedback
82 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions.
83 | 


--------------------------------------------------------------------------------