├── .gitignore ├── README.md ├── data ├── brown_dict │ ├── brown_dict.csv │ ├── brown_dict_updated.csv │ ├── brown_tags.csv │ ├── np_tags.csv │ ├── pos_trans.csv │ └── ptb_tags.csv └── entity-graph │ ├── db-acronyms.txt │ ├── db-article.txt │ ├── gpe_syns-updated.csv │ ├── org_syns-updated.csv │ └── person_syns-updated.csv ├── nbproject ├── private │ └── private.xml ├── project.properties └── project.xml └── src ├── book ├── ch02.py ├── ch02_ex.py ├── ch03.py ├── ch03_ex.py ├── ch04_ex.py ├── ch05.py ├── ch05_ex.py ├── ch06.py ├── ch07.py ├── ch07_ex.py ├── ch08.py ├── ch09.py └── ch10.py ├── brown_dict ├── dict_build.py ├── phrase_seqs.py └── predict.py ├── cener ├── bootstrap.py ├── ce_phrases.txt ├── cener.py ├── cener_lib.py ├── cnet_reviews.txt ├── cnet_reviews_sents.txt └── test.txt ├── docsim ├── blogdoctest.py ├── docsim.py ├── scam_dist.py └── sugar-coffee-cocoa-docs.txt ├── drug_ner ├── apply_model.py ├── apply_regex_model.py ├── co_train.py ├── drug_ner_utils.py ├── ngram_convert.py └── parse_drugbank.py ├── entity-graph ├── 01-preprocess-data.py ├── 02-find-entities.py ├── 03-cluster-entity-mentions.ipynb ├── 04-generate-entity-sets.py ├── 05-find-corefs.py ├── 06-find-matches.py ├── 07-create-graphs.py ├── 08-explore-graph.ipynb └── graph-snapshot.png ├── genetagger ├── file_reformatter.py ├── gene.test ├── gene.train ├── gene.validate └── hmm_gene_ner.py ├── hangman ├── game.py └── gamestats.py ├── langmodel ├── med_lang_model.py ├── old_med_lang_model.py └── sentences.test ├── medorleg ├── README.md ├── db_loader.py ├── eval_model.py ├── model_params.py ├── ngram_counting_job.py ├── preprocess.py ├── regression_data.py └── testset_splitter.py ├── medorleg2 ├── arffwriter.py ├── arffwriter_test.py ├── classify.py └── preprocess.py ├── phrases ├── interesting_phrases.py └── preprocess.py ├── sameword ├── same_word_finder.py └── test.dat ├── semantic └── short_sentence_similarity.py ├── similar-tweets-nmslib ├── 01-load-sqlite3.py ├── 02-generate-vectors.py ├── 03-query-times.py ├── 04-chart-times.py └── README.md ├── stlclust ├── cluster_titles.py ├── extract_stl.py └── fuzz_similarity.py └── topicmodel ├── bok_model.py ├── bow_model.py ├── gensim_preprocess.py ├── gensim_word2vec.py ├── kea_preprocess.py ├── keywords_merge.py ├── lda_model.py ├── lsi_model.py ├── num_topics.py ├── viz_doctopic_distrib.py ├── viz_topics_scatter.py ├── viz_topics_wordcloud.py └── word2vec_cluster_plot.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/.ipynb_checkpoints/* 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | nltk-examples 2 | ============= 3 | 4 | ## src/book/ 5 | 6 | Worked examples from the NLTK Book 7 | 8 | ## src/cener/ 9 | 10 | A Consumer Electronics Named Entity Recognizer - uses an NLTK Maximum Entropy 11 | Classifier and IOB tags to train and predict Consumer Electronics named entities 12 | in text. 13 | 14 | ## src/sameword/ 15 | 16 | A simple tool to detect word equivalences using Wordnet. Reads a TSV file of word pairs and returns the original (LHS) word if the words don't have the same meaning. Useful (at least in my case) for checking the results of a set of regular expressions to convert words from British to American spellings and for converting Greek/Latin plurals to their singular form (both based on patterns). 
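Below is a minimal sketch of the kind of WordNet lookup described above for src/sameword/. It is an illustration only, not the actual code in `same_word_finder.py`; the `same_meaning` helper and the example word pairs are hypothetical, and it assumes NLTK with the WordNet data installed.

```python
# Illustrative only: treat two words as equivalent if any of their
# WordNet synsets overlap (after WordNet's own morphological lookup).
from nltk.corpus import wordnet as wn

def same_meaning(lhs, rhs):
    # hypothetical helper, not part of the repository
    lhs_synsets = set(wn.synsets(lhs))
    rhs_synsets = set(wn.synsets(rhs))
    return bool(lhs_synsets & rhs_synsets)

for lhs, rhs in [("colour", "color"), ("bacteria", "bacterium")]:
    # emit the LHS word only when the pair does not share a meaning,
    # mirroring the behaviour described above
    if not same_meaning(lhs, rhs):
        print(lhs)
```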
17 | 18 | ## src/genetagger/ 19 | 20 | A Named Entity Recognizer for Genes - uses NLTK's HMM package to build an HMM tagger that recognizes gene names within English text. 21 | 22 | ## src/langmodel/ 23 | 24 | A trigram backoff language model trained on medical XML documents, and used to estimate the normalized log probability of an unknown sentence. 25 | 26 | ## src/docsim/ 27 | 28 | A proof of concept for calculating inter-document similarities over a collection of text documents for a cheating detection system. Contains an implementation of SCAM (Standard Copy Analysis Mechanism) to detect possible near-duplicate documents. 29 | 30 | ## src/phrases/ 31 | 32 | A proof of concept to identify significant word collocations as phrases from about an hour's worth of messages from the Twitter 1% feed, scored as the log-likelihood ratio of the probability that the words are dependent versus the probability that they are independent (a minimal sketch of this scoring appears at the end of this README). Based on the approach described in "Building Search Applications: Lucene, LingPipe and GATE" by Manu Konchady, but extended to handle N-grams of any size. 33 | 34 | ## src/medorleg 35 | 36 | A trigram interpolated model trained on medical and legal sentences, and used to classify a sentence as one of the two genres. 37 | 38 | ## src/medorleg2 39 | 40 | Uses the same training data as medorleg, but relies on Scikit-Learn's text API and LinearSVC implementation to build a classifier that predicts the genre of an unseen sentence (a minimal sketch appears at the end of this README). 41 | 42 | Also contains an ARFF writer to convert the X and y matrices to ARFF format for consumption by WEKA. This was done so we could reuse Scikit-Learn's text processing pipeline to build a WEKA model, which could then be used directly from within a Java-based data pipeline. 43 | 44 | ## src/brown\_dict 45 | 46 | Uses a POS-tagged (Brown) corpus to build a dictionary of words and their sense frequencies, and a chunked (Penn Treebank subset) corpus to build a reference set of POS sequences and POS state transitions, allowing context-free POS tagging of standalone words and phrase-type detection of standalone phrases. 47 | 48 | ## src/topicmodel 49 | 50 | Topic modeling the PHR corpus with gensim. More information in these posts: 51 | 52 | * [Topic Modeling with gensim](http://sujitpal.blogspot.com/2014/08/topic-modeling-with-gensim-over-past.html) 53 | * [Using Keyword Generation to refine Topic Models](http://sujitpal.blogspot.com/2015/07/using-keyword-generation-to-refine.html) 54 | 55 | ## src/stlclust 56 | 57 | Uses DBSCAN to cluster section titles in clinical notes. 58 | 59 | ## src/semantic 60 | 61 | Python/NLTK implementation of the algorithm described in the paper "Sentence Similarity Based on Semantic Nets and Corpus Statistics" by Li, et al. 62 | 63 | ## src/drug\_ner 64 | 65 | Drug name NER using a one-class classification approach. Only a positive training set (drug name ngrams) is provided, along with an unlabelled dataset and an estimate of the proportion of positive data. More information in my blog post: [Classification with Positive Examples only](http://sujitpal.blogspot.com/2015/02/classification-with-positive-examples.html). 66 | 67 | ## src/similar-tweets-nmslib 68 | 69 | More information in my blog post: [Finding Similar Tweets with BERT and NMSLib](https://sujitpal.blogspot.com/2019/12/finding-similar-tweets-with-bert-and.html). 70 | 71 | ## src/entity-graph 72 | 73 | More information in my blog post: [Entity Co-occurrence graphs as Mind Map](https://sujitpal.blogspot.com/2020/02/entity-co-occurrence-graphs-as-mind-maps.html). 
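The following is a minimal sketch of the log-likelihood-ratio scoring described under src/phrases/ above. It uses NLTK's collocation API on a bundled corpus as a stand-in for both the Twitter sample and the repository's own implementation in `interesting_phrases.py`:

```python
# Rank bigrams by the likelihood ratio that the two words co-occur
# dependently rather than independently (higher score = stronger phrase).
import nltk
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

words = (w.lower() for w in nltk.corpus.webtext.words("grail.txt"))
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(3)          # ignore very rare pairs
measures = BigramAssocMeasures()
for pair, score in finder.score_ngrams(measures.likelihood_ratio)[:10]:
    print("{0} {1:.2f}".format(" ".join(pair), score))
```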
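And here is a minimal sketch of the Scikit-Learn approach described under src/medorleg2 above: a TF-IDF text pipeline feeding a LinearSVC. The two training sentences and their labels are made-up placeholders, not the repository's medical/legal training data:

```python
# Toy genre classifier: TF-IDF over word unigrams/bigrams + linear SVM.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

sentences = [
    "the patient was started on 5 mg of warfarin daily",   # placeholder "medical"
    "the defendant moved to dismiss the complaint",        # placeholder "legal"
]
labels = ["medical", "legal"]

clf = make_pipeline(TfidfVectorizer(ngram_range=(1, 2)), LinearSVC())
clf.fit(sentences, labels)
print(clf.predict(["the court denied the motion to dismiss"]))
```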
74 | 75 | 76 | -------------------------------------------------------------------------------- /data/brown_dict/np_tags.csv: -------------------------------------------------------------------------------- 1 | NN 4624 2 | DT NN 3869 3 | OT 2480 4 | NN NN 2256 5 | JJ NN 1515 6 | DT JJ NN 1162 7 | OT NN 1097 8 | DT NN NN 875 9 | DT 830 10 | NN NN NN 547 11 | JJ 359 12 | OT OT OT 338 13 | OT OT 320 14 | JJ NN NN 287 15 | DT JJ NN NN 199 16 | DT NN NN NN 186 17 | OT JJ NN 180 18 | JJ JJ NN 172 19 | DT NN OT NN 141 20 | OT NN NN 133 21 | NN OT NN 132 22 | NN NN NN NN 123 23 | NN OT 122 24 | OT OT NN 108 25 | DT JJ 103 26 | VB NN 97 27 | DT VB NN 96 28 | DT JJ JJ NN 84 29 | DT OT NN 67 30 | NN NN OT NN 64 31 | DT NN NN NN NN 61 32 | VB 55 33 | DT OT NN NN 53 34 | NN OT NN NN 50 35 | DT JJ OT NN 46 36 | DT NN OT JJ NN 41 37 | NN OT JJ NN 39 38 | JJ NN NN NN 35 39 | DT NN NN OT NN 33 40 | DT NN OT 33 41 | DT NN OT NN NN 33 42 | OT NN NN NN 32 43 | VB NN NN 32 44 | DT NN JJ NN 31 45 | JJ OT 31 46 | OT NN OT NN 30 47 | DT JJ NN NN NN 29 48 | JJ JJ NN NN 29 49 | NN NN NN NN NN 29 50 | NN NN OT NN NN 29 51 | DT RB JJ NN 27 52 | NN JJ NN 24 53 | NN NN OT JJ NN 24 54 | RB JJ NN 23 55 | DT JJ JJ NN NN 22 56 | DT OT 22 57 | OT JJ NN NN 22 58 | OT JJ 19 59 | DT OT OT OT NN 18 60 | JJ VB NN 17 61 | NN NN OT 17 62 | RB 17 63 | NN OT NN NN NN 16 64 | OT OT JJ NN 16 65 | DT OT OT NN 15 66 | DT VB NN NN 15 67 | DT NN VB 14 68 | DT OT JJ NN 14 69 | OT DT NN 14 70 | OT JJ JJ NN 14 71 | DT NN NN NN NN NN 13 72 | NN VB 12 73 | DT JJ NN OT NN 11 74 | DT NN OT JJ NN NN 11 75 | JJ NN OT NN 11 76 | NN VB NN 11 77 | OT NN OT 11 78 | OT OT NN OT NN 11 79 | DT JJ OT 10 80 | DT NN NN OT NN NN 10 81 | JJ JJ 10 82 | NN NN NN OT NN 10 83 | NN OT JJ NN NN 10 84 | NN OT OT 10 85 | OT VB NN 10 86 | DT JJ NN JJ NN 9 87 | DT JJ VB NN 9 88 | DT NN OT NN NN NN 9 89 | DT OT OT OT 9 90 | IN NN 9 91 | JJ NN NN NN NN 9 92 | NN NN OT JJ NN NN 9 93 | VB RB 9 94 | DT NN VB NN 8 95 | NN NN JJ NN 8 96 | OT OT NN NN 8 97 | OT OT OT JJ NN 8 98 | DT OT OT OT NN NN 7 99 | JJ JJ JJ NN 7 100 | JJ OT NN 7 101 | NN NN NN NN NN NN 7 102 | NN NN NN OT NN NN 7 103 | NN NN VB 7 104 | DT JJ NN NN NN NN 6 105 | DT JJ OT NN NN 6 106 | DT NN NN JJ NN 6 107 | DT NN NN OT 6 108 | DT NN OT JJ 6 109 | DT OT OT 6 110 | JJ NN OT 6 111 | OT NN NN NN NN 6 112 | RB JJ NN NN 6 113 | DT JJ JJ 5 114 | DT JJ OT OT OT 5 115 | DT NN NN NN NN NN NN 5 116 | DT NN OT JJ JJ NN 5 117 | DT NN VB RB 5 118 | DT OT OT NN NN 5 119 | DT OT OT OT JJ NN 5 120 | DT RB JJ JJ NN 5 121 | DT VB JJ NN 5 122 | JJ DT NN 5 123 | NN NN NN NN JJ NN NN 5 124 | OT DT 5 125 | OT NN JJ NN 5 126 | OT OT DT 5 127 | DT JJ NN OT 4 128 | DT JJ OT OT NN 4 129 | DT NN JJ NN NN 4 130 | DT NN NN NN JJ NN 4 131 | DT NN NN OT JJ NN 4 132 | DT NN NN VB 4 133 | DT OT NN NN NN 4 134 | DT OT OT JJ NN 4 135 | DT OT VB NN 4 136 | DT RB 4 137 | DT VB 4 138 | IN JJ NN NN 4 139 | NN NN NN NN JJ 4 140 | NN NN OT NN NN NN 4 141 | NN NN OT OT 4 142 | NN OT JJ JJ NN 4 143 | NN OT VB NN 4 144 | VB JJ NN 4 145 | DT DT 3 146 | DT JJ JJ JJ NN 3 147 | DT JJ NN NN OT NN 3 148 | DT JJ NN OT JJ NN 3 149 | DT JJ NN VB NN 3 150 | DT NN NN NN OT 3 151 | DT NN OT VB NN 3 152 | DT OT JJ JJ NN 3 153 | DT OT JJ NN NN 3 154 | DT OT OT OT NN NN NN 3 155 | DT RB JJ 3 156 | IN JJ NN 3 157 | JJ NN JJ NN OT 3 158 | JJ NN NN NN NN NN 3 159 | JJ NN OT JJ NN 3 160 | JJ NN OT OT OT OT 3 161 | NN JJ 3 162 | NN JJ NN NN 3 163 | NN NN OT JJ 3 164 | NN NN VB NN 3 165 | NN NN VB RB 3 166 | NN OT OT NN 3 167 | NN VB NN NN 3 168 | OT DT JJ NN 3 169 | OT 
DT NN NN 3 170 | OT JJ JJ NN NN 3 171 | OT JJ NN NN NN 3 172 | OT JJ OT NN 3 173 | OT JJ VB NN 3 174 | OT NN NN NN NN NN 3 175 | OT OT OT OT 3 176 | RB JJ JJ NN 3 177 | VB OT 3 178 | VB OT NN 3 179 | DT DT NN 2 180 | DT JJ NN NN NN VB 2 181 | DT JJ NN NN OT 2 182 | DT JJ NN OT NN NN 2 183 | DT JJ NN VB NN NN 2 184 | DT JJ NN VB RB 2 185 | DT JJ OT OT 2 186 | DT NN JJ 2 187 | DT NN NN NN OT NN 2 188 | DT NN OT JJ JJ NN NN 2 189 | DT NN OT NN NN NN NN 2 190 | DT NN OT VB 2 191 | DT OT JJ NN JJ NN 2 192 | DT OT NN JJ NN 2 193 | DT OT NN NN JJ NN 2 194 | DT RB JJ OT 2 195 | DT VB NN NN NN 2 196 | DT VB NN OT NN 2 197 | DT VB OT 2 198 | DT VB OT NN 2 199 | DT VB OT NN NN 2 200 | DT VB OT OT 2 201 | DT VB OT OT OT 2 202 | IN 2 203 | JJ JJ JJ NN NN 2 204 | JJ NN VB 2 205 | JJ OT OT OT 2 206 | NN JJ NN OT 2 207 | NN NN NN JJ 2 208 | NN NN NN JJ NN 2 209 | NN NN NN NN OT NN 2 210 | NN NN NN NN OT NN NN NN 2 211 | NN NN NN OT 2 212 | NN NN OT JJ JJ NN 2 213 | NN NN OT NN OT NN 2 214 | NN NN OT OT NN NN 2 215 | NN NN OT VB 2 216 | NN OT NN NN NN NN 2 217 | NN OT OT NN NN 2 218 | NN OT OT OT 2 219 | NN OT OT OT NN 2 220 | NN OT OT OT OT NN 2 221 | NN OT VB 2 222 | NN OT VB NN NN 2 223 | NN VB RB 2 224 | OT JJ JJ 2 225 | OT JJ NN OT NN 2 226 | OT NN NN NN JJ OT 2 227 | OT NN OT NN NN 2 228 | OT NN OT VB OT NN 2 229 | OT NN VB 2 230 | OT NN VB NN 2 231 | OT OT JJ JJ NN 2 232 | OT OT JJ NN NN 2 233 | OT OT NN JJ 2 234 | OT OT OT NN 2 235 | OT OT OT OT JJ NN 2 236 | RB JJ 2 237 | VB NN NN OT NN 2 238 | VB NN OT NN NN 2 239 | DT DT JJ NN 1 240 | DT DT NN OT NN 1 241 | DT JJ JJ NN NN NN 1 242 | DT JJ JJ NN VB 1 243 | DT JJ JJ OT NN 1 244 | DT JJ JJ VB NN 1 245 | DT JJ NN JJ 1 246 | DT JJ NN JJ NN NN NN 1 247 | DT JJ NN NN JJ NN 1 248 | DT JJ NN NN NN NN NN 1 249 | DT JJ NN NN OT JJ NN 1 250 | DT JJ NN NN OT VB 1 251 | DT JJ NN OT DT NN 1 252 | DT JJ NN OT JJ JJ 1 253 | DT JJ NN OT JJ NN NN 1 254 | DT JJ NN OT NN OT 1 255 | DT JJ NN OT VB OT NN 1 256 | DT JJ NN VB 1 257 | DT JJ NN VB NN OT 1 258 | DT JJ OT DT 1 259 | DT JJ OT DT JJ VB NN 1 260 | DT JJ OT JJ NN 1 261 | DT JJ OT NN OT JJ JJ JJ NN 1 262 | DT JJ OT OT JJ OT NN 1 263 | DT JJ OT OT OT NN 1 264 | DT JJ VB JJ NN 1 265 | DT JJ VB NN OT NN 1 266 | DT NN DT NN 1 267 | DT NN JJ JJ NN 1 268 | DT NN JJ JJ NN NN 1 269 | DT NN JJ NN VB NN 1 270 | DT NN NN JJ NN NN 1 271 | DT NN NN NN JJ NN NN NN 1 272 | DT NN NN NN NN NN OT NN 1 273 | DT NN NN NN NN OT NN 1 274 | DT NN NN NN NN OT VB NN 1 275 | DT NN NN NN NN VB 1 276 | DT NN NN NN OT JJ NN 1 277 | DT NN NN NN OT NN NN NN 1 278 | DT NN NN NN VB NN NN 1 279 | DT NN NN NN VB RB 1 280 | DT NN NN OT DT NN 1 281 | DT NN NN OT JJ JJ NN 1 282 | DT NN NN OT JJ JJ NN NN 1 283 | DT NN NN OT JJ OT OT OT NN 1 284 | DT NN NN OT NN JJ NN 1 285 | DT NN NN OT NN NN NN 1 286 | DT NN NN OT OT NN NN 1 287 | DT NN NN OT OT NN NN JJ OT 1 288 | DT NN NN OT OT VB 1 289 | DT NN NN OT RB VB NN NN 1 290 | DT NN NN OT VB 1 291 | DT NN NN RB OT 1 292 | DT NN NN VB NN 1 293 | DT NN NN VB RB 1 294 | DT NN OT JJ NN NN NN NN NN 1 295 | DT NN OT JJ OT NN 1 296 | DT NN OT NN JJ NN 1 297 | DT NN OT NN NN OT NN 1 298 | DT NN OT NN NN VB NN 1 299 | DT NN OT NN OT NN 1 300 | DT NN OT NN OT NN OT 1 301 | DT NN OT OT JJ NN 1 302 | DT NN OT OT NN 1 303 | DT NN OT OT NN NN 1 304 | DT NN OT OT NN NN JJ OT 1 305 | DT NN OT OT NN NN NN 1 306 | DT NN OT OT NN OT IN OT NN 1 307 | DT NN OT OT OT 1 308 | DT NN OT OT OT JJ JJ NN 1 309 | DT NN OT OT OT JJ NN 1 310 | DT NN OT OT OT NN NN JJ NN 1 311 | DT NN VB NN NN 1 312 | DT NN VB NN OT 1 313 | DT NN VB OT 
NN 1 314 | DT OT JJ JJ 1 315 | DT OT JJ NN JJ NN NN 1 316 | DT OT NN NN JJ NN OT 1 317 | DT OT NN NN JJ OT 1 318 | DT OT NN NN NN JJ OT 1 319 | DT OT NN NN NN OT 1 320 | DT OT NN OT NN 1 321 | DT OT OT NN NN JJ NN OT 1 322 | DT OT OT NN NN JJ OT 1 323 | DT OT OT NN NN NN JJ OT 1 324 | DT OT OT NN OT NN 1 325 | DT OT OT OT JJ 1 326 | DT OT OT OT JJ NN NN JJ NN 1 327 | DT OT OT OT NN NN NN NN 1 328 | DT OT OT OT NN OT NN 1 329 | DT OT OT OT OT 1 330 | DT RB JJ NN NN NN 1 331 | DT VB JJ JJ NN 1 332 | DT VB JJ NN NN 1 333 | DT VB JJ NN OT VB 1 334 | DT VB NN NN NN NN 1 335 | DT VB OT JJ NN 1 336 | DT VB OT OT OT NN NN 1 337 | DT VB VB NN 1 338 | IN NN NN 1 339 | IN NN NN NN 1 340 | IN OT NN OT NN 1 341 | JJ DT JJ NN 1 342 | JJ DT NN NN 1 343 | JJ JJ NN OT JJ NN 1 344 | JJ JJ NN OT NN 1 345 | JJ JJ NN OT NN OT NN NN OT OT 1 346 | JJ JJ OT OT NN NN 1 347 | JJ NN JJ NN 1 348 | JJ NN JJ NN NN NN JJ OT 1 349 | JJ NN NN DT NN 1 350 | JJ NN NN JJ NN 1 351 | JJ NN NN NN JJ NN 1 352 | JJ NN NN NN OT NN 1 353 | JJ NN NN NN OT NN JJ NN 1 354 | JJ NN NN OT NN 1 355 | JJ NN NN OT OT 1 356 | JJ NN OT DT 1 357 | JJ NN OT DT NN 1 358 | JJ NN OT NN NN NN NN 1 359 | JJ NN OT OT 1 360 | JJ NN OT VB 1 361 | JJ NN VB JJ NN 1 362 | JJ OT JJ NN 1 363 | JJ OT NN NN 1 364 | JJ OT NN NN NN 1 365 | JJ OT NN NN NN NN 1 366 | JJ OT OT 1 367 | JJ OT OT JJ NN 1 368 | JJ OT OT NN 1 369 | JJ OT OT OT DT JJ NN 1 370 | JJ OT RB 1 371 | JJ OT VB OT OT 1 372 | JJ RB 1 373 | NN DT 1 374 | NN JJ NN OT OT OT VB 1 375 | NN JJ OT NN NN 1 376 | NN JJ OT OT 1 377 | NN JJ VB NN 1 378 | NN NN JJ 1 379 | NN NN JJ NN NN 1 380 | NN NN JJ NN NN NN 1 381 | NN NN JJ NN OT 1 382 | NN NN NN JJ NN OT 1 383 | NN NN NN NN NN NN NN 1 384 | NN NN NN NN NN NN NN VB 1 385 | NN NN NN NN NN OT 1 386 | NN NN NN NN NN OT JJ NN 1 387 | NN NN NN NN NN VB 1 388 | NN NN NN NN OT 1 389 | NN NN NN NN OT JJ NN NN 1 390 | NN NN NN NN OT NN NN 1 391 | NN NN NN OT JJ JJ NN 1 392 | NN NN NN OT JJ NN 1 393 | NN NN NN OT JJ NN NN 1 394 | NN NN NN OT NN NN NN 1 395 | NN NN NN OT VB NN 1 396 | NN NN NN OT VB NN NN 1 397 | NN NN OT DT 1 398 | NN NN OT DT NN 1 399 | NN NN OT DT NN NN OT 1 400 | NN NN OT NN JJ NN 1 401 | NN NN OT NN NN VB NN NN 1 402 | NN NN OT OT JJ NN 1 403 | NN NN OT OT NN NN NN NN 1 404 | NN NN OT OT OT NN 1 405 | NN NN OT RB JJ JJ NN 1 406 | NN OT IN DT NN 1 407 | NN OT JJ 1 408 | NN OT JJ NN NN NN 1 409 | NN OT JJ NN VB NN NN 1 410 | NN OT JJ NN VB RB 1 411 | NN OT JJ OT OT NN NN 1 412 | NN OT JJ VB NN 1 413 | NN OT NN DT NN 1 414 | NN OT NN JJ 1 415 | NN OT NN NN NN OT 1 416 | NN OT NN NN OT 1 417 | NN OT NN OT 1 418 | NN OT NN OT NN NN 1 419 | NN OT OT JJ 1 420 | NN OT OT JJ NN 1 421 | NN OT OT OT JJ NN 1 422 | NN OT VB OT JJ NN 1 423 | NN OT VB OT OT 1 424 | NN OT VB OT OT OT 1 425 | NN RB JJ 1 426 | NN VB JJ NN 1 427 | NN VB NN NN OT 1 428 | OT DT JJ NN NN NN 1 429 | OT IN 1 430 | OT JJ NN JJ JJ NN 1 431 | OT JJ NN JJ NN 1 432 | OT JJ NN OT NN NN 1 433 | OT JJ NN OT NN OT 1 434 | OT JJ NN VB NN 1 435 | OT JJ OT JJ NN NN 1 436 | OT JJ OT OT OT NN 1 437 | OT JJ VB 1 438 | OT JJ VB OT IN 1 439 | OT NN JJ 1 440 | OT NN JJ NN JJ NN OT 1 441 | OT NN NN JJ 1 442 | OT NN NN JJ NN 1 443 | OT NN NN JJ OT 1 444 | OT NN NN NN JJ NN 1 445 | OT NN NN NN OT 1 446 | OT NN NN OT 1 447 | OT NN NN OT NN 1 448 | OT NN OT IN NN 1 449 | OT NN OT JJ JJ NN 1 450 | OT NN OT JJ NN 1 451 | OT NN OT NN NN NN 1 452 | OT NN OT OT NN 1 453 | OT NN OT OT OT 1 454 | OT NN OT VB NN 1 455 | OT NN VB RB 1 456 | OT OT DT JJ NN 1 457 | OT OT NN NN JJ NN OT 1 458 | OT OT NN NN JJ OT 1 
459 | OT OT NN NN NN JJ OT 1 460 | OT OT NN OT OT 1 461 | OT OT OT DT NN 1 462 | OT OT OT JJ 1 463 | OT OT OT JJ NN JJ NN 1 464 | OT OT OT JJ NN JJ NN NN NN 1 465 | OT OT OT JJ OT NN 1 466 | OT OT OT NN NN 1 467 | OT OT OT OT DT JJ 1 468 | OT OT OT OT NN 1 469 | OT OT OT OT NN JJ NN 1 470 | OT OT OT OT NN NN 1 471 | OT OT OT VB RB 1 472 | OT OT RB 1 473 | OT OT VB 1 474 | OT OT VB NN 1 475 | OT RB JJ NN NN 1 476 | OT VB 1 477 | OT VB JJ NN 1 478 | OT VB NN NN 1 479 | OT VB NN NN NN 1 480 | RB JJ JJ 1 481 | RB JJ VB 1 482 | RB JJ VB NN 1 483 | RB NN 1 484 | RB NN JJ NN 1 485 | RB RB 1 486 | VB DT JJ NN 1 487 | VB JJ 1 488 | VB JJ OT RB 1 489 | VB JJ VB NN 1 490 | VB NN NN NN 1 491 | VB NN OT JJ NN 1 492 | VB OT NN NN NN 1 493 | VB VB NN NN 1 494 | -------------------------------------------------------------------------------- /data/brown_dict/pos_trans.csv: -------------------------------------------------------------------------------- 1 | 0.00000 0.43902 0.06724 0.21358 0.04351 0.78312 0.02769 0.37574 0.00000 2 | 0.00000 0.71857 0.07099 0.13159 0.00346 0.00693 0.00000 0.36708 0.57139 3 | 0.00000 0.83146 0.03079 0.18477 0.15397 0.01540 0.00000 0.27715 0.41573 4 | 0.00000 0.94283 0.06168 0.20266 0.00441 0.01322 0.00000 0.22029 0.13217 5 | 0.00000 0.08762 0.04381 0.65716 0.04381 0.00000 0.00000 0.04381 0.74478 6 | 0.00000 0.79556 0.14763 0.50030 0.04921 0.03281 0.00000 0.29526 0.06561 7 | 0.00000 0.68599 0.00000 0.34300 0.00000 0.17150 0.00000 0.34300 0.51450 8 | 0.00000 0.66386 0.11246 0.33374 0.02177 0.06893 0.01814 0.59131 0.28296 9 | 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 10 | -------------------------------------------------------------------------------- /data/brown_dict/ptb_tags.csv: -------------------------------------------------------------------------------- 1 | # $Id$ 2 | # $Source$ 3 | # From: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html 4 | # 5 | #Tag Description 6 | CC Coordinating conjunction 7 | CD Cardinal number 8 | DT Determiner 9 | EX Existential there 10 | FW Foreign word 11 | IN Preposition or subordinating conjunction 12 | JJ Adjective 13 | JJR Adjective, comparative 14 | JJS Adjective, superlative 15 | LS List item marker 16 | MD Modal 17 | NN Noun, singular or mass 18 | NNS Noun, plural 19 | NNP Proper noun, singular 20 | NNPS Proper noun, plural 21 | PDT Predeterminer 22 | POS Possessive ending 23 | PRP Personal pronoun 24 | PRP$ Possessive pronoun 25 | RB Adverb 26 | RBR Adverb, comparative 27 | RBS Adverb, superlative 28 | RP Particle 29 | SYM Symbol 30 | TO to 31 | UH Interjection 32 | VB Verb, base form 33 | VBD Verb, past tense 34 | VBG Verb, gerund or present participle 35 | VBN Verb, past participle 36 | VBP Verb, non-3rd person singular present 37 | VBZ Verb, 3rd person singular present 38 | WDT Wh-determiner 39 | WP Wh-pronoun 40 | WP$ Possessive wh-pronoun 41 | WRB Wh-adverb 42 | -------------------------------------------------------------------------------- /data/entity-graph/db-acronyms.txt: -------------------------------------------------------------------------------- 1 | DFS Department of Financial Services 2 | LIBOR London Interbank Offered Rate 3 | DOJ US Department of Justice 4 | SDNY Southern District of New York 5 | EDNY Eastern District of New York 6 | FinCEN Financial Crimes Enforcement Network 7 | OCCRP Organized Crime and Corruption Reporting Project 8 | -------------------------------------------------------------------------------- /data/entity-graph/gpe_syns-updated.csv: 
-------------------------------------------------------------------------------- 1 | ent_text_x,synonyms 2 | Africa,Africa - Israel|East Africa 3 | Cyprus,the Cyprus Mail 4 | Eastern District,the Eastern District|this Eastern District 5 | Israel,Israeli 6 | Latvia,Latvian 7 | New York,New Yorker 8 | Panama,Panama City 9 | Russia,Russian|the Russian Federation|Russians 10 | Seychelles,Seychelles Island 11 | Southern District,the Southern District|The Southern District 12 | Syria,Syrian 13 | -------------------------------------------------------------------------------- /data/entity-graph/org_syns-updated.csv: -------------------------------------------------------------------------------- 1 | ent_text_x,synonyms 2 | Alfa Group,Alfa 3 | Central Bank of Cyprus,the Cyprus Popular Bank|Bank of Cyprus 4 | Bloomberg News,Bloomberg 5 | Commercial Bank of SF, 6 | Russian Commercial Bank, 7 | Committee on Financial Services, 8 | Organized Crime and Corruption Reporting Project,Crime and Corruption Reporting Project 9 | New York State Department of Financial Services,Department of Financial Services|Department of Financial Services|Department of Financial Services 10 | Deutsche Bank,Group Executive Committee of Deutsche Bank|Deutsche Bank Trust Company Americas|Deutsche Asset & Wealth Management|Deutsche Bank Securities|Deutsche Bank A.G.|Deutsche for Trump|Deutsche Bank ’s|Deutsche Bank AG|Deutsche bank|Deutschea|Deutsche 11 | RenTech, 12 | European Union, 13 | US Federal Reserve,Federal Reserve Board,Federal Reserve|Fed 14 | US Senate,Senate|US Senate Subcommittee for Investigation|US Senate Intelligence committee|Senate Subcommittee 15 | Federal Bank of the Middle East, 16 | Federal Elections Commission, 17 | Hermitage Capital,Hermitage 18 | White House, 19 | House,US House 20 | IRS Office of Appeals,IRS 21 | Abu Dhabi Department of Finance, 22 | US Department of Justice,Department of Justice|Justice|Justice Department 23 | New York State Department of Financial Services,New York Department of Financial Services|Department of Financial Services 24 | Department of State,State Department 25 | New York Times,Times 26 | Panama Papers, 27 | Prevezon Holdings,Prevezon 28 | Russia Senate Security and Defense Committee, 29 | Department of Financial Services,Committee on Financial Services| 30 | Treasury Department,Treasury 31 | Trump Organization,UA Trump International Investments Ltd.,Trump Construction Co Limited 32 | VTB Capital,VTB 33 | White House, 34 | American Jewish Congress, 35 | Russian Jewish Congress, 36 | Central Bank of Russia, 37 | Central Bank of Syria, 38 | -------------------------------------------------------------------------------- /data/entity-graph/person_syns-updated.csv: -------------------------------------------------------------------------------- 1 | ent_text_x,synonyms 2 | Ackerman,Josef Ackermann 3 | Alex Sapir,Sapir 4 | Boris Rotenberg,Rotenberg 5 | Breuer,Ernst Breuer 6 | Carl Levin,Levin 7 | Crown Prince Mohammed bin Zayed al Nahyan,Mohammed bin Zayed 8 | David Kautter,Kautter 9 | Diane Glossman,Glossman 10 | Donald Trump Jr.,Donald Jr. 11 | Donald J. 
Trump,Donald Trump|Trump|Trump et al 12 | Vladimir Putin,Putin 13 | Jack Rosen,Rosen 14 | Jared Kushner,Kushner 15 | Josef Ackerman,Josef Ackermann 16 | Lev Leviev,Leviev|Leviev et al 17 | Magnitsky,Sergei Magnitsky 18 | Maxine Waters,Waters 19 | Rosen,Rotem Rosen 20 | Tim Wiswell,Wiswell 21 | Constantinos Koudellaris,Koudellaris 22 | Steve Bannon,Bannon 23 | Dmitry Firtash,Firtash 24 | Ivan Fursin,Fursin 25 | Sergey Gorkov,Gorkov 26 | Rosemary Vrablic,Vrablic 27 | Oleg Deripaska,Deripaska 28 | Preet Bharara,Bharara 29 | Geoffrey Berman,Berman 30 | Bill Browder,Browder 31 | Nikolai Gorokov,Gorokov 32 | Louise Shelley,Shelley 33 | -------------------------------------------------------------------------------- /nbproject/private/private.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /nbproject/project.properties: -------------------------------------------------------------------------------- 1 | java.lib.path= 2 | platform.active=Python_2.6.1 3 | python.lib.path= 4 | src.dir=src 5 | -------------------------------------------------------------------------------- /nbproject/project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | org.netbeans.modules.python.project 4 | 5 | 6 | nltk-examples 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/book/ch02.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Accessing text corpora and local resources 3 | from __future__ import division 4 | import nltk 5 | import nltk.corpus 6 | from nltk.corpus.reader.plaintext import PlaintextCorpusReader 7 | 8 | def lexical_diversity(text): 9 | return len(text) / len(set(text)) 10 | 11 | def contents(corpus): 12 | return corpus.fileids() 13 | 14 | def describe(corpus): 15 | print "\t".join(["c/w", "w/s", "w/v", "id"]) 16 | for fileid in corpus.fileids(): 17 | nchars = len(corpus.raw(fileid)) 18 | nwords = len(corpus.words(fileid)) 19 | nsents = len(corpus.sents(fileid)) 20 | nvocab = len(set([w.lower() for w in corpus.words(fileid)])) 21 | print "\t".join([str(nchars/nwords), str(nwords/nsents), 22 | str(nwords/nvocab), fileid]) 23 | 24 | def brown_word_usage_by_category(brown, words): 25 | for category in brown.categories(): 26 | text = brown.words(categories=category) 27 | fdist = nltk.FreqDist([w.lower() for w in text]) 28 | print category, 29 | for word in words: 30 | print word + ":" + str(fdist[word]), 31 | print 32 | 33 | def brown_word_usage_by_category_cfg(brown, words): 34 | genres = brown.categories() 35 | cfd = nltk.ConditionalFreqDist( 36 | (genre, word.lower()) 37 | for genre in genres 38 | for word in brown.words(categories=genre)) 39 | cfd.tabulate(conditions=genres, samples=words) 40 | 41 | def inaugural_word_usage_by_year(inaugural, words): 42 | cfd = nltk.ConditionalFreqDist( 43 | (target, fileid[:4]) 44 | for fileid in inaugural.fileids() 45 | for w in inaugural.words(fileid) 46 | for target in words 47 | if w.lower().startswith(target)) 48 | cfd.plot() 49 | 50 | def udhr_language_length(udhr, languages): 51 | cfd = nltk.ConditionalFreqDist( 52 | (lang, len(word)) 53 | for lang in languages 54 | for word in udhr.words(lang + "-Latin1")) 55 | cfd.plot(cumulative=True) 56 | 57 | def load_local(corpus_root): 58 | return PlaintextCorpusReader(corpus_root, ".*") 59 | 60 | def 
generate_model(cfdist, start_word, num=15): 61 | for i in range(num): 62 | print start_word, 63 | start_word = cfdist[start_word].max() 64 | 65 | def unusual_words(text): 66 | text_vocab = set(w.lower() for w in text if w.isalpha()) 67 | english_vocab = set(w.lower() for w in nltk.corpus.words.words()) 68 | unusual = text_vocab.difference(english_vocab) 69 | return sorted(unusual) 70 | 71 | def non_stopword_content_pct(text): 72 | stopwords = nltk.corpus.stopwords.words("english") 73 | content = [w for w in text if w.lower() not in stopwords] 74 | return len(content) * 100 / len(text) 75 | 76 | def gen_words_puzzle(puzzle_letters, obligatory_letter, min_word_size): 77 | wordlist = nltk.corpus.words.words() 78 | plfd = nltk.FreqDist(puzzle_letters) 79 | return [w for w in wordlist if len(w) >= min_word_size 80 | and obligatory_letter in w 81 | and nltk.FreqDist(w) < plfd] 82 | 83 | def gender_ambig_names(): 84 | names = nltk.corpus.names 85 | male_names = names.words("male.txt") 86 | female_names = names.words("female.txt") 87 | return [w for w in male_names if w in female_names] 88 | 89 | def gender_names_by_firstchar(): 90 | names = nltk.corpus.names 91 | cfd = nltk.ConditionalFreqDist( 92 | (fileid, name[0:1]) 93 | for fileid in names.fileids() 94 | for name in names.words(fileid)) 95 | cfd.plot() 96 | 97 | def semantic_similarity(left, right): 98 | lch = left.lowest_common_hypernyms(right) 99 | return map(lambda x : x.min_depth(), lch) 100 | 101 | if __name__ == "__main__": 102 | 103 | # from nltk.corpus import gutenberg 104 | # print lexical_diversity(gutenberg.words("austen-emma.txt")) 105 | # describe(gutenberg) 106 | 107 | # from nltk.corpus import brown 108 | # modals = ["can", "could", "may", "might", "must", "will"] 109 | # brown_word_usage_by_category(brown, modals) 110 | # whwords = ["what", "when", "where", "who", "why"] 111 | # brown_word_usage_by_category(brown, whwords) 112 | # modals = ["can", "could", "may", "might", "must", "will"] 113 | # brown_word_usage_by_category_cfg(brown, modals) 114 | 115 | # from nltk.corpus import inaugural 116 | # inaugural_word_usage_by_year(inaugural, ["america", "citizen"]) 117 | 118 | # from nltk.corpus import udhr 119 | # languages = ["English", "French_Francais", "German_Deutsch"] 120 | # udhr_language_length(udhr, languages) 121 | # raw_text = udhr.raw("English-Latin1") 122 | # nltk.FreqDist(raw_text).plot() 123 | 124 | # localCorpus = load_local("/usr/share/dict") 125 | # print localCorpus.fileids() 126 | # print localCorpus.words("connectives") 127 | 128 | # from nltk.corpus import brown 129 | # days = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"] 130 | # genres = ["news", "romance"] 131 | # cfd = nltk.ConditionalFreqDist( 132 | # (genre, day) 133 | # for genre in genres 134 | # for day in days 135 | # for word in brown.words(categories=genre) if word.lower() == day) 136 | # cfd.tabulate(conditions=genres, samples=days) 137 | # cfd.plot() 138 | 139 | # text = nltk.corpus.genesis.words("english-kjv.txt") 140 | # bigrams = nltk.bigrams(text) 141 | # cfd = nltk.ConditionalFreqDist(bigrams) 142 | # print cfd["living"] 143 | # generate_model(cfd, "living") 144 | 145 | # # doesn't work (check why not) 146 | # from com.mycompany.Foo import * 147 | # com.mycompany.Foo.bar() 148 | 149 | # print unusual_words(nltk.corpus.gutenberg.words("austen-sense.txt")) 150 | # print unusual_words(nltk.corpus.nps_chat.words()) 151 | 152 | # print non_stopword_content_pct(nltk.corpus.reuters.words()) 153 | 154 | # print 
gen_words_puzzle("egivrvonl", "r", 6) 155 | 156 | # print gender_ambig_names() 157 | # gender_names_by_firstchar() 158 | 159 | # entries = nltk.corpus.cmudict.entries() 160 | # print len(entries) 161 | # for word, pron in entries: 162 | # if len(pron) == 3: 163 | # ph1, ph2, ph3 = pron 164 | # if ph1 == "P" and ph3 == "T": 165 | # print word, pron 166 | 167 | # from nltk.corpus import swadesh 168 | # print swadesh.fileids() 169 | # print swadesh.words("en") 170 | # fr2en = swadesh.entries(["fr", "en"]) 171 | # translate = dict(fr2en) 172 | # print translate["chien"] 173 | # print translate["jeter"] 174 | # de2en = swadesh.entries(["de", "en"]) 175 | # es2en = swadesh.entries(["es", "en"]) 176 | # translate.update(de2en) 177 | # translate.update(es2en) 178 | # print translate["Hund"] 179 | # print translate["perro"] 180 | # 181 | # languages = ["en", "de", "nl", "es", "fr", "pt", "la"] 182 | # for i in range(139, 142): 183 | # print swadesh.entries(languages)[i] 184 | 185 | from nltk.corpus import wordnet as wn 186 | # print wn.synsets("motorcar") 187 | # print wn.synset("car.n.01").lemma_names 188 | # print wn.synset("car.n.01").definition 189 | # print wn.synset("car.n.01").examples 190 | # lemmas = wn.synset("car.n.01").lemmas 191 | # print "lemmas=", lemmas 192 | # print "synsets(car.n.01.automobile)=", wn.lemma("car.n.01.automobile").synset 193 | # print "names(car.n.01.automobile)=", wn.lemma("car.n.01.automobile").name 194 | # print wn.synsets("car") 195 | # for synset in wn.synsets("car"): 196 | # print synset.lemma_names 197 | # print wn.lemmas("car") 198 | 199 | 200 | # motorcar = wn.synset("car.n.01") 201 | # types_of_motorcar = motorcar.hyponyms() 202 | # print types_of_motorcar[26] 203 | # print sorted([lemma.name for synset in types_of_motorcar for lemma in synset.lemmas]) 204 | # print motorcar.hypernyms() 205 | # paths = motorcar.hypernym_paths() 206 | # print len(paths) 207 | # print "dist1=", [synset.name for synset in paths[0]] 208 | # print "dist2=", [synset.name for synset in paths[1]] 209 | # print motorcar.root_hypernyms() 210 | 211 | # print "part_meronyms(tree)=", wn.synset("tree.n.01").part_meronyms() 212 | # print "substance_meronyms(tree)=", wn.synset("tree.n.01").substance_meronyms() 213 | # print "member_holonyms(tree)=", wn.synset("tree.n.01").member_holonyms() 214 | 215 | # for synset in wn.synsets("mint", wn.NOUN): 216 | # print synset.name + ": " + synset.definition 217 | # 218 | # print "entailments(walk.v.01)=", wn.synset("walk.v.01").entailments() 219 | # print "entailments(eat.v.01)=", wn.synset("eat.v.01").entailments() 220 | # print "entailments(swallow.v.01)=", wn.synset("swallow.v.01").entailments() 221 | # print "entailments(tease.v.03)=", wn.synset("tease.v.03").entailments() 222 | 223 | # print "antonym(supply.n.02.supply)=", wn.lemma("supply.n.02.supply").antonyms() 224 | # print dir(wn.synset("harmony.n.02")) 225 | 226 | #Semantic Similarity 227 | orca = wn.synset("orca.n.01") 228 | minke = wn.synset("minke_whale.n.01") 229 | tortoise = wn.synset("tortoise.n.01") 230 | novel = wn.synset("novel.n.01") 231 | print "sim(orca,minke)=", semantic_similarity(orca, minke) 232 | print "sim(orca,tortoise)=", semantic_similarity(orca, tortoise) 233 | print "sim(orca,novel)=", semantic_similarity(orca, novel) 234 | print "psim(orca,minke)=", orca.path_similarity(minke) 235 | print "psim(orca,tortoise)=", orca.path_similarity(tortoise) 236 | print "psim(orca,novel)=", orca.path_similarity(novel) 237 | 238 | print "end" 
-------------------------------------------------------------------------------- /src/book/ch02_ex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from __future__ import division 4 | import nltk 5 | import operator 6 | 7 | def ex1(): 8 | phrase = ["This", "is", "the", "house", "that", "Jack", "built", "."] 9 | print phrase + phrase 10 | # print phrase - phrase 11 | # print phrase * phrase 12 | # print phrase / phrase 13 | print sorted(phrase) 14 | 15 | def ex2(): 16 | from nltk.corpus import gutenberg 17 | ap = gutenberg.words("austen-persuasion.txt") 18 | word_tokens = len(ap) 19 | word_types = len(set([w.lower() for w in ap])) 20 | print "#-word tokens=", word_tokens 21 | print "#-word types=", word_types 22 | 23 | def ex4(): 24 | from nltk.corpus import state_union 25 | tags = ["men", "women", "people"] 26 | # for fileid in state_union.fileids(): 27 | # words = state_union.words(fileid) 28 | # fdist = nltk.FreqDist([w.lower() for w in words]) 29 | # print fileid + ": ", 30 | # for tag in tags: 31 | # print tag + "=" + str(fdist[tag]) + " ", 32 | # print 33 | cfd = nltk.ConditionalFreqDist( 34 | (target, fileid[0:4]) 35 | for fileid in state_union.fileids() 36 | for w in state_union.words(fileid) 37 | for target in tags if w.lower() == target) 38 | cfd.plot() 39 | 40 | def ex5(): 41 | from nltk.corpus import wordnet as wn 42 | for w in ["jaguar", "transistor", "train"]: 43 | s = wn.synset(w + ".n.01") 44 | if (s is not None): 45 | print "member_meronym(" + w + ")=", s.member_meronyms() 46 | print "part_meronym(" + w + ")=", s.part_meronyms() 47 | print "substance_meronym(" + w + ")=", s.substance_meronyms() 48 | print "member_holonym(" + w + ")=", s.member_holonyms() 49 | print "part_holonym(" + w + ")=", s.part_holonyms() 50 | print "substance_holonym(" + w + ")=", s.substance_holonyms() 51 | 52 | def ex7(): 53 | from nltk.corpus import gutenberg 54 | for fileid in gutenberg.fileids(): 55 | text = nltk.Text(gutenberg.words(fileid)) 56 | print ("file: " + fileid) 57 | print text.concordance("however") 58 | 59 | def ex8(): 60 | from nltk.corpus import names 61 | genders = ["male", "female"] 62 | alphabets = ["A", "B", "C", "D", "E", "F", "G", 63 | "H", "I", "J", "K", "L", "M", "N", "O", "P", 64 | "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"] 65 | cfd = nltk.ConditionalFreqDist( 66 | (gender, firstletter) 67 | for gender in genders 68 | for firstletter in alphabets 69 | for letter_count in 70 | [len(w) for w in names.words(gender + ".txt") 71 | if w[0:1] == firstletter]) 72 | cfd.plot() 73 | 74 | def ex10(): 75 | from nltk.corpus import brown 76 | from nltk.corpus import stopwords 77 | stopwords = stopwords.words("english") 78 | for genre in brown.categories(): 79 | print genre 80 | words = map(lambda x : x.lower(), brown.words(categories=genre)) 81 | fd = nltk.FreqDist([w for w in words if w.isalpha() and not(w in stopwords)]) 82 | vocab_size = len(set(words)) 83 | sum = 0 84 | for word in fd.keys(): 85 | freq = fd[word] 86 | print "... 
%s (%f)" % (word, (freq * 100 / vocab_size)) 87 | sum = sum + freq 88 | if (sum > (vocab_size / 3)): 89 | break 90 | 91 | def ex11(): 92 | from nltk.corpus import brown 93 | modals = set(["can", "could", "may", "might", "shall", "should", "will", "would", "must", "ought"]) 94 | cfd = nltk.ConditionalFreqDist( 95 | (genre, modal) 96 | for genre in brown.categories() 97 | for modal in [w.lower() for w in brown.words(categories=genre) if w.lower() in modals]) 98 | cfd.plot() 99 | 100 | def ex12(): 101 | from nltk.corpus import cmudict 102 | entries = cmudict.entries() 103 | words = map(lambda (word, pron) : word, entries) 104 | distinct_words = set(words) 105 | fd = nltk.FreqDist(words) 106 | multi_prons = 0 107 | for key in fd.keys(): 108 | if fd[key] == 1: 109 | break 110 | multi_prons = multi_prons + 1 111 | print "#-distinct words:", len(distinct_words) 112 | print "#-words with multiple prons:", multi_prons 113 | 114 | def ex13(): 115 | from nltk.corpus import wordnet as wn 116 | num_synsets = 0 117 | num_synsets_wo_hyponyms = 0 118 | for noun_synset in wn.all_synsets("n"): 119 | if len(noun_synset.hyponyms()) == 0: 120 | num_synsets_wo_hyponyms = num_synsets_wo_hyponyms + 1 121 | num_synsets = num_synsets + 1 122 | print num_synsets_wo_hyponyms * 100 / num_synsets 123 | 124 | def ex14_supergloss(s): 125 | from nltk.corpus import wordnet as wn 126 | ss = wn.synset(s) 127 | buf = ss.definition[0:1].upper() + ss.definition[1:] 128 | for hyponym in ss.hyponyms(): 129 | buf = buf + ". " + hyponym.definition[0:1].upper() + hyponym.definition[1:] 130 | for hypernym in ss.hypernyms(): 131 | buf = buf + ". " + hypernym.definition[0:1].upper() + hypernym.definition[1:] 132 | print buf 133 | 134 | def ex15(): 135 | from nltk.corpus import brown 136 | fd = nltk.FreqDist([w.lower() for w in brown.words()]) 137 | print filter(lambda k : fd[k] > 3, fd.keys()) 138 | 139 | def ex16(): 140 | from nltk.corpus import brown 141 | lex_div = {} 142 | for category in brown.categories(): 143 | words = brown.words(categories=category) 144 | ld = len(words) / len(set(words)) 145 | print category, ld 146 | lex_div[category] = ld 147 | print sorted(lex_div.iteritems(), key=operator.itemgetter(1)) 148 | 149 | def ex17(): 150 | from nltk.corpus import gutenberg 151 | macbeth = gutenberg.words("shakespeare-macbeth.txt") 152 | stopwords = set(nltk.corpus.stopwords.words()) 153 | fd = nltk.FreqDist([w for w in macbeth if w.lower() not in stopwords 154 | and len(w) > 3 and w.isalpha()]) 155 | print fd.keys()[0:50] 156 | 157 | def ex18(): 158 | from nltk.corpus import gutenberg 159 | macbeth = gutenberg.words("shakespeare-macbeth.txt") 160 | stopwords = set(nltk.corpus.stopwords.words()) 161 | bigrams = nltk.bigrams(macbeth) 162 | print bigrams 163 | bigrams_wo_stopwords = filter(lambda (k, v) : k not in stopwords 164 | and v not in stopwords 165 | and k.isalpha() 166 | and v.isalpha(), bigrams) 167 | fd = nltk.FreqDist(map(lambda (k,v) : k+":"+v, bigrams_wo_stopwords)) 168 | print map(lambda k : (k.split(":")[0], k.split(":")[1]), fd.keys())[0:50] 169 | 170 | def ex25_findlanguage(): 171 | from nltk.corpus import udhr 172 | word_lang_map = {} 173 | for fileid in udhr.fileids(): 174 | if fileid.endswith("-Latin1"): 175 | lang = fileid[:-7] 176 | words = udhr.words(fileid) 177 | for word in words: 178 | try: 179 | word_lang_map[word] 180 | except KeyError: 181 | word_lang_map[word] = set() 182 | langs = word_lang_map[word] 183 | langs.add(lang) 184 | word_lang_map[word] = langs 185 | print word_lang_map["arashobora"] 186 | 
187 | def ex26_branchingfactor(): 188 | from nltk.corpus import wordnet as wn 189 | num_synsets = 0 190 | num_hyponyms = 0 191 | for noun_synset in wn.all_synsets("n"): 192 | (num_hyponyms, num_synsets) = \ 193 | branchingfactor_r(noun_synset, num_synsets, num_hyponyms) 194 | print "branching factor=", (num_hyponyms / num_synsets) 195 | 196 | def branchingfactor_r(synset, num_synsets, num_hyponyms): 197 | num_synsets = num_synsets + 1 198 | for hyponym in synset.hyponyms(): 199 | branchingfactor_r(hyponym, num_synsets, num_hyponyms) 200 | num_hyponyms = num_hyponyms + 1 201 | return (num_hyponyms, num_synsets) 202 | 203 | def ex27_polysemy(): 204 | from nltk.corpus import wordnet as wn 205 | for pos in ["n", "v", "a"]: 206 | synsets = wn.all_synsets(pos) 207 | num_synsets = 0 208 | num_senses = 0 209 | for synset in synsets: 210 | num_synsets = num_synsets + 1 211 | num_senses = num_senses + len(synset.lemmas) 212 | print "polysemy(" + pos + ")=", (num_senses / num_synsets) 213 | 214 | def main(): 215 | # ex1() 216 | # ex2() 217 | # ex4() 218 | # ex5() 219 | # ex7() 220 | # ex8() 221 | # ex10() 222 | # ex11() 223 | # ex12() 224 | # ex13() 225 | # ex14_supergloss("car.n.01") 226 | # ex15() 227 | # ex16() 228 | # ex17() 229 | # ex18() 230 | # ex25_findlanguage() 231 | # ex26_branchingfactor() 232 | ex27_polysemy() 233 | 234 | if __name__ == "__main__": 235 | main() -------------------------------------------------------------------------------- /src/book/ch03.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Processing raw text 3 | 4 | from __future__ import division 5 | import nltk 6 | import re 7 | import pprint 8 | import urllib2 9 | import feedparser 10 | import codecs 11 | 12 | def download(url, file): 13 | req = urllib2.Request(url) 14 | req.add_header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5") 15 | raw = urllib2.urlopen(req).read() 16 | f = open(file, 'w') 17 | f.write(raw) 18 | f.close() 19 | 20 | def web_file_plain(): 21 | # download("http://www.gutenberg.org/files/2554/2554.txt", "/tmp/2554.txt") 22 | f = open("/tmp/2554.txt", 'r') 23 | raw = f.read() 24 | f.close() 25 | print "raw", type(raw), len(raw), raw[:75] 26 | tokens = nltk.word_tokenize(raw) 27 | print "tokens", type(tokens), len(tokens), tokens[:10] 28 | text = nltk.Text(tokens) 29 | print "text[1020:1060", text[1020:1060] 30 | print "colloc=", text.collocations() 31 | start = raw.find("PART I") 32 | end = raw.rfind("End of Project Gutenberg's Crime") 33 | raw2 = raw[start:end] 34 | print "index(PART I)=", raw.find("PART I"), raw2.find("PART I") 35 | 36 | def web_file_html(): 37 | # download("http://news.bbc.co.uk/2/hi/health/2284783.stm", "/tmp/2284783.stm") 38 | f = open("/tmp/2284783.stm", 'r') 39 | html = f.read() 40 | f.close() 41 | raw = nltk.clean_html(html) 42 | tokens = nltk.word_tokenize(raw) 43 | text = nltk.Text(tokens[96:399]) 44 | text.concordance("gene") 45 | 46 | def web_file_rss(): 47 | download("http://languagelog.ldc.upenn.edu/nll/?feed=atom", 48 | "/tmp/feed.xml") 49 | f = open("/tmp/feed.xml", 'r') 50 | llog = feedparser.parse(f.read()) 51 | print "title,len(content)=", llog["feed"]["title"], len(llog.entries) 52 | post = llog.entries[2] 53 | content = post.content[0].value 54 | print "title,countent...=", post.title, content[:70] 55 | tokens = nltk.word_tokenize(nltk.clean_html(content)) 56 | print "tokens=", tokens 57 | 58 | def unicode_read(): 59 | path = 
"/opt/nltk_data/corpora/unicode_samples/polish-lat2.txt" 60 | f = codecs.open(path, encoding="latin2") 61 | for line in f: 62 | print line.strip().encode("unicode_escape") 63 | 64 | def basic_regexps(): 65 | wordlist = [w for w in nltk.corpus.words.words("en") if w.islower()] 66 | print "words ending with -ed", [w for w in wordlist if re.search("ed$", w)] 67 | print "crossword pattern", [w for w in wordlist if re.search("^..j..t..$", w)] 68 | print "textonyms(golf)=", [w for w in wordlist if re.search("^[ghi][mno][jlk][def]$", w)] 69 | chat_words = sorted(set([w for w in nltk.corpus.nps_chat.words()])) 70 | print "mine=", [w for w in chat_words if re.search("^m+i+n+e+$", w)] 71 | 72 | def compress(regex, word): 73 | pieces = re.findall(regex, word) 74 | return "".join(pieces) 75 | 76 | def compress_vowels(): 77 | # initial vowel sequence, final vowel sequence or consonents, 78 | # everything else is removed 79 | regex = r"^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]" 80 | english_udhr = nltk.corpus.udhr.words("English-Latin1") 81 | print nltk.tokenwrap([compress(regex, w) for w in english_udhr[:75]]) 82 | 83 | def consonant_vowel_sequences_rotokas(): 84 | rotokas_words = nltk.corpus.toolbox.words("rotokas.dic") 85 | cvs = [cv for w in rotokas_words 86 | for cv in re.findall(r"[ptksvr][aeiou]", w)] 87 | cfd = nltk.ConditionalFreqDist(cvs) 88 | cfd.tabulate() 89 | cv_word_pairs = [(cv, w) for w in rotokas_words 90 | for cv in re.findall(r"[ptksrv][aeiou]", w)] 91 | cv_index = nltk.Index(cv_word_pairs) 92 | print "index(su)=", cv_index["su"] 93 | print "index(po)=", cv_index["po"] 94 | 95 | def discover_hypernyms(): 96 | from nltk.corpus import brown 97 | text = nltk.Text(brown.words(categories=["hobbies", "learned"])) 98 | # print text.findall(r"<\w*> <\w*>") 99 | print text.findall(r" <\w*> <\w*>") 100 | 101 | def find_in_stemmed_index(word): 102 | # porter = nltk.PorterStemmer() 103 | wnl = nltk.WordNetLemmatizer() 104 | grail = nltk.corpus.webtext.words("grail.txt") 105 | # index = nltk.Index([(porter.stem(w.lower()), pos) 106 | # for (pos, w) in enumerate(grail)]) 107 | index = nltk.Index([(wnl.lemmatize(w.lower()), pos) 108 | for (pos, w) in enumerate(grail)]) 109 | for pos in index[word]: 110 | lcontext = " ".join(grail[pos-4:pos]) 111 | rcontext = " ".join(grail[pos:pos+4]) 112 | print lcontext, rcontext 113 | 114 | def regex_word_tokenize(): 115 | # developing own tokenizer, compare between 116 | # nltk.corpus.treebank_raw.raw() and 117 | # nltk.corpus.treebank.words() 118 | alice = nltk.corpus.gutenberg.raw("carroll-alice.txt") 119 | # print re.split(r" ", alice) 120 | # print re.split(r"\W+", alice) # split on any non-word not only space 121 | # print re.findall(r"\w+|\S\w*", alice) # seq of 2/more punct separated 122 | # print re.findall(r"\w+(?:[-']\w)*|'|[-.(]+\s\w*", alice) 123 | pattern = r"""(?x) # verbose regexp 124 | ([A-Z]\.)+ | # abbreviations (U.S.A.) 125 | \w+(-\w+)* | # words with optional internal hyphens 126 | \$?\d+(\.\d+)?%? | # currency and percentages 127 | \.\.\. 
| # ellipsis 128 | [][.,;"'?():-_`] # separator tokens 129 | """ 130 | print nltk.regexp_tokenize(alice, pattern) 131 | 132 | def sentence_tokenization(): 133 | sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle") 134 | text = nltk.corpus.gutenberg.raw("chesterton-thursday.txt") 135 | sents = sent_tokenizer.tokenize(text) 136 | pprint.pprint(sents[171:181]) 137 | 138 | def main(): 139 | # web_file_plain() 140 | # web_file_html() 141 | # web_file_rss() 142 | # unicode_read() 143 | # basic_regexps() 144 | # compress_vowels() 145 | # consonant_vowel_sequences_rotokas() 146 | # discover_hypernyms() 147 | # find_in_stemmed_index("offic") # porter 148 | # find_in_stemmed_index("officer") # wordnet 149 | # regex_word_tokenize() 150 | # sentence_tokenization() 151 | pass 152 | 153 | if __name__ == "__main__": 154 | main() 155 | 156 | -------------------------------------------------------------------------------- /src/book/ch03_ex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from __future__ import division 4 | import nltk 5 | import re 6 | 7 | def ch03_10(): 8 | sent = re.split(" ", "The dog gave John the newspaper") 9 | print [(w, len(w)) for w in sent] 10 | 11 | def ch03_18_wh_words(): 12 | moby_dick = nltk.corpus.gutenberg.words("melville-moby_dick.txt") 13 | print [w for w in moby_dick if w.startswith("wh")] 14 | 15 | def ch03_29_reading_difficulty(): 16 | sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle") 17 | from nltk.corpus import brown 18 | for category in brown.categories(): 19 | raw = brown.raw(categories=category) 20 | words = len(brown.words(categories=category)) 21 | sentences = len(sent_tokenizer.tokenize(raw)) 22 | letters_per_word = (len(raw) - words) / words # raw chars - words space chars 23 | words_per_sentence = words / sentences 24 | reading_level = (4.71 * letters_per_word) + (0.5 * words_per_sentence) + 21.43 25 | print category, reading_level 26 | 27 | def ch03_30_porter_vs_lancaster(): 28 | porter = nltk.PorterStemmer() 29 | lancaster = nltk.LancasterStemmer() 30 | tokens = ["When", "all", "is", "said", "and", "done", ",", "more", "is", "said", "than", "done", "."] 31 | print "porter=", [porter.stem(w.lower()) for w in tokens] 32 | print "lancaster=", [lancaster.stem(w.lower()) for w in tokens] 33 | print "len(tokens)=", map(lambda token : len(token), tokens) 34 | 35 | def ch03_42_wordnet_semantic_index(): 36 | from nltk.corpus import webtext 37 | from nltk.corpus import wordnet as wn 38 | postings = [] 39 | docids = {} 40 | for (pos, fileid) in enumerate(webtext.fileids()): 41 | docids[pos] = fileid 42 | wpos = 0 43 | words = webtext.words(fileid) 44 | for word in words: 45 | try: 46 | postings.append((word.lower(), (pos, wpos))) 47 | offset = wn.synsets(word)[0].offset 48 | postings.append((offset, (pos, wpos))) 49 | poffset = wn.synsets(word)[0].hypernyms()[0].offset 50 | postings.append((poffset, (pos, wpos))) 51 | except IndexError: 52 | continue 53 | wpos = wpos + 1 54 | index = nltk.Index(postings) 55 | query = "canine" 56 | qpostings = [] 57 | qpostings.extend([(pos, wpos) for (pos, wpos) in index[query]]) 58 | try: 59 | offset = wn.synsets(query)[0].offset 60 | qpostings.extend([(pos, wpos) for (pos, wpos) in index[offset]]) 61 | except IndexError: 62 | pass 63 | for (pos, wpos) in qpostings: 64 | left = webtext.words(docids[pos])[wpos-4:wpos] 65 | right = webtext.words(docids[pos])[wpos:wpos+4] 66 | print left, right 67 | 68 | def bigram_freqdist(words): 69 | 
return nltk.FreqDist(["".join(w) 70 | for word in words 71 | for w in nltk.bigrams(word.lower())]) 72 | 73 | def ch03_43_translate(): 74 | from nltk.corpus import udhr 75 | en_fd = bigram_freqdist(udhr.words("English-Latin1")) 76 | fr_fd = bigram_freqdist(udhr.words("French_Francais-Latin1")) 77 | de_fd = bigram_freqdist(udhr.words("German_Deutsch-Latin1")) 78 | es_fd = bigram_freqdist(udhr.words("Spanish-Latin1")) 79 | inputs = ["Nice day", "Guten Tag", "Buenas Dias", "Tres Bien"] 80 | for input in inputs: 81 | words = input.lower().split(" ") 82 | # TODO: remove keys present in reference set 83 | ranks = map(lambda x : nltk.spearman_correlation(x, bigram_freqdist(words)), 84 | [en_fd, fr_fd, de_fd, es_fd]) 85 | print input, ranks 86 | 87 | def main(): 88 | # ch03_10() 89 | # ch03_18_wh_words() 90 | # ch03_29_reading_difficulty() 91 | # ch03_30_porter_vs_lancaster() 92 | ch03_42_wordnet_semantic_index() 93 | # ch03_43_translate() 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /src/book/ch04_ex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Writing structured programs 3 | 4 | from __future__ import division 5 | import nltk 6 | import re 7 | 8 | def bylen(x, y): 9 | return len(x) - len(y) 10 | 11 | def ch04_10_sort_words_by_length(words): 12 | return sorted(words, cmp=bylen) 13 | 14 | def gematrix_score(word): 15 | if word.isalpha(): 16 | letter_vals = {'a':1, 'b':2, 'c':3, 'd':4, 'e':5, 'f':80, 'g':3, 'h':8, 17 | 'i':10, 'j':10, 'k':20, 'l':30, 'm':40, 'n':50, 'o':70, 'p':80, 'q':100, 18 | 'r':200, 's':300, 't':400, 'u':6, 'v':6, 'w':800, 'x':60, 'y':10, 'z':7} 19 | return sum(map(lambda x : letter_vals[x], [c for c in word.lower()])) 20 | else: 21 | return 0 22 | 23 | def ch04_16_gematria_for_word(): 24 | state_union = nltk.corpus.state_union 25 | for fileid in state_union.fileids(): 26 | words = state_union.words(fileid) 27 | satanic_words = filter(lambda x : gematrix_score(x) == 666, words) 28 | if satanic_words > 0: 29 | print fileid, len(satanic_words) 30 | 31 | def ch04_17_shorten(words, n): 32 | fd = nltk.FreqDist(words) 33 | topterms = set() 34 | topterms.update(fd.keys()[0:n]) 35 | shortened = filter(lambda x : x not in topterms, words) 36 | return " ".join(shortened) 37 | 38 | def ch04_19_sort_by_path_sim(synsets, ref_synset): 39 | def by_pathsimilarity(x, y): 40 | diff = ref_synset.path_similarity(x) - ref_synset.path_similarity(y) 41 | if diff == 0: 42 | return 0 43 | elif diff < 0: 44 | return -1 45 | else: 46 | return 1 47 | return sorted(synsets, cmp=by_pathsimilarity, reverse=True) 48 | 49 | def insert_trie(keys): 50 | trie = nltk.defaultdict() 51 | [insert_trie_r(trie, key + "_") for key in keys] 52 | return trie 53 | 54 | def insert_trie_r(trie, key): 55 | if len(key) > 1: 56 | first, rest = key[0], key[1:] 57 | if first not in trie: 58 | trie[first] = {} 59 | insert_trie_r(trie[first], rest) 60 | else: 61 | trie[key] = {} 62 | 63 | def lookup_trie(trie, key): 64 | buf = [] 65 | return lookup_trie_r(trie, key + "_", buf) 66 | 67 | def lookup_trie_r(trie, key, buf): 68 | if len(key) > 1: 69 | first, rest = key[0], key[1:] 70 | if first not in trie: 71 | return None 72 | else: 73 | buf.append(first) 74 | return lookup_trie_r(trie[first], rest, buf) 75 | else: 76 | if key not in trie: 77 | return None 78 | else: 79 | return "".join(buf) 80 | 81 | def ch04_23_lookup_trie(): 82 | trie = insert_trie(["van", "vanity", 
"vanguard"]) 83 | print lookup_trie(trie, "van") 84 | print lookup_trie(trie, "vanguard") 85 | print lookup_trie(trie, "fidelity") 86 | 87 | #def ch04_24_keyword_linkage(): 88 | # print "TODO" 89 | 90 | def catalan1(n): 91 | if n == 0 or n == 1: 92 | return 1 93 | else: 94 | return sum([catalan1(i) * catalan1(n - i - 1) for i in range(0,n)]) 95 | 96 | def catalan2(cache, n): 97 | if n == 0 or n == 1: 98 | return 1 99 | try: 100 | return cache[n] 101 | except KeyError: 102 | cache[n] = sum([catalan1(i) * catalan1(n - i - 1) for i in range(0,n)]) 103 | return cache[n] 104 | 105 | def ch04_26_catalan_numbers(): 106 | import time 107 | cache = {} 108 | for i in range(0, 10): 109 | s1 = time.clock() 110 | cat1 = catalan1(i) 111 | s1 = time.clock() - s1 112 | s2 = time.clock() 113 | cat2 = catalan2(cache, i) 114 | s2 = time.clock() - s2 115 | print i, cat1, cat2, s1, s2 116 | 117 | #def ch04_27_author_identification(): 118 | # print "TODO" 119 | # 120 | #def ch04_28_gender_lexical_choice(): 121 | # print "TODO" 122 | # 123 | #def ch04_30_uniqueness_point_cutoff(): 124 | # print "TODO" 125 | # 126 | #def ch04_32_summarizer(): 127 | # print "TODO" 128 | # 129 | #def ch04_semantic_orientation_adjectives(): 130 | # print "TODO" 131 | # 132 | #def ch04_statistically_improbable_phrases(): 133 | # print "TODO" 134 | 135 | def main(): 136 | # print ch04_10_sort_words_by_length( 137 | # ["She", "sells", "sea", "shells", "by", "the", "seashore"]) 138 | 139 | # print ch04_16_gematria_for_word() 140 | 141 | # print ch04_17_shorten(nltk.corpus.state_union.words("2000-Clinton.txt"), 20) 142 | 143 | # from nltk.corpus import wordnet as wn 144 | # print ch04_19_sort_by_path_sim( 145 | # [wn.synset("minke_whale.n.01"), wn.synset("orca.n.01"), 146 | # wn.synset("novel.n.01"), wn.synset("tortoise.n.01")], 147 | # wn.synset("right_whale.n.01")) 148 | 149 | # ch04_23_lookup_trie() 150 | 151 | ch04_26_catalan_numbers() 152 | 153 | 154 | if __name__ == "__main__": 155 | main() 156 | -------------------------------------------------------------------------------- /src/book/ch05_ex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from __future__ import division 4 | from operator import itemgetter 5 | import nltk 6 | 7 | def ch05_1_3_tag_sentences(): 8 | sents = [ 9 | "British left waffles on Falkland Islands.", 10 | "Juvenile Court to try shooting defendant.", 11 | "They wind back the clock, while we chase after the wind." 
12 | ] 13 | for sent in sents: 14 | tokens = nltk.word_tokenize(sent) 15 | print nltk.pos_tag(tokens) 16 | 17 | def ch05_10_train_test_unigram_tagger(): 18 | from nltk.corpus import brown 19 | fd = nltk.FreqDist(brown.words(categories="news")) 20 | cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news")) 21 | most_freq_pos = dict((word, cfd[word].max()) for word in fd.keys()) 22 | unigram_tagger = nltk.UnigramTagger(model=most_freq_pos) 23 | for sent in brown.sents(categories="editorial")[:10]: 24 | tagged = unigram_tagger.tag(sent) 25 | print sent 26 | print ">>>", tagged 27 | print "not tagged: ", filter(lambda (a,b): b == None, tagged) 28 | 29 | def ch05_11_train_test_affix_tagger(): 30 | from nltk.corpus import brown 31 | fd = nltk.FreqDist(brown.words(categories="news")) 32 | cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news")) 33 | most_freq_pos = dict((word, cfd[word].max()) for word in fd.keys()) 34 | affix_tagger = nltk.AffixTagger(model=most_freq_pos) 35 | print affix_tagger.evaluate(brown.tagged_sents(categories="editorial")) 36 | 37 | def ch05_14_brown_corpus_tags_list(): 38 | from nltk.corpus import brown 39 | print sorted(set([t for (w, t) in brown.tagged_words()])) 40 | 41 | def ch05_15_brown_corpus_trivia(): 42 | from nltk.corpus import brown 43 | tagged_words = brown.tagged_words(categories="news") 44 | # which nouns are more common in plural form than singular? 45 | # NNS - plural, NN - singular. Calculate plural = singular + s 46 | s_nouns = [w for (w,t) in tagged_words if t == "NN"] 47 | plurals = set([w + "s" for w in s_nouns]) 48 | p_nouns = [w for (w,t) in tagged_words if t == "NNS" and w in plurals] 49 | s_fd = nltk.FreqDist(s_nouns) 50 | p_fd = nltk.FreqDist(p_nouns) 51 | print "words where singular > plural=", \ 52 | filter(lambda word: s_fd[word] < p_fd[word], p_fd.keys())[:50] 53 | # which word has the greatest number of distinct tags 54 | word_tags = nltk.defaultdict(lambda: set()) 55 | for word, token in tagged_words: 56 | word_tags[word].add(token) 57 | ambig_words = sorted([(k, len(v)) for (k, v) in word_tags.items()], 58 | key=itemgetter(1), reverse=True)[:50] 59 | print [(word, numtoks, word_tags[word]) for (word, numtoks) in ambig_words] 60 | # list top 20 (by frequency) tags 61 | token_fd = nltk.FreqDist([token for (word, token) in tagged_words]) 62 | print "top_tokens=", token_fd.keys()[:20] 63 | # which tags are nouns most commonly found after 64 | tagged_word_bigrams = nltk.bigrams(tagged_words) 65 | fd_an = nltk.FreqDist([t1 for (w1,t1),(w2,t2) 66 | in tagged_word_bigrams if t2.startswith("NN")]) 67 | print "nouns commonly found after these tags:", fd_an.keys() 68 | 69 | def ch05_17_lookup_tagger_performance_upper_limit(): 70 | # average percentage of words that are assigned the most likely 71 | # tokens for the word 72 | from nltk.corpus import brown 73 | cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news")) 74 | sum_of_avgs = 0 75 | for word in cfd.conditions(): 76 | mlt = reduce(lambda t1, t2: t1 if t1 > t2 else t2, cfd[word]) 77 | num_mlt_tags = cfd[word][mlt] 78 | num_all_tags = cfd[word].N() 79 | sum_of_avgs += num_mlt_tags / num_all_tags 80 | print "perf_upper_limit=", sum_of_avgs / len(cfd.conditions()) 81 | 82 | def ch05_18_brown_corpus_statistics(): 83 | from nltk.corpus import brown 84 | tagged_words = brown.tagged_words(categories="news") 85 | vocab_size = len(set([w for (w,t) in tagged_words])) 86 | cfd = nltk.ConditionalFreqDist(tagged_words) 87 | # proportion of word types always assigned 
the same part-of-speech 88 | # ie words with a single POS 89 | num_single_pos_words = sum(len(cfd[word].hapaxes()) 90 | for word in cfd.conditions()) 91 | print "prop of word types with single POS=", \ 92 | num_single_pos_words / vocab_size 93 | # how many words are ambiguous, ie with >= 2 POS tags 94 | ambig_words = [w for w in cfd.conditions() 95 | if len(filter(lambda x: cfd[w][x] >= 2, cfd[w].keys())) >= 2] 96 | num_ambig_words = len(ambig_words) 97 | print "prop of ambiguous words (>= 2 POS)=", \ 98 | num_ambig_words / vocab_size 99 | # percentage of word tokens in the brown corpus that involve 100 | # ambiguous words 101 | token_size = len(set([t for (w,t) in tagged_words])) 102 | unique_tokens = set() 103 | for w in ambig_words: 104 | unique_tokens.update(set([t for t in cfd[w].keys()])) 105 | print "prop of ambig tokens=", len(unique_tokens) / token_size 106 | 107 | def ch05_20_brown_corpus_words_phrases_by_tag(): 108 | from nltk.corpus import brown 109 | tagged_words = brown.tagged_words(categories="news") 110 | # produce alpha sorted list of distinct words tagged MD 111 | print sorted(set([w.lower() 112 | for (w,t) in filter(lambda (w,t): t == "MD", tagged_words)])) 113 | # identify words that can be plural (NRS, NPS*, NNS*) or 114 | # third person singular verbs (BEDZ*, BEZ*, DOZ*, *BEZ) 115 | # AND the ones ending with "s" 116 | print set([w for (w, t) in tagged_words 117 | if w.lower().endswith("s") and 118 | (t == "NRS" or t.startswith("NPS") 119 | or t.startswith("NPS") or t.startswith("NNS") 120 | or t.startswith("BEDZ") or t.startswith("BEZ") 121 | or t.startswith("DOZ") or t.endswith("BEZ"))]) 122 | # identify 3 word prepositional phrases IN+DET+NN 123 | tagged_word_trigrams = nltk.trigrams(tagged_words) 124 | print tagged_word_trigrams[:10] 125 | print set([" ".join([w1, w2, w3]) 126 | for (w1,t1), (w2,t2), (w3,t3) in tagged_word_trigrams 127 | if t1 == "IN" and t2 == "DET" and t3 == "NN"]) 128 | # ratio of masculine to feminine pronouns 129 | num_masc_pn = len([w for (w,t) in tagged_words if w.lower() == "he"]) 130 | num_fem_pn = len([w for (w,t) in tagged_words if w.lower() == "she"]) 131 | print "masc/fem = ", (num_masc_pn / num_fem_pn) 132 | 133 | def ch05_21_qualifiers_before_adore_love_like_prefer(): 134 | from nltk.corpus import brown 135 | tagged_words = brown.tagged_words(categories="news") 136 | tagged_word_bigrams = nltk.bigrams(tagged_words) 137 | allp = set(["adore", "love", "like", "prefer"]) 138 | print set([w for (w1,t1), (w2,t2) in tagged_word_bigrams 139 | if t1 == "QL" and w2.lower() in allp]) 140 | 141 | def ch05_22_regular_expression_tagger(): 142 | from nltk.corpus import brown 143 | tagged_sents = brown.tagged_sents(categories="news") 144 | patterns = [ # patterns copied from page 199 145 | (r".*s$", "NNS"), # plurals 146 | (r".*ing$", "VBG"), # gerund 147 | (r".*ould$", "MD"), # modal 148 | (r".*ed$", "VBD"), # verb past 149 | (r".*es$", "VBZ"), # 3rd person singular 150 | (r'.*', "NN") # fallback to noun 151 | ] 152 | tagger = nltk.RegexpTagger(patterns) 153 | print tagger.evaluate(tagged_sents) 154 | 155 | def ch05_27_collapse_tags_based_on_conf_matrix(): 156 | # TODO: run ch05.py:ambiguous_tags to get confusion matrix 157 | print "TODO" 158 | 159 | def ch05_30_bigram_tagger_low_freq_words_as_unk(): 160 | from nltk.corpus import brown 161 | # before UNK, check tagger score 162 | sents = brown.tagged_sents(categories="news") 163 | size = int(len(sents) * 0.9) 164 | train_sents = sents[:size] 165 | test_sents = sents[size:] 166 | tagger1 = 
nltk.BigramTagger(train_sents) 167 | print "before UNK, evaluate=", tagger1.evaluate(test_sents) 168 | # replace low freq words with UNK 169 | words = brown.tagged_words(categories="news") 170 | fd = nltk.FreqDist([w for (w,t) in words]) 171 | lfw = set([w for (w,t) in words if fd[w] <= 1]) 172 | sents2 = [] 173 | for sent in train_sents: 174 | sents2.append(map(lambda (w,t): ("UNK",t) if w in lfw else (w,t), sent)) 175 | tagger2 = nltk.BigramTagger(sents2) 176 | print "after UNK, evaluate=", tagger2.evaluate(test_sents) 177 | 178 | def ch05_32_brill_tagger(): 179 | # TODO: check out usage of brill tagger 180 | # also see # 40 181 | print "TODO" 182 | 183 | def ch05_33_list_pos_of_word_given_word_and_pos(): 184 | from nltk.corpus import brown 185 | tagged_words = brown.tagged_words(categories="news") 186 | tagged_word_bigrams = nltk.bigrams(tagged_words) 187 | dd = nltk.defaultdict(dict) 188 | for (w1,t1), (w2,t2) in tagged_word_bigrams: 189 | dd[w1][t1] = t2 190 | print dd 191 | 192 | def ch05_34_num_words_with_1to10_distinct_tags(): 193 | from nltk.corpus import brown 194 | tagged_words = brown.tagged_words(categories="news") 195 | # number of distinct tags and number of words in corpus for this 196 | dd = nltk.defaultdict(set) 197 | for w,t in tagged_words: 198 | dd[w].add(t) 199 | for i in range(1,10): 200 | print i, len(filter(lambda x: len(dd[x]) == i, dd.keys())) 201 | # for the word with greatest number of tags, print out concordance 202 | # one for each tag 203 | maxtags = 6 204 | word = None 205 | tags = None 206 | for w in dd.keys(): 207 | if len(dd[w]) >= maxtags: 208 | word = w 209 | tags = dd[w] 210 | break 211 | poss = [] 212 | pos = 0 213 | for w, t in tagged_words: 214 | if w == word and t in tags: 215 | poss.append((t, pos)) 216 | tags.remove(t) 217 | pos += 1 218 | for t, pos in poss: 219 | print t, " ".join(w for w,t in tagged_words[pos-10:pos+10]) 220 | 221 | def ch05_35_must_contexts(): 222 | from nltk.corpus import brown 223 | tagged_words = brown.tagged_words(categories="news") 224 | tagged_word_bigrams = nltk.bigrams(tagged_words) 225 | fd = nltk.FreqDist((w1,t2) for (w1,t1),(w2,t2) 226 | in tagged_word_bigrams 227 | if w1 == "must") 228 | for t in fd.keys(): 229 | print t, fd[t] 230 | # TODO: epistemic and deontic uses of must? 
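    # Sketch for the TODO above (heuristic assumption, not part of the original
    # exercise): "must" followed by be/have ("must be", "must have been") is
    # usually epistemic (inference), while "must" followed by another verb is
    # usually deontic (obligation). Brown tags BE*/HV* mark forms of be/have,
    # and fd keys are ("must", following_tag) pairs, so a rough split is:
    epistemic = sum(fd[k] for k in fd.keys()
                    if k[1].startswith("BE") or k[1].startswith("HV"))
    print "epistemic (must be/have)=", epistemic, \
        "deontic (other verbs)=", fd.N() - epistemic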
231 | 232 | def ch05_37_prev_token_tagger(): 233 | # TODO 234 | pass 235 | 236 | def ch05_39_statistical_tagger(): 237 | # TODO 238 | pass 239 | 240 | 241 | def main(): 242 | # ch05_1_3_tag_sentences() 243 | # ch05_10_train_test_unigram_tagger() 244 | # ch05_11_train_test_affix_tagger() 245 | # ch05_14_brown_corpus_tags_list() 246 | # ch05_15_brown_corpus_trivia() 247 | # ch05_17_lookup_tagger_performance_upper_limit() 248 | # ch05_18_brown_corpus_statistics() 249 | # ch05_20_brown_corpus_words_phrases_by_tag() 250 | # ch05_21_qualifiers_before_adore_love_like_prefer() 251 | # ch05_22_regular_expression_tagger() 252 | # ch05_30_bigram_tagger_low_freq_words_as_unk() 253 | # ch05_32_brill_tagger() 254 | # ch05_33_list_pos_of_word_given_word_and_pos() 255 | # ch05_34_num_words_with_1to10_distinct_tags() 256 | # ch05_35_must_contexts() 257 | ch05_36_tagger_training() 258 | print "end" 259 | 260 | if __name__ == "__main__": 261 | main() 262 | -------------------------------------------------------------------------------- /src/book/ch07.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Extracting information from text 3 | 4 | from __future__ import division 5 | import nltk 6 | import re 7 | 8 | def _ie_preprocess(document): 9 | sentences = nltk.sent_tokenize(document) 10 | sentences = [nltk.word_tokenizer(sent) for sent in sentences] 11 | sentences = [nltk.os_tag(sent) for sent in sentences] 12 | 13 | def simple_regex_based_np_chunker(): 14 | sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), 15 | ("dog", "NN"), ("barked", "VBD"), ("at", "IN"), ("the", "DT"), 16 | ("cat", "NN")] 17 | grammar = r""" 18 | NP : {?*} # determiner/possessive, adjective, noun 19 | {+} # sequences of proper nouns 20 | {+} # sequence of common nouns 21 | }+{ # Chink sequences of VBD and IN 22 | """ 23 | cp = nltk.RegexpParser(grammar) 24 | result = cp.parse(sentence) 25 | print result 26 | # result.draw() 27 | nouns = [("money", "NN"), ("market", "NN"), ("fund", "NN")] 28 | print cp.parse(nouns) 29 | sentence = [("Rapunzel", "NNP"), ("let", "VBD"), ("down", "RP"), 30 | ("her", "PP$"), ("long", "JJ"), ("golden", "JJ"), ("hair", "NN")] 31 | print cp.parse(sentence) 32 | 33 | def _find_chunks(pattern): 34 | print "======", pattern, "=======" 35 | cp = nltk.RegexpParser(r""" 36 | CHUNK: {%s} 37 | """ % (pattern)) 38 | brown = nltk.corpus.brown 39 | for sent in brown.tagged_sents(): 40 | tree = cp.parse(sent) 41 | for subtree in tree.subtrees(): 42 | if subtree.node == "CHUNK": 43 | print subtree 44 | 45 | def extract_pos_pattern_with_chunk_parser(): 46 | _find_chunks(" ") 47 | _find_chunks(" +") 48 | 49 | def iob_to_tree(): 50 | text = """ 51 | he PRP B-NP 52 | accepted VBD B-VP 53 | the DT B-NP 54 | position NN I-NP 55 | of IN B-PP 56 | vice NN B-NP 57 | chairman NN I-NP 58 | of IN B-PP 59 | Carlyle NNP B-NP 60 | Group NNP I-NP 61 | , , O 62 | a DT B-NP 63 | merchant NN I-NP 64 | banking NN I-NP 65 | concern NN I-NP 66 | . . 
O 67 | """ 68 | tree = nltk.chunk.conllstr2tree(text, chunk_types=["NP"]) 69 | print tree 70 | 71 | def read_chunked_corpus(): 72 | from nltk.corpus import conll2000 73 | print conll2000.chunked_sents("train.txt")[99] 74 | print conll2000.chunked_sents("train.txt", chunk_types = ["NP"])[99] 75 | 76 | def evaluate_chunker(): 77 | from nltk.corpus import conll2000 78 | cp = nltk.RegexpParser("") # baseline 79 | test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"]) 80 | print cp.evaluate(test_sents) 81 | grammar = r"NP: {<[CDJNP].*>+}" 82 | cp1 = nltk.RegexpParser(grammar) # naive tagger, look for all tags in NP chunk 83 | print cp1.evaluate(test_sents) 84 | 85 | class UnigramChunker(nltk.ChunkParserI): 86 | def __init__(self, train_sents): 87 | train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)] 88 | for sent in train_sents] 89 | self.tagger = nltk.UnigramTagger(train_data) 90 | # self.tagger = nltk.BigramTagger(train_data) # increase accuracy a bit 91 | 92 | def parse(self, sentence): 93 | pos_tags = [pos for (word,pos) in sentence] 94 | tagged_pos_tags = self.tagger.tag(pos_tags) 95 | chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags] 96 | conlltags = [(word, pos, chunktag) for ((word,pos),chunktag) 97 | in zip(sentence, chunktags)] 98 | return nltk.chunk.conlltags2tree(conlltags) 99 | 100 | def chunk_with_unigram_tagger(): 101 | # use unigram tagger to find the IOB tag given its POS tag 102 | from nltk.corpus import conll2000 103 | test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"]) 104 | train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"]) 105 | unigram_chunker = UnigramChunker(train_sents) 106 | print unigram_chunker.evaluate(test_sents) 107 | postags = sorted(set(pos for sent in train_sents 108 | for (word, pos) in sent.leaves())) 109 | print unigram_chunker.tagger.tag(postags) 110 | 111 | def _npchunk_features(sentence, i, history): 112 | features = {} 113 | word, pos = sentence[i] 114 | features["pos"] = pos 115 | # add previous POS tag 116 | prevword, prevpos = "", "" if i == 0 else sentence[i-1] 117 | features["prevpos"] = prevpos 118 | # add current word 119 | features["word"] = word 120 | # more features 121 | nextword, nextpos = "", "" if i == len(sentence) - 1 else sentence[i+1] 122 | features["nextpos"] = nextpos 123 | features["prevpos+pos"] = "%s+%s" % (prevpos, pos) 124 | features["pos+nextpos"] = "%s+%s" % (pos, nextpos) 125 | # tags since last determiner 126 | tags_since_dt = set() 127 | for word, pos in sentence[:i]: 128 | if pos == "DT": 129 | tags_since_dt = set() 130 | else: 131 | tags_since_dt.add(pos) 132 | features["tags_since_dt"] = "+".join(sorted(tags_since_dt)) 133 | return features 134 | 135 | class ConsecutiveNPChunkTagger(nltk.TaggerI): 136 | def __init__(self, train_sents): 137 | train_set = [] 138 | for tagged_sent in train_sents: 139 | untagged_sent = nltk.tag.untag(tagged_sent) 140 | history = [] 141 | for i, (word, tag) in enumerate(tagged_sent): 142 | featureset = _npchunk_features(untagged_sent, i, history) 143 | train_set.append((featureset, tag)) 144 | history.append(tag) 145 | self.classifier = nltk.MaxentClassifier.train(train_set, 146 | algorithm="GIS", trace=0) 147 | 148 | def tag(self, sentence): 149 | history = [] 150 | for i, word in enumerate(sentence): 151 | featureset = _npchunk_features(sentence, i, history) 152 | tag = self.classifier.classify(featureset) 153 | history.append(tag) 154 | return zip(sentence, history) 155 | 156 | class 
ConsecutiveNPChunker(nltk.ChunkParserI): 157 | def __init__(self, train_sents): 158 | tagged_sents = [[((w,t),c) for (w,t,c) in nltk.chunk.tree2conlltags(sent)] 159 | for sent in train_sents] 160 | self.tagger = ConsecutiveNPChunkTagger(tagged_sents) 161 | 162 | def parse(self, sentence): 163 | tagged_sents = self.tagger.tag(sentence) 164 | conlltags = [(w,t,c) for ((w,t),c) in tagged_sents] 165 | return nltk.chunk.conlltags2tree(conlltags) 166 | 167 | def train_classifier_based_chunker(): 168 | from nltk.corpus import conll2000 169 | test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"]) 170 | train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"]) 171 | chunker = ConsecutiveNPChunker(train_sents) 172 | print chunker.evaluate(test_sents) 173 | 174 | def recursive_chunk_parser(): 175 | grammar = r""" 176 | NP : {+} # chunk sentences of DT,JJ,NN 177 | PP : {} # chunk preposition followed by NP 178 | VP : {+$} # chunk verb and their argument 179 | CLAUSE : {} # chunk NP,VP 180 | """ 181 | cp = nltk.RegexpParser(grammar, loop=2) # parses sentence multiple times 182 | sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"), 183 | ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")] 184 | print cp.parse(sentence) 185 | sentence2 = [("John", "NNP"), ("thinks", "VBZ"), ("Mary", "NNP"), 186 | ("saw", "VBD"), ("the", "DT"), ("cat", "NN"), ("sit", "VB"), 187 | ("on", "IN"), ("the", "DT"), ("mat", "NN")] 188 | print cp.parse(sentence2) 189 | 190 | def _traverse(t): 191 | try: 192 | t.node 193 | except AttributeError: 194 | print t, 195 | else: 196 | print "(", t.node, 197 | for child in t: 198 | _traverse(child) 199 | print ")", 200 | 201 | def nltk_tree_handling(): 202 | # construction 203 | tree1 = nltk.Tree("NP", ["Alice"]) 204 | print "tree1=", tree1 205 | tree2 = nltk.Tree("NP", ["the", "rabbit"]) 206 | print "tree2=", tree2 207 | tree3 = nltk.Tree("VP", ["chased", tree2]) 208 | print "tree3=", tree3 209 | tree4 = nltk.Tree("S", [tree1, tree3]) 210 | print "tree4=", tree4 211 | # deconstruction 212 | print "tree4[1]=", tree4[1] 213 | print "tree4[1].node=", tree4[1].node, \ 214 | "tree4[1].leaves()=", tree4[1].leaves() 215 | print "tree4[1][1][1]=", tree4[1][1][1] 216 | _traverse(tree4) 217 | 218 | def named_entity_recognition(): 219 | # Gazetteers: Alexandria or Getty 220 | sent = nltk.corpus.treebank.tagged_sents()[22] 221 | print "NE (binary=True)", nltk.ne_chunk(sent, binary=True) 222 | print "indiv NE types (binary=False)", nltk.ne_chunk(sent) 223 | 224 | def relation_extraction(): 225 | IN = re.compile(r".*\bin\b(?!\b.+ing)") 226 | for doc in nltk.corpus.ieer.parsed_docs("NYT_19980315"): 227 | for rel in nltk.sem.extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN): 228 | print nltk.sem.show_raw_rtuple(rel) 229 | 230 | def relation_extraction2(): 231 | # needs POS as well as NE annotations (in Dutch) 232 | from nltk.corpus import conll2002 233 | vnv = """ 234 | ( 235 | is/V| # 3rd sing present and 236 | was/V| # past forms of the verm zijn (be) 237 | werd/V| # and also present 238 | wordt/V # past of worden (become) 239 | ).* # followed by anything 240 | van/Prep # followed by van (of) 241 | """ 242 | VAN = re.compile(vnv, re.VERBOSE) 243 | for doc in conll2002.chunked_sents("ned.train"): 244 | for r in nltk.sem.extract_rels("PER", "ORG", doc, 245 | corpus="conll2002", pattern=VAN): 246 | # print nltk.sem.show_clause(r, relsym="VAN") 247 | print nltk.sem.show_raw_rtuple(r, lcon=True, rcon=True) 248 | 249 | def main(): 250 | 
simple_regex_based_np_chunker() 251 | # extract_pos_pattern_with_chunk_parser() 252 | # iob_to_tree() 253 | # read_chunked_corpus() 254 | # evaluate_chunker() 255 | # chunk_with_unigram_tagger() 256 | # train_classifier_based_chunker() # TODO: finish running 257 | # recursive_chunk_parser() 258 | # nltk_tree_handling() 259 | # named_entity_recognition() 260 | # relation_extraction() 261 | # relation_extraction2() 262 | print "end" 263 | 264 | if __name__ == "__main__": 265 | main() 266 | -------------------------------------------------------------------------------- /src/book/ch07_ex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from __future__ import division 4 | import nltk 5 | import re 6 | 7 | def ch07_02_match_np_containing_plural_nouns(): 8 | grammar = r""" 9 | NP : { } 10 | """ 11 | sent = [("Many", "JJ"), ("researchers", "NNS"), ("two", "CD"), 12 | ("weeks", "NNS"), ("both", "DT"), ("new", "JJ"), ("positions", "NNS")] 13 | cp = nltk.RegexpParser(grammar) 14 | print cp.parse(sent) 15 | 16 | def ch07_03_develop_grammar_with_chunkparser(): 17 | # nltk.app.chunkparser() 18 | from nltk.corpus import conll2000 19 | grammar = r""" 20 | NP: {} 21 | {
} 22 | {
* } 23 | { * } 24 | { } 25 | """ 26 | cp = nltk.RegexpParser(grammar) 27 | for sentence in conll2000.chunked_sents("train.txt", chunk_types=["NP"]): 28 | print cp.parse(sentence) 29 | 30 | def ch07_05_tag_pattern_np_containing_gerund(): 31 | grammar = r""" 32 | NP: {<.*> } 33 | """ 34 | cp = nltk.RegexpParser(grammar) 35 | from nltk.corpus import brown 36 | tagged_sents = brown.tagged_sents(categories="news") 37 | for sent in tagged_sents: 38 | tree = str(cp.parse(sent)) 39 | if tree.find("(NP ") > -1: 40 | print tree 41 | 42 | def ch07_06_coordinated_noun_phrases(): 43 | from nltk.corpus import brown 44 | tagged_sents = brown.tagged_sents(categories="news") 45 | grammar = r""" 46 | NP_CC: { } 47 | {
} 48 | {+ } 49 | """ 50 | cp = nltk.RegexpParser(grammar) 51 | for sent in tagged_sents: 52 | tree = str(cp.parse(sent)) 53 | if tree.find("(NP_CC ") > -1: 54 | print tree 55 | 56 | def ch07_07_chunker_eval(): 57 | from nltk.corpus import conll2000 58 | grammar = r""" 59 | NP: {} 60 | {
} 61 | {
* } 62 | { * } 63 | { } 64 | """ 65 | cp = nltk.RegexpParser(grammar) 66 | test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"]) 67 | print cp.evaluate(test_sents) 68 | # print cp.chunkscore.missed() 69 | # print cp.chunkscore.incorrect() 70 | 71 | def ch07_13a_tag_seqs_for_np(): 72 | from nltk.corpus import conll2000 73 | train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"]) 74 | fdist = nltk.FreqDist() 75 | tagseq = [] 76 | for sent in train_sents: 77 | for word, postag, iobtag in nltk.chunk.tree2conlltags(sent): 78 | if iobtag == "B-NP": 79 | fdist.inc(" ".join(tagseq)) 80 | tagseq = [] 81 | tagseq.append(postag) 82 | elif iobtag == "O": 83 | continue 84 | else: 85 | tagseq.append(postag) 86 | for tagseq in fdist.keys(): 87 | print tagseq, fdist[tagseq] 88 | 89 | def ch07_13c_better_chunker(): 90 | # can be improved with more patterns from the top from previous method 91 | from nltk.corpus import conll2000 92 | grammar = r""" 93 | NP : {
} 94 | {
} 95 | { } 96 | {+} 97 | """ 98 | cp = nltk.RegexpParser(grammar) 99 | test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"]) 100 | print cp.evaluate(test_sents) 101 | 102 | def _chunk2brackets(sent): 103 | bracks = [] 104 | for wpi in nltk.chunk.tree2conllstr(sent).split("\n"): 105 | (word, pos, iob) = wpi.split(" ") 106 | bracks.append((word, pos)) 107 | return bracks 108 | 109 | def _chunk2iob(sent): 110 | iobs = [] 111 | for wpi in nltk.chunk.tree2conllstr(sent).split("\n"): 112 | (word, pos, iob) = wpi.split(" ") 113 | iobs.append((word, pos, iob)) 114 | return iobs 115 | 116 | def ch07_16a_penn_treebank(): 117 | from nltk.corpus import treebank_chunk 118 | for sent in treebank_chunk.chunked_sents("wsj_0001.pos"): 119 | print "sent=", sent 120 | print "chunk2brackets=", _chunk2brackets(sent) 121 | print "chunk2iob=", _chunk2iob(sent) 122 | 123 | def main(): 124 | # ch07_02_match_np_containing_plural_nouns() 125 | # ch07_03_develop_grammar_with_chunkparser() 126 | # ch07_05_tag_pattern_np_containing_gerund() 127 | # ch07_06_coordinated_noun_phrases() 128 | # ch07_07_chunker_eval() 129 | # ch07_13a_tag_seqs_for_np() 130 | # ch07_13c_better_chunker() 131 | ch07_16a_penn_treebank() 132 | 133 | if __name__ == "__main__": 134 | main() 135 | -------------------------------------------------------------------------------- /src/book/ch08.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Analyzing sentence structure 3 | 4 | from __future__ import division 5 | import nltk 6 | import re 7 | 8 | def sentence_parse_example(): 9 | groucho_grammar = nltk.parse_cfg(""" 10 | S -> NP VP 11 | PP -> P NP 12 | NP -> Det N | Det N PP | 'I' 13 | VP -> V NP | VP PP 14 | Det -> 'an' | 'my' 15 | N -> 'elephant' | 'pajamas' 16 | V -> 'shot' 17 | P -> 'in' 18 | """) 19 | sent = ["I", "shot", "an", "elephant", "in", "my", "pajamas"] 20 | parser = nltk.ChartParser(groucho_grammar) 21 | trees = parser.nbest_parse(sent) 22 | for tree in trees: 23 | print tree 24 | 25 | def simple_cfg(): 26 | # grammar = nltk.parse_cfg(""" 27 | # S -> NP VP 28 | # VP -> V NP | V NP PP 29 | # PP -> P NP 30 | # V -> "saw" | "ate" | "walked" 31 | # NP -> "John" | "Mary" | "Bob" | Det N | Det N PP 32 | # Det -> "a" | "an" | "the" | "my" 33 | # N -> "man" | "dog" | "cat" | "telescope" | "park" 34 | # P -> "in" | "on" | "by" | "with" 35 | # """) 36 | # also can load grammar from text file 37 | # grammar = nltk.data.load("file:mygrammar.cfg") 38 | grammar = nltk.parse_cfg(""" 39 | S -> NP VP 40 | NP -> Det Nom | PropN 41 | Nom -> Adj Nom | N 42 | VP -> V Adj | V NP | V S | V NP PP 43 | PP -> P NP 44 | PropN -> 'Buster' | 'Chatterer' | 'Joe' 45 | Det -> 'the' | 'a' 46 | N -> 'bear' | 'squirrel' | 'tree' | 'fish' | 'log' 47 | Adj -> 'angry' | 'frightened' | 'little' | 'tall' 48 | V -> 'chased' | 'saw' | 'said' | 'thought' | 'was' | 'put' 49 | P -> 'on' 50 | """) 51 | # sent = "Mary saw Bob".split() 52 | # structural ambiguity - 2 parse trees for this. 53 | # prepositional phrase attach ambiguity. 
54 | # sent = "the dog saw a man in a park".split() 55 | # For second grammar 56 | # sent = "the angry bear chased the frightened little squirrel".split() 57 | sent = "Chatterer said Buster thought the tree was tall".split() 58 | # rd_parser = nltk.RecursiveDescentParser(grammar, trace=2) # for debug 59 | # NOTE: production rules need to be right-recursive, ie X -> Y X 60 | rd_parser = nltk.RecursiveDescentParser(grammar) 61 | for tree in rd_parser.nbest_parse(sent): 62 | print tree 63 | 64 | # recursive descent parsing - top down 65 | # nltk.app.rdparser() - recursive descent demo 66 | # shortcomings - left recursive productions result in infinite loop 67 | # parser wastes time considering paths that it discards 68 | # backtracking discards what may need to be rebuilt 69 | # shift-reduce - bottom up 70 | # nltk.app.srparser() - demo 71 | # can reach dead end and fail to find a parse 72 | # with Lookahead LR parser 73 | # only builds structure corresponding to word in input. 74 | # left-corner filtering - top down w/ bottom up filtering 75 | # each production is stored along with its left corner element on RHS 76 | # eg, S -> NP VP; VP -> V NP | ... => (S,NP), (VP,V), ... 77 | # chart parsing - dynamic programming 78 | # nltk.app.chartparser() 79 | def parsing_types(): 80 | grammar = nltk.parse_cfg(""" 81 | S -> NP VP 82 | VP -> V NP | V NP PP 83 | PP -> P NP 84 | V -> "saw" | "ate" | "walked" 85 | NP -> "John" | "Mary" | "Bob" | Det N | Det N PP 86 | Det -> "a" | "an" | "the" | "my" 87 | N -> "man" | "dog" | "cat" | "telescope" | "park" 88 | P -> "in" | "on" | "by" | "with" 89 | """) 90 | sent = "Mary saw a dog".split() 91 | rd_parser = nltk.RecursiveDescentParser(grammar) 92 | print "==== recursive descent ====" 93 | for t in rd_parser.nbest_parse(sent): 94 | print t 95 | sr_parser = nltk.ShiftReduceParser(grammar) 96 | print "==== shift reduce ====" 97 | for t in sr_parser.nbest_parse(sent): 98 | print t 99 | 100 | def _chart_init_wfst(tokens, grammar): 101 | numtokens = len(tokens) 102 | wfst = [[None for i in range(numtokens+1)] for j in range(numtokens+1)] 103 | for i in range(numtokens): 104 | productions = grammar.productions(rhs = tokens[i]) 105 | wfst[i][i+1] = productions[0].lhs() 106 | return wfst 107 | 108 | def _chart_complete_wfst(wfst, tokens, grammar, trace=False): 109 | index = dict((p.rhs(), p.lhs()) for p in grammar.productions()) 110 | numtokens = len(tokens) 111 | for span in range(2, numtokens+1): 112 | for start in range(numtokens+1-+span): 113 | end = start + span 114 | for mid in range(start+1, end): 115 | nt1, nt2 = wfst[start][mid], wfst[mid][end] 116 | if nt1 and nt2 and (nt1,nt2) in index: 117 | wfst[start][end] = index[(nt1, nt2)] 118 | if trace: 119 | print "[%s] %3s [%s] %3s [%s] ==> [%s] %3s [%s]" % \ 120 | (start, nt1, mid, nt2, end, start, index[(nt1,nt2)], end) 121 | return wfst 122 | 123 | def _chart_display(wfst, tokens): 124 | print "\nWFST " + " ".join([("%-4d" %i) for i in range(1, len(wfst))]) 125 | for i in range(len(wfst)-1): 126 | print "%-4d" % i, 127 | for j in range(1, len(wfst)): 128 | print "%-4s" % (wfst[i][j] or "."), 129 | print 130 | 131 | def chart_parsing(): 132 | groucho_grammar = nltk.parse_cfg(""" 133 | S -> NP VP 134 | PP -> P NP 135 | NP -> Det N | Det N PP | 'I' 136 | VP -> V NP | VP PP 137 | Det -> 'an' | 'my' 138 | N -> 'elephant' | 'pajamas' 139 | V -> 'shot' 140 | P -> 'in' 141 | """) 142 | tokens = "I shot an elephant in my pajamas".split() 143 | wfst0 = _chart_init_wfst(tokens, groucho_grammar) 144 | _chart_display(wfst0, 
tokens) 145 | wfst1 = _chart_complete_wfst(wfst0, tokens, groucho_grammar, trace=True) 146 | _chart_display(wfst1, tokens) 147 | 148 | def dependency_grammar(): 149 | groucho_dep_grammar = nltk.parse_dependency_grammar(""" 150 | 'shot' -> 'I' | 'elephant' | 'in' 151 | 'elephant' -> 'an' | 'in' 152 | 'in' -> 'pajamas' 153 | 'pajamas' -> 'my' 154 | """) 155 | print groucho_dep_grammar 156 | pdp = nltk.ProjectiveDependencyParser(groucho_dep_grammar) 157 | sent = "I shot an elephant in my pajamas".split() 158 | trees = pdp.parse(sent) 159 | for tree in trees: 160 | # tree.draw() 161 | print tree 162 | 163 | def _grammar_filter(tree): 164 | child_nodes = [child.node for child in tree 165 | if isinstance(child, nltk.Tree)] 166 | return (tree.node == "VP") and ("S" in child_nodes) 167 | 168 | def grammar_development_with_treebank(): 169 | from nltk.corpus import treebank 170 | t = treebank.parsed_sents("wsj_0001.mrg")[0] 171 | print t 172 | print "identify verbs for SV in VP -> SV S", [subtree for tree 173 | in treebank.parsed_sents() 174 | for subtree in tree.subtrees(_grammar_filter)] 175 | 176 | def word_valency(): 177 | table = nltk.defaultdict(lambda: nltk.defaultdict(set)) 178 | entries = nltk.corpus.ppattach.attachments("training") 179 | for entry in entries: 180 | # print entry 181 | key = entry.noun1 + "-" + entry.prep + "-" + entry.noun2 182 | table[key][entry.attachment].add(entry.verb) 183 | for key in sorted(table): 184 | if len(table[key]) > 1: 185 | print key, "N:", sorted(table[key]["N"]), "V:", sorted(table[key]["V"]) 186 | 187 | def _give_give(t): 188 | return t.node == "VP" and len(t) > 3 and t[1].node == "NP" and \ 189 | (t[2].node == "PP-DIV" or t[2].node == "NP") and \ 190 | ("give" in t[0].leaves() or "gave" in t[0].leaves()) 191 | 192 | def _give_sent(t): 193 | return " ".join(token for token in t.leaves() if token[0] not in "*-O") 194 | 195 | def _give_print_node(t, width): 196 | output = "%s %s: %s / %s: %s" % \ 197 | (_give_sent(t[0]), t[1].node, _give_sent(t[1]), t[2].node, _give_sent(t[2])) 198 | if len(output) > width: 199 | output = output[:width] + "..." 200 | print output 201 | 202 | def give_gave_usage(): 203 | # Kim gave a bone to the dog VS Kim gave the dog a bone (equally likely) 204 | # Kim gives the heebie-jeebies to me LESS LIKELY THAN 205 | # Kim gives me the heebie-jeebies. 
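    # The loop below just prints every matching VP. A possible follow-up
    # (sketch only, not in the original): tally the category of the second
    # complement to quantify how often each frame occurs, e.g.
    #   frame_fd = nltk.FreqDist(t[2].node
    #       for tree in nltk.corpus.treebank.parsed_sents()
    #       for t in tree.subtrees(_give_give))
    # where "NP" counts double-object uses ("gave the dog a bone") and the
    # PP label counts to-datives ("gave a bone to the dog").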
206 | for tree in nltk.corpus.treebank.parsed_sents(): 207 | for t in tree.subtrees(_give_give): 208 | _give_print_node(t, 72) 209 | 210 | def pcfg_parser(): 211 | # grammar = nltk.parse_pcfg(""" 212 | # S -> NP VP [1.0] 213 | # VP -> TV NP [0.4] 214 | # VP -> IV [0.3] 215 | # VP -> DatV NP NP [0.3] 216 | # TV -> 'saw' [1.0] 217 | # IV -> 'ate' [1.0] 218 | # DatV -> 'gave' [1.0] 219 | # NP -> 'telescopes' [0.8] 220 | # NP -> 'Jack' [0.2] 221 | # """) 222 | # alternative repr, or clause probs must sum to 1 223 | grammar = nltk.parse_pcfg(""" 224 | S -> NP VP [1.0] 225 | VP -> TV NP [0.4] | IV [0.3] | DatV NP NP [0.3] 226 | TV -> 'saw' [1.0] 227 | IV -> 'ate' [1.0] 228 | DatV -> 'gave' [1.0] 229 | NP -> 'telescopes' [0.8] 230 | NP -> 'Jack' [0.2] 231 | """) 232 | print grammar 233 | viterbi_parser = nltk.ViterbiParser(grammar) 234 | print viterbi_parser.parse("Jack saw telescopes".split()) 235 | 236 | def main(): 237 | # sentence_parse_example() 238 | # simple_cfg() 239 | # parsing_types() 240 | # chart_parsing() 241 | # dependency_grammar() 242 | # grammar_development_with_treebank() 243 | # word_valency() 244 | # give_gave_usage() 245 | pcfg_parser() 246 | print "end" 247 | 248 | if __name__ == "__main__": 249 | main() 250 | -------------------------------------------------------------------------------- /src/book/ch09.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Building feature based grammars 3 | from __future__ import division 4 | import nltk 5 | import re 6 | 7 | def _grammatical_lex2fs(word): 8 | kim = {"CAT": "NP", "ORTH": "Kim", "REF": "k"} 9 | chase = {"CAT": "V", "ORTH": "chased", "REL": "chase"} 10 | lee = {"CAT": "NP", "ORTH": "Lee", "REF": "l"} 11 | for fs in [kim, lee, chase]: 12 | if fs["ORTH"] == word: 13 | return fs 14 | 15 | def grammatical_features(): 16 | tokens = "Kim chased Lee".split() 17 | subj, verb, obj = _grammatical_lex2fs(tokens[0]), \ 18 | _grammatical_lex2fs(tokens[1]), _grammatical_lex2fs(tokens[2]) 19 | verb["AGT"] = subj["REF"] # agent of chase is Kim 20 | verb["PAT"] = obj["REF"] # patient of chase is Lee 21 | for k in ["ORTH", "REL", "AGT", "PAT"]: 22 | print "%-5s => %s" % (k, verb[k]) 23 | 24 | def the_dog_runs(): 25 | grammar1 = """ 26 | S -> NP VP 27 | NP -> Det N 28 | VP - V 29 | Det -> 'this' 30 | N -> 'dog' 31 | V -> 'runs' 32 | """ 33 | grammar2 = """ 34 | S -> NP_SG VP_SG 35 | S -> NP_PL VP_PL 36 | NP_SG -> Det_SG N_SG 37 | NP_PL -> Det_PL N_PL 38 | VP_SG -> V_SG 39 | VP_PL -> V_PL 40 | Det_SG -> 'this' 41 | Det_PL -> 'these' 42 | N_SG -> 'dog 43 | N_PL -> 'dogs' 44 | V_SG -> 'runs' 45 | V_PL -> 'run' 46 | """ 47 | grammar3 = """ 48 | S -> NP[NUM=?n] VP[NUM=?n] 49 | S -> NP_PL VP_PL 50 | NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n] 51 | NP_PL -> Det_PL N_PL 52 | VP[NUM=?n] -> V[NUM=?n] 53 | VP_PL -> V_PL 54 | Det_SG -> 'this' 55 | Det_PL -> 'these' 56 | N_SG -> 'dog 57 | N_PL -> 'dogs' 58 | V_SG -> 'runs' 59 | V_PL -> 'run' 60 | """ 61 | 62 | def sample_grammar(): 63 | nltk.data.show_cfg("grammars/book_grammars/feat0.fcfg") 64 | tokens = "Kim likes children".split() 65 | from nltk import load_parser 66 | cp = load_parser("grammars/book_grammars/feat0.fcfg", trace=2) 67 | trees = cp.nbest_parse(tokens) 68 | 69 | def feature_structures(): 70 | fs1 = nltk.FeatStruct(TENSE='past', NUM='sg') 71 | print "fs1=", fs1 72 | print "fs1[TENSE]=", fs1['TENSE'] 73 | fs1['CASE'] = 'acc' 74 | fs2 = nltk.FeatStruct(POS='N', AGR=fs1) 75 | print "fs2=", fs2 76 | person = nltk.FeatStruct(name='Lee', 
telno='212 444 1212', age=33) 77 | print "person=", person 78 | print nltk.FeatStruct(""" 79 | [NAME='Lee', ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'], 80 | SPOUSE=[Name='Kim', ADDRESS->(1)]] 81 | """) 82 | 83 | def feature_structure_unification(): 84 | fs1 = nltk.FeatStruct(NUMBER=74, STREE='rue Pascal') 85 | fs2 = nltk.FeatStruct(CITY='Paris') 86 | print fs1.unify(fs2) 87 | # result of unification if fs1 subsumes fs2 or vice versa, the more 88 | # specific of the two. 89 | fs0 = nltk.FeatStruct(""" 90 | [NAME='Lee', ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'], 91 | SPOUSE=[Name='Kim', ADDRESS->(1)]] 92 | """) 93 | print "fs0=", fs0 94 | fs1 = nltk.FeatStruct("[SPOUSE=[ADDRESS=[CITY=Paris]]]") 95 | print fs1.unify(fs0) 96 | print "fs1=", fs1 97 | fs2 = nltk.FeatStruct(""" 98 | [NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'], 99 | SPOUSE=[NAME=Kim, ADRRESS->(1)]] 100 | """) 101 | print "fs1.unify(fs2)=", fs1.unify(fs2) 102 | fs3 = nltk.FeatStruct("[ADDRESS=?x, ADDRESS2=?x]") 103 | print "fs2.unify(fs3)=", fs2.unify(fs3) 104 | 105 | def sentence_parsing(): 106 | # tokens = "who do you claim that you like".split() 107 | # tokens = "you claim that you like cats".split() 108 | tokens = "rarely do you sing".split() 109 | from nltk import load_parser 110 | cp = load_parser("grammars/book_grammars/feat1.fcfg") 111 | for tree in cp.nbest_parse(tokens): 112 | print tree 113 | tree.draw() 114 | 115 | def main(): 116 | # grammatical_features() 117 | # the_dog_runs() 118 | # sample_grammar() 119 | # feature_structures() 120 | # feature_structure_unification() 121 | sentence_parsing() 122 | print "end" 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /src/book/ch10.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Analyzing meaning of sentences 3 | 4 | from __future__ import division 5 | import nltk 6 | import re 7 | 8 | def english_to_sql(): 9 | nltk.data.show_cfg("grammars/book_grammars/sql0.fcfg") 10 | from nltk import load_parser 11 | cp = load_parser("grammars/book_grammars/sql0.fcfg", trace=3) 12 | query = "What cities are located in China" 13 | trees = cp.nbest_parse(query.split()) 14 | answer = trees[0].node['SEM'] 15 | q = " ".join(answer) 16 | print q 17 | from nltk.sem import chat80 18 | rows = chat80.sql_query('corpora/city_database/city.db', q) 19 | for r in rows: 20 | print r[0], 21 | 22 | def logic_parser(): 23 | lp = nltk.LogicParser() 24 | SnF = lp.parse('SnF') 25 | NotFnS = lp.parse('-FnS') 26 | R = lp.parse('SnF -> -FnS') 27 | # prover = nltk.Prover9() 28 | # print prover.prove(NotFnS, [SnF, R]) 29 | val = nltk.Valuation([('P',True), ('Q', True), ('R', False)]) 30 | dom = set([]) 31 | g = nltk.Assignment(dom) 32 | m = nltk.Model(dom, val) 33 | print "eval(P&Q)=", m.evaluate('(P & Q)', g) 34 | print "eval -(P&Q)=", m.evaluate('-(P & Q)', g) 35 | print "eval(P&R)=", m.evaluate('(P & R)', g) 36 | print "eval(-(P|R))=", m.evaluate('-(P | R)', g) 37 | 38 | def first_order_logic(): 39 | tlp = nltk.LogicParser(type_check=True) 40 | sig = {"walk": ""} 41 | parsed = tlp.parse("walk(angus)", sig) 42 | print "parsed_arg(value,type)=", parsed.argument, parsed.argument.type 43 | print "parsed_func(value,type)=", parsed.function, parsed.function.type 44 | 45 | def truth_model(): 46 | domain = set(['b', 'o', 'c']) 47 | v = """ 48 | bertie => b 49 | olive => o 50 | cyril => c 51 | boy => {b} 52 | girl => {o} 53 | dog => {c} 54 | walk => {o, 
c} 55 | see => {(b,o), (c,b), (o,c)} 56 | """ 57 | val = nltk.parse_valuation(v) 58 | print val 59 | print ('o', 'c') in val["see"] 60 | print ('b',) in val["boy"] 61 | g = nltk.Assignment(domain, [('x', 'o'), ('y', 'c')]) 62 | model = nltk.Model(domain, val) 63 | print "model.evaluate=", model.evaluate("see(olive,y)", g) 64 | 65 | def main(): 66 | # english_to_sql() 67 | # logic_parser() 68 | # first_order_logic() 69 | truth_model() 70 | print "end" 71 | 72 | if __name__ == "__main__": 73 | main() 74 | -------------------------------------------------------------------------------- /src/brown_dict/dict_build.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | 4 | DELIM = "_|_" 5 | NORMED_TAGS = ["NN", "VB", "JJ", "RB", "DT", "IN", "OT"] 6 | POSTAGS = { 7 | "NN" : "noun", 8 | "VB" : "verb", 9 | "JJ" : "adjective", 10 | "RB" : "adverb", 11 | "DT" : "determiner", 12 | "IN" : "preposition", 13 | "OT" : "other" 14 | } 15 | 16 | def normalize_brown_postags(): 17 | brown_tags = open("../../data/brown_dict/brown_tags.csv", 'rb') 18 | tag_map = dict() 19 | for line in brown_tags: 20 | line = line.strip() 21 | if len(line) == 0 or line.startswith("#"): 22 | continue 23 | tag_name, tag_description = line.split("\t")[0:2] 24 | tag_desc_words = set(nltk.word_tokenize(tag_description.lower())) 25 | is_tagged = False 26 | for normed_tag in NORMED_TAGS[:-1]: 27 | desc_pattern = POSTAGS[normed_tag] 28 | if desc_pattern in tag_desc_words: 29 | tag_map[tag_name] = normed_tag 30 | is_tagged = True 31 | if not is_tagged: 32 | tag_map[tag_name] = "OT" 33 | brown_tags.close() 34 | return tag_map 35 | 36 | def retag_brown_words(tag_map): 37 | wordpos_fd = nltk.FreqDist() 38 | for word, tag in brown.tagged_words(): 39 | if tag_map.has_key(tag): 40 | normed_pos = tag_map[tag] 41 | retagged_word = DELIM.join([word.lower(), normed_pos]) 42 | wordpos_fd.inc(retagged_word) 43 | return wordpos_fd 44 | 45 | def compose_record(word, wordpos_fd): 46 | freqs = [] 47 | for tag in NORMED_TAGS: 48 | wordpos = DELIM.join([word, tag]) 49 | freqs.append(wordpos_fd[wordpos]) 50 | sum_freqs = float(sum(freqs)) 51 | nf = [float(f) / sum_freqs for f in freqs] 52 | return "%s\t%s\n" % (word, "\t".join(["%5.3f" % (x) for x in nf])) 53 | 54 | 55 | tag_map = normalize_brown_postags() 56 | wordpos_fd = retag_brown_words(tag_map) 57 | already_seen_words = set() 58 | brown_dict = open("../../data/brown_dict/brown_dict.csv", 'wb') 59 | brown_dict.write("#WORD\t%s\n" % ("\t".join(NORMED_TAGS))) 60 | for wordpos in wordpos_fd.keys(): 61 | word, tag = wordpos.split(DELIM) 62 | if word in already_seen_words: 63 | continue 64 | brown_dict.write(compose_record(word, wordpos_fd)) 65 | already_seen_words.add(word) 66 | brown_dict.close() 67 | -------------------------------------------------------------------------------- /src/brown_dict/phrase_seqs.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import nltk 3 | import numpy as np 4 | 5 | from nltk.corpus import treebank_chunk 6 | 7 | NORMTAGS = ["NN", "VB", "JJ", "RB", "DT", "IN", "OT"] 8 | POSTAGS = { 9 | "NN" : "noun", 10 | "VB" : "verb", 11 | "JJ" : "adjective", 12 | "RB" : "adverb", 13 | "DT" : "determiner", 14 | "IN" : "preposition", 15 | "OT" : "other" 16 | } 17 | 18 | def normalize_ptb_tags(): 19 | tag_map = {} 20 | ptb_tags = open("../../data/brown_dict/ptb_tags.csv", 'rb') 21 | for line in ptb_tags: 22 | line = line.strip() 23 
| if len(line) == 0 or line.startswith("#"): 24 | continue 25 | tag, desc = line.split("\t") 26 | desc_words = nltk.word_tokenize(desc.lower().replace("-", " ")) 27 | is_tagged = False 28 | for key in NORMTAGS[:-1]: 29 | postag_desc = POSTAGS[key] 30 | if postag_desc in desc_words: 31 | tag_map[tag] = key 32 | is_tagged = True 33 | if not is_tagged: 34 | tag_map[tag] = "OT" 35 | ptb_tags.close() 36 | return tag_map 37 | 38 | def get_chunks(tree, phrase_type, tags): 39 | try: 40 | tree.node 41 | except AttributeError: 42 | return 43 | else: 44 | if tree.node == phrase_type: 45 | tags.append(tree) 46 | else: 47 | for child in tree: 48 | get_chunks(child, phrase_type, tags) 49 | 50 | def index_of(tag): 51 | if tag == "START": 52 | return 0 53 | elif tag == "END": 54 | return len(NORMTAGS) + 1 55 | else: 56 | return NORMTAGS.index(tag) + 1 57 | 58 | def update_trans_freqs(trans_freqs, tag_seq): 59 | tags = ["START"] 60 | tags.extend(tag_seq.split(" ")) 61 | tags.append("END") 62 | bigrams = nltk.bigrams(tags) 63 | for bigram in bigrams: 64 | row = index_of(bigram[0]) 65 | col = index_of(bigram[1]) 66 | trans_freqs[row, col] += 1 67 | 68 | # generate phrases as a sequence of (normalized) POS tags and 69 | # transition probabilities across POS tags. 70 | tag_map = normalize_ptb_tags() 71 | np_fd = nltk.FreqDist() 72 | trans_freqs = np.zeros((len(NORMTAGS) + 2, len(NORMTAGS) + 2)) 73 | for tree in treebank_chunk.chunked_sents(): 74 | chunks = [] 75 | get_chunks(tree, "NP", chunks) 76 | for chunk in chunks: 77 | tagged_poss = [tagged_word[1] for tagged_word in chunk] 78 | normed_tags = [] 79 | for tagged_pos in tagged_poss: 80 | try: 81 | normed_tags.append(tag_map[tagged_pos]) 82 | except KeyError: 83 | normed_tags.append("OT") 84 | np_fd.inc(" ".join(normed_tags)) 85 | 86 | fout = open("../../data/brown_dict/np_tags.csv", 'wb') 87 | for tag_seq in np_fd.keys(): 88 | fout.write("%s\t%d\n" % (tag_seq, np_fd[tag_seq])) 89 | update_trans_freqs(trans_freqs, tag_seq) 90 | fout.close() 91 | # normalize so they are all probablities (by row sum) 92 | trans_probs = trans_freqs / np.linalg.norm(trans_freqs, axis=1)[:, np.newaxis] 93 | trans_probs[~np.isfinite(trans_probs)] = 0.0 94 | np.savetxt("../../data/brown_dict/pos_trans.csv", trans_probs, fmt="%7.5f", delimiter="\t") 95 | -------------------------------------------------------------------------------- /src/brown_dict/predict.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import nltk 4 | 5 | NORMTAGS = ["NN", "VB", "JJ", "RB", "DT", "IN", "OT"] 6 | 7 | def load_word_dict(dict_file): 8 | word_dict = {} 9 | fdict = open(dict_file, "rb") 10 | for line in fdict: 11 | line = line.strip() 12 | if len(line) == 0 or line.startswith("#"): 13 | continue 14 | cols = line.split("\t") 15 | word = cols[0] 16 | probs = [float(x) for x in cols[1:]] 17 | word_dict[word] = probs 18 | fdict.close() 19 | return word_dict 20 | 21 | def load_phrase_tags(phrase_tag_file): 22 | phrase_tags = set() 23 | ftags = open(phrase_tag_file, 'rb') 24 | for line in ftags: 25 | line = line.strip() 26 | if len(line) == 0 or line.startswith("#"): 27 | continue 28 | phrase, count = line.split("\t") 29 | phrase_tags.add(phrase) 30 | ftags.close() 31 | return phrase_tags 32 | 33 | def assert_true(fn, message): 34 | if fn != True: 35 | print "Assert failed:", message 36 | 37 | def tag_to_index(tag): 38 | if tag == "START": 39 | return 0 40 | elif tag == "END": 41 | return len(NORMTAGS) + 1 42 
| else: 43 | return NORMTAGS.index(tag) + 1 44 | 45 | def index_to_tag(index): 46 | if index == 0: 47 | return "START" 48 | elif index == len(NORMTAGS) + 1: 49 | return "END" 50 | else: 51 | return NORMTAGS[index - 1] 52 | 53 | def predict_likely_pos(prev_tag, trans_probs): 54 | row = tag_to_index(prev_tag) 55 | probs = trans_probs[row, :] 56 | return index_to_tag(np.argmax(probs)) 57 | 58 | def predict_pos(word, word_dict): 59 | if word_dict.has_key(word): 60 | probs = np.array(word_dict[word]) 61 | return NORMTAGS[np.argmax(probs)] 62 | else: 63 | return "OT" 64 | 65 | def predict_if_noun(word, word_dict): 66 | return predict_pos(word, word_dict) == "NN" 67 | 68 | def predict_if_noun_phrase(phrase, trans_probs, phrase_tags): 69 | words = nltk.word_tokenize(phrase) 70 | tags = [] 71 | for word in words: 72 | if word_dict.has_key(word): 73 | tags.append(predict_pos(word, word_dict)) 74 | else: 75 | prev_tag = "START" if len(tags) == 0 else tags[-1] 76 | tags.append(predict_likely_pos(prev_tag, trans_probs)) 77 | return " ".join(tags) in phrase_tags 78 | 79 | # test cases for individual words 80 | word_dict = load_word_dict("../../data/brown_dict/brown_dict.csv") 81 | assert_true(predict_if_noun("hypothalamus", word_dict), "Hypothalamus == NOUN!") 82 | assert_true(not predict_if_noun("intermediate", word_dict), "Intermediate != NOUN!") 83 | assert_true(predict_if_noun("laugh", word_dict), "Laugh ~= NOUN!") 84 | 85 | # test cases for phrases 86 | phrase_tags = load_phrase_tags("../../data/brown_dict/np_tags.csv") 87 | trans_probs = np.loadtxt("../../data/brown_dict/pos_trans.csv", delimiter="\t") 88 | assert_true(predict_if_noun_phrase("time flies", trans_probs, phrase_tags), 89 | "time flies == NP!") 90 | assert_true(not predict_if_noun_phrase("were spoken", trans_probs, phrase_tags), 91 | "were spoken == VP!") 92 | 93 | -------------------------------------------------------------------------------- /src/cener/bootstrap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from nltk.tokenize import sent_tokenize 4 | from nltk.tokenize import word_tokenize 5 | import re 6 | 7 | stopwords = set(["The", "This", "Though", "While", 8 | "Using", "It", "Its", "A", "An", "As", "Now", 9 | "At", "But", "Although", "Am", "Perhaps", 10 | "January", "February", "March", "April", "May", "June", 11 | "July", "August", "September", "October", "November", "December"]) 12 | 13 | def iotag(token): 14 | # remove stopwords 15 | if token in stopwords: 16 | return False 17 | if (re.match("^[A-Z].*", token) or 18 | re.match("^[a-z][A-Z].*", token) or 19 | re.search("[0-9]", token) or 20 | token == ",s"): 21 | return True 22 | else: 23 | return False 24 | 25 | # if current iotag == "I" and (prev iotag == "I" or next iotag == "I" 26 | # then keep the iotag value else flip it 27 | def modify_tags(pairs): 28 | output_tags = [] 29 | idx = 0 30 | for pair in pairs: 31 | if pair[1]: 32 | if idx == 0: 33 | output_tags.append((pair[0], pair[1] and pairs[idx+1][1])) 34 | elif idx == len(pairs): 35 | output_tags.append((pair[0], pair[1] and pairs[idx-1][1])) 36 | else: 37 | output_tags.append((pair[0], pair[1] and 38 | (pairs[idx-1][1] or pairs[idx+1][1]))) 39 | else: 40 | output_tags.append(pair) 41 | idx = idx + 1 42 | return output_tags 43 | 44 | def partition_pairs(pairs): 45 | output_pairs_list = [] 46 | output_pairs = [] 47 | for pair in pairs: 48 | if pair[1]: 49 | output_pairs.append(pair) 50 | else: 51 | if len(output_pairs) > 0: 52 | 
output_pairs_list.append(output_pairs) 53 | output_pairs = [] 54 | return output_pairs_list 55 | 56 | def main(): 57 | ce_words = set() 58 | input = open("cnet_reviews.txt", 'rb') 59 | for line in input: 60 | line = line[:-1] 61 | if len(line.strip()) == 0: 62 | continue 63 | sents = sent_tokenize(line) 64 | for sent in sents: 65 | # print sent 66 | tokens = word_tokenize(sent) 67 | iotags = map(lambda token: iotag(token), tokens) 68 | ce_pairs_list = partition_pairs(modify_tags(zip(tokens, iotags))) 69 | if len(ce_pairs_list) == 0: 70 | continue 71 | for ce_pairs in ce_pairs_list: 72 | print " ".join(map(lambda pair: pair[0], ce_pairs)) 73 | for ce_pair in ce_pairs: 74 | ce_words.add(ce_pair[0]) 75 | input.close() 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /src/cener/cener.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import cPickle as pickle 5 | from cener_lib import * 6 | from nltk.tokenize import sent_tokenize, word_tokenize 7 | 8 | def train_ner(pickle_file): 9 | # initialize 10 | pos_tagger = train_pos_tagger() 11 | ceps = ce_phrases() 12 | cep_words = ce_phrase_words(ceps) 13 | # train classifier 14 | sentfile = open("cnet_reviews_sents.txt", 'rb') 15 | featuresets = [] 16 | for sent in sentfile: 17 | tagged_sent = tag(sent, pos_tagger, ceps, cep_words) 18 | for idx, (word, pos_tag, io_tag) in enumerate(tagged_sent): 19 | featuresets.append((word_features(tagged_sent, idx), io_tag)) 20 | sentfile.close() 21 | split = int(0.9 * len(featuresets)) 22 | # random.shuffle(featuresets) 23 | train_set, test_set = featuresets[0:split], featuresets[split:] 24 | # classifier = nltk.NaiveBayesClassifier.train(train_set) 25 | # classifier = nltk.DecisionTreeClassifier.train(train_set) 26 | classifier = nltk.MaxentClassifier.train(train_set, algorithm="GIS", trace=0) 27 | # evaluate classifier 28 | print "accuracy=", nltk.classify.accuracy(classifier, test_set) 29 | if pickle_file != None: 30 | # pickle classifier 31 | pickled_classifier = open(pickle_file, 'wb') 32 | pickle.dump(classifier, pickled_classifier) 33 | pickled_classifier.close() 34 | return classifier 35 | 36 | def get_trained_ner(pickle_file): 37 | pickled_classifier = open(pickle_file, 'rb') 38 | classifier = pickle.load(pickled_classifier) 39 | pickled_classifier.close() 40 | return classifier 41 | 42 | def test_ner(input_file, classifier): 43 | pos_tagger = train_pos_tagger() 44 | input = open(input_file, 'rb') 45 | for line in input: 46 | line = line[:-1] 47 | if len(line.strip()) == 0: 48 | continue 49 | for sent in sent_tokenize(line): 50 | tokens = word_tokenize(sent) 51 | pos_tagged = pos_tagger.tag(tokens) 52 | io_tags = [] 53 | for idx, (word, pos) in enumerate(pos_tagged): 54 | io_tags.append(classifier.classify(word_features(pos_tagged, idx))) 55 | ner_sent = zip(tokens, io_tags) 56 | print_sent = [] 57 | for token, io_tag in ner_sent: 58 | if io_tag == True: 59 | print_sent.append("" + token + "") 60 | else: 61 | print_sent.append(token) 62 | print " ".join(print_sent) 63 | 64 | input.close() 65 | 66 | def main(): 67 | if len(sys.argv) != 2: 68 | print "Usage ./cener.py [train|test]" 69 | sys.exit(-1) 70 | if sys.argv[1] == "train": 71 | classifier = train_ner("ce_ner_classifier.pkl") 72 | else: 73 | classifier = get_trained_ner("ce_ner_classifier.pkl") 74 | test_ner("test.txt", classifier) 75 | 76 | if __name__ == "__main__": 77 | main() 78 | 
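# Usage sketch (comment added for clarity, not part of the original script):
#
#   ./cener.py train   # trains the MaxEnt classifier on cnet_reviews_sents.txt
#                      # and pickles it to ce_ner_classifier.pkl
#   ./cener.py test    # reloads the pickled classifier and tags the sentences
#                      # in test.txt, printing predicted CE tokens inline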
-------------------------------------------------------------------------------- /src/cener/cener_lib.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from nltk.tokenize import word_tokenize 4 | import re 5 | 6 | def train_pos_tagger(): 7 | """ 8 | Trains a POS tagger with sentences from Penn Treebank 9 | and returns it. 10 | """ 11 | train_sents = treebank.tagged_sents(simplify_tags=True) 12 | tagger = nltk.TrigramTagger(train_sents, backoff= 13 | nltk.BigramTagger(train_sents, backoff= 14 | nltk.UnigramTagger(train_sents, backoff= 15 | nltk.DefaultTagger("NN")))) 16 | return tagger 17 | 18 | def ce_phrases(): 19 | """ 20 | Returns a list of phrases found using bootstrap.py ordered 21 | by number of words descending (so code traversing the list 22 | will encounter the longest phrases first). 23 | """ 24 | def by_phrase_len(x, y): 25 | lx = len(word_tokenize(x)) 26 | ly = len(word_tokenize(y)) 27 | if lx == ly: 28 | return 0 29 | elif lx < ly: 30 | return 1 31 | else: 32 | return -1 33 | ceps = [] 34 | phrasefile = open("ce_phrases.txt", 'rb') 35 | for cep in phrasefile: 36 | ceps.append(cep[:-1]) 37 | phrasefile.close() 38 | return map(lambda phrase: word_tokenize(phrase), 39 | sorted(ceps, cmp=by_phrase_len)) 40 | 41 | def ce_phrase_words(ce_phrases): 42 | """ 43 | Returns a set of words in the ce_phrase list. This is 44 | used to tag words that refer to the NE but does not 45 | have a consistent pattern to match against. 46 | """ 47 | ce_words = set() 48 | for ce_phrase_tokens in ce_phrases: 49 | for ce_word in ce_phrase_tokens: 50 | ce_words.add(ce_word) 51 | return ce_words 52 | 53 | def slice_matches(a1, a2): 54 | """ 55 | Returns True if the two arrays are content wise identical, 56 | False otherwise. 57 | """ 58 | if len(a1) != len(a2): 59 | return False 60 | else: 61 | for i in range(0, len(a1)): 62 | if a1[i] != a2[i]: 63 | return False 64 | return True 65 | 66 | def slots_available(matched_slots, start, end): 67 | """ 68 | Returns True if all the slots in the matched_slots array slice 69 | [start:end] are False, ie, available, else returns False. 70 | """ 71 | return len(filter(lambda slot: slot, matched_slots[start:end])) == 0 72 | 73 | def promote_coreferences(tuple, ce_words): 74 | """ 75 | Sets the io_tag to True if it is not set and if the word is 76 | in the set ce_words. Returns the updated tuple (word, pos, iotag) 77 | """ 78 | return (tuple[0], tuple[1], 79 | True if tuple[2] == False and tuple[0] in ce_words else tuple[2]) 80 | 81 | def tag(sentence, pos_tagger, ce_phrases, ce_words): 82 | """ 83 | Tokenizes the input sentence into words, computes the part of 84 | speech and the IO tag (for whether this word is "in" a CE named 85 | entity or not), and returns a list of (word, pos_tag, io_tag) 86 | tuples. 87 | """ 88 | tokens = word_tokenize(sentence) 89 | # add POS tags using our trained POS Tagger 90 | pos_tagged = pos_tagger.tag(tokens) 91 | # add the IO(not B) tags from the phrases we discovered 92 | # during bootstrap. 
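    # The matching below is greedy and longest-phrase-first: ce_phrases()
    # returns phrases sorted by token count descending, so multi-word product
    # names claim their word slots before shorter phrases can, and
    # slots_available() prevents a span from being re-tagged once any of its
    # positions has already been marked True.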
93 | words = [w for (w, p) in pos_tagged] 94 | pos_tags = [p for (w, p) in pos_tagged] 95 | io_tags = map(lambda word: False, words) 96 | for ce_phrase in ce_phrases: 97 | start = 0 98 | while start < len(words): 99 | end = start + len(ce_phrase) 100 | if slots_available(io_tags, start, end) and \ 101 | slice_matches(words[start:end], ce_phrase): 102 | for j in range(start, end): 103 | io_tags[j] = True 104 | start = end + 1 105 | else: 106 | start = start + 1 107 | # zip the three lists together 108 | pos_io_tagged = map(lambda ((word, pos_tag), io_tag): 109 | (word, pos_tag, io_tag), zip(zip(words, pos_tags), io_tags)) 110 | # "coreference" handling. If a single word is found which is 111 | # contained in the set of words created by our phrases, set 112 | # the IO(not B) tag to True if it is False 113 | return map(lambda tuple: promote_coreferences(tuple, ce_words), 114 | pos_io_tagged) 115 | 116 | shape_A = re.compile("[A-Zbdfhklt0-9#$&/@|]") 117 | shape_x = re.compile("[acemnorsuvwxz]") 118 | shape_i = re.compile("[i]") 119 | shape_g = re.compile("[gpqy]") 120 | shape_j = re.compile("[j]") 121 | 122 | def shape(word): 123 | wbuf = [] 124 | for c in word: 125 | wbuf.append("A" if re.match(shape_A, c) != None 126 | else "x" if re.match(shape_x, c) != None 127 | else "i" if re.match(shape_i, c) != None 128 | else "g" if re.match(shape_g, c) != None 129 | else "j") 130 | return "".join(wbuf) 131 | 132 | def word_features(tagged_sent, wordpos): 133 | return { 134 | "word": tagged_sent[wordpos][0], 135 | "pos": tagged_sent[wordpos][1], 136 | "prevword": "" if wordpos == 0 else tagged_sent[wordpos-1][0], 137 | "prevpos": "" if wordpos == 0 else tagged_sent[wordpos-1][1], 138 | "nextword": "" if wordpos == len(tagged_sent)-1 139 | else tagged_sent[wordpos+1][0], 140 | "nextpos": "" if wordpos == len(tagged_sent)-1 141 | else tagged_sent[wordpos+1][1], 142 | "shape": shape(tagged_sent[wordpos][0]) 143 | } 144 | -------------------------------------------------------------------------------- /src/cener/test.txt: -------------------------------------------------------------------------------- 1 | The wait for a decent LG phone on Verizon is finally over with the Spectrum 2. 2 | 3 | Not only does it run the new(ish) Android 4.0 Ice Cream Sandwich operating system, it also has a screen that doesn't require two hands and a stylus. In addition, it's priced right at the $100 mark, making it one of the more affordable Big Red handsets. 4 | 5 | With its noticeably sectioned back plate and defined edges, the LG Spectrum 2's design looks more thought-out and deliberate than is usual for LG's latest run of devices, save for the high-end Nexus 4 and Optimus G. 6 | 7 | It measures 5.31 inches tall and 2.69 inches wide. At 0.36 inch thick and 5.16 ounces, it's thicker and a bit heavier than most LG handsets I've run into, and it's a tight fit in a small jeans pocket, but it's comfortable when held in the hand or pinned between the cheek and shoulder. 8 | 9 | On the left there are a Micro-USB port and two separate buttons for adjusting the volume. Up top are a 3.5mm headphone jack and a circular sleep/power button, the edges of which light up blue whenever it's pressed. 10 | 11 | The rear of the phone houses an 8-megapixel camera with an LED flash. Though plastic, the black plate is coated with a textured, rubberlike material that feels almost like leather. The cover has two small slits at the bottom for the audio speaker. 
Removing the plate gives access to the 2,150mAh battery, a microSD card slot, and Verizon's 4G LTE SIM card. Directly on the other side of the cover are the NFC antenna and wireless charging coil. 12 | 13 | The 4.7-inch True HD IPS screen is bright and vivid, and texts and icons rendered crisply and clearly. It has the same screen as the unlocked LG Optimus 4X HD, with the same 1,280x720-pixel resolution. Overall, the display is vivid and bright, not to mention responsive to the touch. At the time of the 4X HD review, I was very impressed with the screen. 14 | 15 | However, having now spent time with higher-tier LG devices such as the Nexus 4 and the Optimus G, I noticed that upon closer inspection, the Spectrum 2's display isn't as crystal-clear as the two others. Default wallpapers looked a tad noisy, and gradient patterns appeared streaky, but only by a small margin. Above the screen is a 1.3-megapixel camera and below are four hot keys (back, home, recent apps, and menu) that illuminate in blue when in use. 16 | -------------------------------------------------------------------------------- /src/docsim/blogdoctest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | import scam_dist as scam 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | from sklearn.feature_extraction.text import TfidfTransformer 6 | from sklearn.pipeline import Pipeline 7 | 8 | def main(): 9 | docs = [] 10 | cats = [] 11 | files = ["blogdocs/doc1.txt", "blogdocs/doc2.txt", "blogdocs/doc3.txt"] 12 | for file in files: 13 | f = open(file, 'rb') 14 | body = re.sub("\\s+", " ", " ".join(f.readlines())) 15 | f.close() 16 | docs.append(body) 17 | cats.append("X") 18 | pipeline = Pipeline([ 19 | ("vect", CountVectorizer(min_df=0, stop_words="english")), 20 | ("tfidf", TfidfTransformer(use_idf=False))]) 21 | tdMatrix = pipeline.fit_transform(docs, cats) 22 | testDocs = [] 23 | for i in range(0, tdMatrix.shape[0]): 24 | testDocs.append(np.asarray(tdMatrix[i, :].todense()).reshape(-1)) 25 | scamDist12 = scam.scam_distance(testDocs[0], testDocs[1]) 26 | scamDist23 = scam.scam_distance(testDocs[1], testDocs[2]) 27 | print scamDist12, scamDist23 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /src/docsim/docsim.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | from operator import itemgetter 4 | 5 | import nltk.cluster.util as nltkutil 6 | import numpy as np 7 | import random 8 | import re 9 | import scam_dist as scam 10 | from sklearn.feature_extraction.text import CountVectorizer 11 | from sklearn.feature_extraction.text import TfidfTransformer 12 | from sklearn.pipeline import Pipeline 13 | 14 | def preprocess(fnin, fnout): 15 | fin = open(fnin, 'rb') 16 | fout = open(fnout, 'wb') 17 | buf = [] 18 | id = "" 19 | category = "" 20 | for line in fin: 21 | line = line.strip() 22 | if line.find("-- Document Separator --") > -1: 23 | if len(buf) > 0: 24 | # write out body, 25 | body = re.sub("\s+", " ", " ".join(buf)) 26 | fout.write("%s\t%s\t%s\n" % (id, category, body)) 27 | # process next header and init buf 28 | id, category, rest = map(lambda x: x.strip(), line.split(": ")) 29 | buf = [] 30 | else: 31 | # process body 32 | buf.append(line) 33 | fin.close() 34 | fout.close() 35 | 36 | def train(fnin): 37 | docs = [] 38 | cats = [] 39 | fin = open(fnin, 'rb') 40 | for line 
in fin: 41 | id, category, body = line.strip().split("\t") 42 | docs.append(body) 43 | cats.append(category) 44 | fin.close() 45 | pipeline = Pipeline([ 46 | ("vect", CountVectorizer(min_df=0, stop_words="english")), 47 | ("tfidf", TfidfTransformer(use_idf=False))]) 48 | tdMatrix = pipeline.fit_transform(docs, cats) 49 | return tdMatrix, cats 50 | 51 | def test(tdMatrix, cats, fsim): 52 | testIds = random.sample(range(0, len(cats)), int(0.1 * len(cats))) 53 | testIdSet = set(testIds) 54 | refIds = filter(lambda x: x not in testIdSet, range(0, len(cats))) 55 | sims = np.zeros((len(testIds), len(refIds))) 56 | for i in range(0, len(testIds)): 57 | for j in range(0, len(refIds)): 58 | doc1 = np.asarray(tdMatrix[testIds[i], :].todense()).reshape(-1) 59 | doc2 = np.asarray(tdMatrix[refIds[j], :].todense()).reshape(-1) 60 | sims[i, j] = fsim(doc1, doc2) 61 | for i in range(0, sims.shape[0]): 62 | xsim = list(enumerate(sims[i, :])) 63 | sortedSims = sorted(xsim, key=itemgetter(1), reverse=True)[0:5] 64 | sourceCat = cats[testIds[i]] 65 | numMatchedCats = 0 66 | numTestedCats = 0 67 | for j, score in sortedSims: 68 | targetCat = cats[j] 69 | if sourceCat == targetCat: 70 | numMatchedCats += 1 71 | numTestedCats += 1 72 | print("Test Doc: %d, Source Category: %s, Target Matched: %d/%d times" % 73 | (i, sourceCat, numMatchedCats, numTestedCats)) 74 | 75 | def main(): 76 | preprocess("sugar-coffee-cocoa-docs.txt", "sccpp.txt") 77 | tdMatrix, cats = train("sccpp.txt") 78 | print "Results with Cosine Distance Similarity Measure" 79 | test(tdMatrix, cats, nltkutil.cosine_distance) 80 | print "Results with Euclidean Distance Similarity Measure" 81 | test(tdMatrix, cats, nltkutil.euclidean_distance) 82 | print "Results with SCAM Distance Similarity Measure" 83 | test(tdMatrix, cats, scam.scam_distance) 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /src/docsim/scam_dist.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import scipy.sparse as ss 4 | 5 | def _s_pos_or_zero(x): 6 | return x if x > 0 else 0 7 | 8 | def _s_zero_mask(x, y): 9 | return 0 if y == 0 else x 10 | 11 | def _s_safe_divide(x, y): 12 | return 0 if x == 0 or y == 0 else x / y 13 | 14 | _v_pos_or_zero = np.vectorize(_s_pos_or_zero) 15 | _v_zero_mask = np.vectorize(_s_zero_mask) 16 | _v_safe_divide = np.vectorize(_s_safe_divide) 17 | 18 | def _assymetric_subset_measure(doc1, doc2): 19 | epsilon = np.ones(doc1.shape) * 2 20 | filtered = _v_pos_or_zero(epsilon - (_v_safe_divide(doc1, doc2) + 21 | _v_safe_divide(doc2, doc1))) 22 | zdoc1 = _v_zero_mask(doc1, filtered) 23 | zdoc2 = _v_zero_mask(doc2, filtered) 24 | return np.sum(np.dot(zdoc1, zdoc2)) / np.sum(np.dot(doc1, doc2)) 25 | 26 | def scam_distance(doc1, doc2): 27 | asm12 = _assymetric_subset_measure(doc1, doc2) 28 | asm21 = _assymetric_subset_measure(doc2, doc1) 29 | return max(asm12, asm21) 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/drug_ner/apply_model.py: -------------------------------------------------------------------------------- 1 | from sklearn.externals import joblib 2 | from sklearn.feature_extraction.text import CountVectorizer 3 | from sklearn.svm import LinearSVC 4 | import drug_ner_utils as dnu 5 | import numpy as np 6 | import os 7 | 8 | def vectorize_ngrams(ngrams, vocab): 9 | vec = np.zeros((1, len(vocab))) 10 | for ngram in 
ngrams: 11 | if vocab.has_key(ngram): 12 | vec[0, vocab[ngram]] = 1 13 | return vec 14 | 15 | 16 | X, y, generic_vec = dnu.vectorize("unlabeled.txt", "generic_positive.txt", 100) 17 | y = joblib.load(os.path.join(dnu.DATA_DIR, "y_generic_4.pkl")) 18 | generic_clf = LinearSVC() 19 | generic_clf.fit(X, y) 20 | print("Score for generic classifier: %.3f" % (generic_clf.score(X, y))) 21 | 22 | X, y, brand_vec = dnu.vectorize("unlabeled.txt", "brand_positive.txt", 100) 23 | 24 | y = joblib.load(os.path.join(dnu.DATA_DIR, "y_brand_3.pkl")) 25 | brand_clf = LinearSVC() 26 | brand_clf.fit(X, y) 27 | print("Score for brand classifier: %.3f" % (brand_clf.score(X, y))) 28 | 29 | fraw = open(os.path.join(dnu.DATA_DIR, "raw_data.txt"), 'rb') 30 | i = 0 31 | for line in fraw: 32 | line = line.strip().lower() 33 | annotated = [] 34 | for word in line.split(): 35 | ngrams = dnu.str_to_ngrams(word, dnu.GRAM_SIZE) 36 | Xgen = generic_vec.transform([" ".join(ngrams)]) 37 | Xbrand = brand_vec.transform([" ".join(ngrams)]) 38 | is_generic = generic_clf.predict(Xgen) 39 | is_brand = brand_clf.predict(Xbrand) 40 | if is_generic == 1: 41 | annotated.append("" + word + "") 42 | elif is_brand == 1: 43 | annotated.append("" + word + "") 44 | else: 45 | annotated.append(word) 46 | print("Input: %s" % (line)) 47 | print("Output: %s" % (" ".join(annotated))) 48 | i += 1 49 | if i > 10: 50 | break 51 | fraw.close() 52 | -------------------------------------------------------------------------------- /src/drug_ner/apply_regex_model.py: -------------------------------------------------------------------------------- 1 | from sklearn.externals import joblib 2 | import drug_ner_utils as dnu 3 | import os 4 | 5 | generic_fd = set(dnu.truncate_fd(joblib.load(os.path.join(dnu.DATA_DIR, 6 | "generic_fd.pkl")), 100)) 7 | brand_fd = set(dnu.truncate_fd(joblib.load(os.path.join(dnu.DATA_DIR, 8 | "brand_fd.pkl")), 50)) 9 | 10 | fraw = open(os.path.join(dnu.DATA_DIR, "raw_data.txt"), 'rb') 11 | i = 0 12 | for line in fraw: 13 | line = line.strip().lower() 14 | annotated = [] 15 | for word in line.split(): 16 | ngrams = set(dnu.str_to_ngrams(word, dnu.GRAM_SIZE)) 17 | jc_generic = 1.0 * (len(ngrams.intersection(generic_fd)) / 18 | len(ngrams.union(generic_fd))) 19 | jc_brand = 1.0 * (len(ngrams.intersection(brand_fd)) / 20 | len(ngrams.union(brand_fd))) 21 | print word, jc_generic, jc_brand 22 | is_generic = jc_generic > 0.01 23 | is_brand = jc_brand > 0.01 24 | if is_generic: 25 | annotated.append("%s" % (word)) 26 | elif is_brand: 27 | annotated.append("%s" % (word)) 28 | else: 29 | annotated.append(word) 30 | print("Input: %s" % (line)) 31 | print("Output: %s" % (" ".join(annotated))) 32 | i += 1 33 | if i > 10: 34 | break 35 | fraw.close() 36 | -------------------------------------------------------------------------------- /src/drug_ner/co_train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sklearn.externals import joblib 3 | from sklearn.svm import LinearSVC 4 | import drug_ner_utils as dnu 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import os 8 | 9 | MAX_ITERS = 10 10 | EST_POSITIVE = 0.7 11 | MAX_FEATURES = 3000 12 | 13 | def conservative_min(xs): 14 | # remove outliers 15 | q25, q75 = np.percentile(xs, [25, 75]) 16 | iqr = q75 - q25 17 | lb = q25 - (1.5 * iqr) 18 | ub = q75 + (1.5 * iqr) 19 | xs_con = xs[(xs >= lb) & (xs <= ub)] 20 | return np.min(xs_con) 21 | 22 | 23 | for borg in ["generic", "brand"]: 24 | X, y, vec = 
dnu.vectorize("unlabeled.txt", "%s_positive.txt" % (borg), 25 | MAX_FEATURES) 26 | 27 | y_pos = y[y == 1] 28 | num_positives = [y_pos.shape[0]] 29 | 30 | clf = LinearSVC() 31 | clf.fit(X, y) 32 | 33 | num_iters = 0 34 | while (num_iters < MAX_ITERS): 35 | print("Iteration #%d, #-positive examples: %d" % 36 | (num_iters, num_positives[-1])) 37 | confidence = clf.decision_function(X) 38 | min_pos_confidence = conservative_min(confidence[y_pos]) 39 | y_pos = np.where(confidence >= min_pos_confidence)[0] 40 | # if y_pos.shape[0] <= num_positives[-1]: 41 | # break 42 | num_positives.append(y_pos.shape[0]) 43 | y = np.zeros(y.shape) 44 | y[y_pos] = 1 45 | clf = LinearSVC() 46 | clf.fit(X, y) 47 | joblib.dump(y, os.path.join(dnu.DATA_DIR, "y_%s_%d.pkl" % 48 | (borg, num_iters))) 49 | num_iters += 1 50 | 51 | # visualize output 52 | plt.plot(np.arange(len(num_positives)), num_positives, "b-") 53 | plt.plot(np.arange(len(num_positives)), 54 | X.shape[0] * EST_POSITIVE * np.ones(len(num_positives)), 'r--') 55 | plt.title("Cotraining for %s classifier (%d features)" % 56 | (borg.title(), MAX_FEATURES)) 57 | plt.xlabel("Iterations") 58 | plt.ylabel("#-Positives") 59 | plt.show() 60 | 61 | -------------------------------------------------------------------------------- /src/drug_ner/drug_ner_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from operator import itemgetter 3 | from sklearn.feature_extraction.text import CountVectorizer 4 | import matplotlib.pyplot as plt 5 | import nltk 6 | import numpy as np 7 | import os 8 | import string 9 | 10 | DATA_DIR = "../../data/drug_ner" 11 | GRAM_SIZE = 3 12 | PUNCTS = set([c for c in string.punctuation]) 13 | NUMBERS = set([c for c in "0123456789"]) 14 | 15 | def is_punct(c): 16 | return c in PUNCTS 17 | 18 | def is_number(c): 19 | return c in NUMBERS 20 | 21 | def str_to_ngrams(instring, gram_size): 22 | ngrams = [] 23 | for word in nltk.word_tokenize(instring.lower()): 24 | try: 25 | word = "".join(["S", word, "E"]).encode("utf-8") 26 | cword = [c for c in word if not(is_punct(c) or is_number(c))] 27 | ngrams.extend(["".join(x) for x in nltk.ngrams(cword, gram_size)]) 28 | except UnicodeDecodeError: 29 | pass 30 | return ngrams 31 | 32 | def ngram_distrib(names, gram_size): 33 | tokens = [] 34 | for name in names: 35 | tokens.extend(str_to_ngrams(name, gram_size)) 36 | return nltk.FreqDist(tokens) 37 | 38 | def plot_ngram_distrib(fd, nbest, title, gram_size): 39 | kvs = sorted([(k, fd[k]) for k in fd], key=itemgetter(1), reverse=True)[0:nbest] 40 | ks = [k for k, v in kvs] 41 | vs = [v for k, v in kvs] 42 | plt.plot(np.arange(nbest), vs) 43 | plt.xticks(np.arange(nbest), ks, rotation="90") 44 | plt.title("%d-gram frequency for %s names (Top %d)" % 45 | (gram_size, title, nbest)) 46 | plt.xlabel("%d-grams" % (gram_size)) 47 | plt.ylabel("Frequency") 48 | plt.show() 49 | 50 | def truncate_fd(fd, nbest): 51 | kvs = sorted([(k, fd[k]) for k in fd], key=itemgetter(1), reverse=True)[0:nbest] 52 | return {k:v for k, v in kvs} 53 | 54 | def vectorize(ufile, pfile, max_feats): 55 | text = [] 56 | labels = [] 57 | fno = 0 58 | for fname in [ufile, pfile]: 59 | f = open(os.path.join(DATA_DIR, fname), 'rb') 60 | for line in f: 61 | text.append(line.strip()) 62 | labels.append(fno) 63 | fno = fno + 1 64 | f.close() 65 | vec = CountVectorizer(min_df=0.0, max_features=max_feats, binary=True) 66 | X = vec.fit_transform(text) 67 | y = np.array(labels) 68 | return X, y, vec 69 | 
-------------------------------------------------------------------------------- /src/drug_ner/ngram_convert.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import drug_ner_utils as dnu 3 | import os 4 | 5 | def build_ngram_text(infile, outfile): 6 | fin = open(os.path.join(dnu.DATA_DIR, infile), 'rb') 7 | fout = open(os.path.join(dnu.DATA_DIR, outfile), 'wb') 8 | for line in fin: 9 | for word in line.strip().split(): 10 | ngrams = dnu.str_to_ngrams(word, dnu.GRAM_SIZE) 11 | if len(ngrams) > 0: 12 | fout.write("%s\n" % " ".join(ngrams)) 13 | fin.close() 14 | fout.close() 15 | 16 | 17 | build_ngram_text("generic_names.txt", "generic_positive.txt") 18 | build_ngram_text("brand_names.txt", "brand_positive.txt") 19 | build_ngram_text("raw_data.txt", "unlabeled.txt") 20 | -------------------------------------------------------------------------------- /src/drug_ner/parse_drugbank.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sklearn.externals import joblib 3 | import drug_ner_utils as dnu 4 | import os 5 | import xml.sax 6 | 7 | class DrugXmlContentHandler(xml.sax.ContentHandler): 8 | 9 | def __init__(self): 10 | xml.sax.ContentHandler.__init__(self) 11 | self.tags = [] 12 | self.generic_names = [] 13 | self.brand_names = [] 14 | 15 | def startElement(self, name, attrs): 16 | self.tags.append(name) 17 | 18 | def endElement(self, name): 19 | self.tags.pop() 20 | 21 | def characters(self, content): 22 | breadcrumb = "/".join(self.tags) 23 | if breadcrumb == "drugbank/drug/brands/brand": 24 | self.brand_names.append(content) 25 | if breadcrumb == "drugbank/drug/name": 26 | self.generic_names.append(content) 27 | 28 | def write_list_to_file(lst, filename): 29 | fout = open(os.path.join(dnu.DATA_DIR, filename), 'wb') 30 | for e in lst: 31 | fout.write("%s\n" % (e.encode("utf-8"))) 32 | fout.close() 33 | 34 | 35 | source = open(os.path.join(dnu.DATA_DIR, "drugbank.xml"), 'rb') 36 | handler = DrugXmlContentHandler() 37 | xml.sax.parse(source, handler) 38 | source.close() 39 | 40 | write_list_to_file(handler.generic_names, "generic_names.txt") 41 | write_list_to_file(handler.brand_names, "brand_names.txt") 42 | 43 | generic_fd = dnu.ngram_distrib(handler.generic_names, dnu.GRAM_SIZE) 44 | brand_fd = dnu.ngram_distrib(handler.brand_names, dnu.GRAM_SIZE) 45 | 46 | joblib.dump(generic_fd, os.path.join(dnu.DATA_DIR, "generic_fd.pkl")) 47 | joblib.dump(brand_fd, os.path.join(dnu.DATA_DIR, "brand_fd.pkl")) 48 | 49 | # Plot visualizations 50 | dnu.plot_ngram_distrib(generic_fd, 30, "Generic", dnu.GRAM_SIZE) 51 | dnu.plot_ngram_distrib(brand_fd, 30, "Brand", dnu.GRAM_SIZE) 52 | 53 | -------------------------------------------------------------------------------- /src/entity-graph/01-preprocess-data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import spacy 4 | 5 | DATA_DIR = "../../data/entity-graph" 6 | 7 | TEXT_FILENAME = os.path.join(DATA_DIR, "db-article.txt") 8 | ACRONYMS_FILENAME = os.path.join(DATA_DIR, "db-acronyms.txt") 9 | 10 | SENTENCES_FILENAME = os.path.join(DATA_DIR, "sentences.tsv") 11 | 12 | acronyms_lookup = dict() 13 | facro = open(ACRONYMS_FILENAME, "r") 14 | for line in facro: 15 | acro, full = line.strip().split('\t') 16 | acronyms_lookup[acro] = full 17 | 18 | facro.close() 19 | 20 | lm = spacy.load("en") 21 | 22 | pid, sid = 0, 0 23 | fsents = open(SENTENCES_FILENAME, "w") 24 
| ftext = open(TEXT_FILENAME, "r") 25 | for para in ftext: 26 | para = para.strip() 27 | if len(para) == 0: 28 | continue 29 | for sent in lm(para).sents: 30 | if sid % 100 == 0: 31 | print("Wrote {:d} sents from {:d} paragraphs".format(sid, pid)) 32 | sent_tokens = [] 33 | for token in lm(sent.text): 34 | token_text = token.text 35 | if token_text in acronyms_lookup.keys(): 36 | sent_tokens.append(acronyms_lookup[token_text]) 37 | else: 38 | sent_tokens.append(token_text) 39 | fsents.write("{:d}\t{:d}\t{:s}\n".format(pid, sid, " ".join(sent_tokens))) 40 | sid += 1 41 | pid += 1 42 | 43 | print("Wrote {:d} sents from {:d} paragraphs, COMPLETE".format(sid, pid)) 44 | 45 | ftext.close() 46 | fsents.close() 47 | 48 | -------------------------------------------------------------------------------- /src/entity-graph/02-find-entities.py: -------------------------------------------------------------------------------- 1 | import os 2 | import spacy 3 | 4 | DATA_DIR = "../../data/entity-graph" 5 | SENTENCES_FILENAME = os.path.join(DATA_DIR, "sentences.tsv") 6 | 7 | ENTITIES_FILENAME = os.path.join(DATA_DIR, "entities.tsv") 8 | 9 | nlp = spacy.load("en") 10 | 11 | num_sents, num_ents = 0, 0 12 | fents = open(ENTITIES_FILENAME, "w") 13 | fsent = open(SENTENCES_FILENAME, "r") 14 | for line in fsent: 15 | if num_sents % 100 == 0: 16 | print("{:d} entities found in {:d} sentences".format(num_ents, num_sents)) 17 | pid, sid, sent = line.strip().split('\t') 18 | doc = nlp(sent) 19 | for ent in doc.ents: 20 | fents.write("{:d}\t{:s}\t{:s}\t{:s}\t{:d}\t{:d}\n".format( 21 | int(sid), sent, ent.text, ent.label_, ent.start_char, ent.end_char)) 22 | num_ents += 1 23 | num_sents += 1 24 | 25 | print("{:d} entities found in {:d} sentences, COMPLETE".format(num_ents, num_sents)) 26 | 27 | fsent.close() 28 | fents.close() 29 | -------------------------------------------------------------------------------- /src/entity-graph/04-generate-entity-sets.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import operator 3 | import os 4 | 5 | DATA_DIR = "../../data/entity-graph" 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("-i", "--input", required=True, help="Input file") 9 | parser.add_argument("-o", "--output", required=True, help="Output file") 10 | args = parser.parse_args() 11 | 12 | input_file = args.input 13 | output_file = args.output 14 | 15 | fout = open(os.path.join(DATA_DIR, output_file), "w") 16 | fin = open(os.path.join(DATA_DIR, input_file), "r") 17 | for line in fin: 18 | line = line.strip() 19 | display_name, synonyms = line.split(',', 1) 20 | synonym_list = synonyms.split('|') 21 | synonym_list.append(display_name) 22 | unique_synonyms = sorted(list(set(synonym_list)), key=len, reverse=True) 23 | display_name = unique_synonyms[0] 24 | synonyms = '|'.join(unique_synonyms[1:]) 25 | fout.write("{:s},{:s}\n".format(display_name, synonyms)) 26 | 27 | fin.close() 28 | fout.close() 29 | -------------------------------------------------------------------------------- /src/entity-graph/05-find-corefs.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import os 3 | import spacy 4 | import neuralcoref 5 | import nltk 6 | 7 | PRONOUNS = set(["he", "she", "him", "her", "they", "their", "it"]) 8 | 9 | DATA_DIR = "../../data/entity-graph" 10 | 11 | TEXT_FILENAME = os.path.join(DATA_DIR, "db-article.txt") 12 | SENTENCES_FILENAME = os.path.join(DATA_DIR, "sentences.tsv") 13 | 
COREF_FILENAME = os.path.join(DATA_DIR, "corefs.tsv") 14 | 15 | def get_coref_clusters(ptext, nlp, offset=0): 16 | output_clusters = [] 17 | doc = nlp(ptext) 18 | for coref_cluster in doc._.coref_clusters: 19 | main_text = coref_cluster.main.text 20 | for mention in coref_cluster.mentions: 21 | if nltk.edit_distance(main_text, mention.text) <= 5: 22 | continue 23 | if mention.start_char < offset: 24 | # mentions from previous paragraph, don't report 25 | continue 26 | output_clusters.append((mention.start_char - offset, 27 | mention.end_char - offset, 28 | mention.text, 29 | main_text)) 30 | 31 | return output_clusters 32 | 33 | 34 | def partition_mentions_by_sentence(mentions, ptext, para_id, nlp): 35 | curr_sid = 0 36 | fsent = open(SENTENCES_FILENAME, "r") 37 | for line in fsent: 38 | pid, sid, sent = line.strip().split('\t') 39 | pid, sid = int(pid), int(sid) 40 | if pid == para_id: 41 | curr_sid = sid 42 | break 43 | fsent.close() 44 | partitioned_mentions = [] 45 | sent_bounds = [(sid, s.start_char, s.end_char) for sid, s in enumerate(nlp(ptext).sents)] 46 | for mention in mentions: 47 | m_sid = None 48 | m_start, m_end, m_text, m_main = mention 49 | for sent_bound in sent_bounds: 50 | sid, s_start, s_end = sent_bound 51 | if m_start >= s_start and m_end <= s_end: 52 | m_sid = sid 53 | m_start -= s_start 54 | m_end -= s_start 55 | break 56 | if m_sid is not None: 57 | partitioned_mentions.append((curr_sid + m_sid, m_start, m_end, m_text, m_main)) 58 | return partitioned_mentions 59 | 60 | 61 | nlp = spacy.load("en") 62 | neuralcoref.add_to_pipe(nlp) 63 | 64 | fcoref = open(COREF_FILENAME, "w") 65 | ftext = open(TEXT_FILENAME, "r") 66 | 67 | fcoref.write("sid\tm_start\tm_end\tm_text\tm_main\n") 68 | 69 | prev_ptext = None 70 | curr_pid, curr_sid = 0, 0 71 | for ptext in ftext: 72 | if curr_pid % 100 == 0: 73 | print("{:d} paragraphs processed".format(curr_pid)) 74 | ptext = ptext.strip() 75 | # skip empty lines 76 | if len(ptext) == 0: 77 | continue 78 | # does the text have pronouns 79 | doc = nlp(ptext) 80 | tokens = set([token.text.lower() for token in doc]) 81 | if len(tokens.intersection(PRONOUNS)) == 0: 82 | curr_pid += 1 83 | continue 84 | output_clusters = get_coref_clusters(ptext, nlp) 85 | # if we couldn't find corefs even though we had pronouns lets 86 | # increase the scope to previous paragraph as well 87 | if len(output_clusters) == 0 and prev_ptext is not None: 88 | output_clusters = get_coref_clusters(" ".join([prev_ptext, ptext]), 89 | nlp, offset=len(prev_ptext)+1) 90 | 91 | # partition the list among individual sentences 92 | partitioned_mentions = partition_mentions_by_sentence( 93 | output_clusters, ptext, curr_pid, nlp) 94 | for mention_p in partitioned_mentions: 95 | pm_sid, pm_start, pm_end, pm_text, pm_main = mention_p 96 | fcoref.write("{:d}\t{:d}\t{:d}\t{:s}\t{:s}\n".format( 97 | pm_sid, pm_start, pm_end, pm_text, pm_main)) 98 | 99 | # set previous paragraph (in case needed, see above) 100 | prev_ptext = ptext 101 | curr_pid += 1 102 | 103 | print("{:d} paragraphs processed, COMPLETE".format(curr_pid)) 104 | 105 | ftext.close() 106 | fcoref.close() 107 | -------------------------------------------------------------------------------- /src/entity-graph/06-find-matches.py: -------------------------------------------------------------------------------- 1 | import ahocorasick 2 | import joblib 3 | import operator 4 | import os 5 | 6 | DATA_DIR = "../../data/entity-graph" 7 | ENTITY_FILES = ["org_syns.csv", "person_syns.csv", "gpe_syns.csv"] 8 | 9 | DICT_FILE = 
os.path.join(DATA_DIR, "entities_dict.pkl") 10 | DICT_KEYS_FILE = os.path.join(DATA_DIR, "entities_dict.keys") 11 | 12 | SENTENCES_FILE = os.path.join(DATA_DIR, "sentences.tsv") 13 | COREF_FILE = os.path.join(DATA_DIR, "corefs.tsv") 14 | 15 | MATCHED_ENTITIES_FILE = os.path.join(DATA_DIR, "matched_entities.tsv") 16 | 17 | def build_automaton(): 18 | print("Building automaton...") 19 | if os.path.exists(DICT_FILE): 20 | A = joblib.load(DICT_FILE) 21 | else: 22 | fkeys = open(DICT_KEYS_FILE, "w") 23 | A = ahocorasick.Automaton() 24 | for entity_file in ENTITY_FILES: 25 | entity_type = entity_file.split('_')[0][0:3] 26 | entity_id = 1 27 | fent = open(os.path.join(DATA_DIR, entity_file), "r") 28 | for line in fent: 29 | line = line.strip() 30 | # print("line:", line) 31 | if line.startswith("ent_text_x,synonyms"): 32 | continue 33 | display_name, synonyms = line.split(',', 1) 34 | # print("display_name:", display_name) 35 | # print("synonyms:", synonyms) 36 | if len(synonyms) == 0: 37 | syn_list = [] 38 | else: 39 | syn_list = synonyms.split('|') 40 | # print("syn_list:", syn_list) 41 | syn_list.append(display_name) 42 | unique_syns = list(set(syn_list)) 43 | key = "{:s}{:05d}".format(entity_type[0:3], entity_id) 44 | fkeys.write("{:s}\t{:s}\n".format(key, display_name)) 45 | for syn in unique_syns: 46 | print("...", key, syn) 47 | A.add_word(syn, (key, syn)) 48 | entity_id += 1 49 | A.make_automaton() 50 | fkeys.close() 51 | joblib.dump(A, DICT_FILE) 52 | return A 53 | 54 | 55 | def find_matches(A, sent_text): 56 | matched_ents = [] 57 | for char_end, (eid, ent_text) in A.iter(sent_text): 58 | char_start = char_end - len(ent_text) 59 | matched_ents.append((eid, ent_text, char_start, char_end)) 60 | # remove shorter subsumed matches 61 | longest_matched_ents = [] 62 | for matched_ent in sorted(matched_ents, key=lambda x: len(x[1]), reverse=True): 63 | # print("matched_ent:", matched_ent) 64 | longest_match_exists = False 65 | char_start, char_end = matched_ent[2], matched_ent[3] 66 | for _, _, ref_start, ref_end in longest_matched_ents: 67 | # print("ref_start:", ref_start, "ref_end:", ref_end) 68 | if ref_start <= char_start and ref_end >= char_end: 69 | longest_match_exists = True 70 | break 71 | if not longest_match_exists: 72 | # print("adding match to longest") 73 | longest_matched_ents.append(matched_ent) 74 | return longest_matched_ents 75 | 76 | 77 | def find_corefs(coref_file, sid): 78 | corefs = [] 79 | fcoref = open(coref_file, "r") 80 | for line in fcoref: 81 | if line.startswith("sid"): 82 | continue 83 | line = line.strip() 84 | m_sid, m_start, m_end, m_text, m_main = line.split('\t') 85 | m_sid = int(m_sid) 86 | if m_sid == sid: 87 | corefs.append((int(m_start), int(m_end), m_text, m_main)) 88 | if m_sid > sid: 89 | break 90 | fcoref.close() 91 | return sorted(corefs, key=operator.itemgetter(0), reverse=True) 92 | 93 | 94 | def replace_corefs(sent_text, corefs): 95 | sent_out = sent_text 96 | for start, end, m_text, m_main in corefs: 97 | sent_out = sent_out[0:start] + m_main + sent_out[end:] 98 | return sent_out 99 | 100 | 101 | num_sents, num_ents = 0, 0 102 | A = build_automaton() 103 | 104 | print("Finding entities...") 105 | fents = open(MATCHED_ENTITIES_FILE, "w") 106 | fsent = open(SENTENCES_FILE, "r") 107 | for line in fsent: 108 | if num_sents % 100 == 0: 109 | print("... 
{:d} sentences read, {:d} entities written" 110 | .format(num_sents, num_ents)) 111 | line = line.strip() 112 | pid, sid, sent_text = line.split('\t') 113 | # extract and replace coreferences with main text in sentence 114 | sent_corefs = find_corefs(COREF_FILE, int(sid)) 115 | sent_text = replace_corefs(sent_text, sent_corefs) 116 | # find matches in the coref enhanced sentences 117 | matched_ents = find_matches(A, sent_text) 118 | for eid, ent_text, char_start, char_end in matched_ents: 119 | fents.write("{:s}\t{:s}\t{:s}\t{:s}\t{:d}\t{:d}\n" 120 | .format(pid, sid, eid, ent_text, char_start, char_end)) 121 | num_ents += 1 122 | num_sents += 1 123 | 124 | print("... {:d} sentences read, {:d} entities written, COMPLETE" 125 | .format(num_sents, num_ents)) 126 | 127 | fsent.close() 128 | fents.close() 129 | -------------------------------------------------------------------------------- /src/entity-graph/07-create-graphs.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | import pandas as pd 4 | 5 | DATA_DIR = "../../data/entity-graph" 6 | 7 | DICT_KEYS_FILE = os.path.join(DATA_DIR, "entities_dict.keys") 8 | MATCHED_ENTITIES_FILE = os.path.join(DATA_DIR, "matched_entities.tsv") 9 | 10 | NODE_FILE = os.path.join(DATA_DIR, "neo4j_nodes.csv") 11 | EDGE_FILE = os.path.join(DATA_DIR, "neo4j_edges.csv") 12 | 13 | # generate Nodes CSV 14 | fnodes = open(NODE_FILE, "w") 15 | fnodes.write("eid:ID,ename,:LABEL\n") 16 | fkeys = open(DICT_KEYS_FILE, "r") 17 | for line in fkeys: 18 | eid, ename = line.strip().split('\t') 19 | fnodes.write(','.join([eid, ename, eid[0:3].upper()]) + '\n') 20 | fkeys.close() 21 | fnodes.close() 22 | 23 | # generate Edges CSV 24 | ents_df = pd.read_csv(MATCHED_ENTITIES_FILE, sep='\t', 25 | names=["pid", "sid", "eid", "etext", "estart", "estop"]) 26 | edges_df = ( 27 | ents_df[["sid", "eid"]] # extract (sid, eid) 28 | .groupby("sid")["eid"] # group by sid 29 | .apply(list) # (sid, list[eid, ...]) 30 | .reset_index(name="eids") 31 | ) 32 | # generate entity ID pairs: (sid, list[(eid1, eid2), ...]) 33 | edges_df["eids"] = ( 34 | edges_df["eids"] 35 | .apply(lambda xs: list(set(xs))) 36 | .apply(lambda xs: [x for x in itertools.combinations(xs, 2)]) 37 | ) 38 | # unstack the list of pairs 39 | rows = [] 40 | for row in edges_df.itertuples(): 41 | # note: 1 based because Index is 0 42 | sid = row[1] 43 | for edge in row[2]: 44 | rows.append([edge[0], edge[1], sid]) 45 | edges_df = pd.DataFrame(data=rows, columns=[":START_ID", ":END_ID", "sid"]) 46 | edges_df[":TYPE"] = "REL" 47 | # print(edges_df.head()) 48 | edges_df.to_csv(EDGE_FILE, index=False) 49 | 50 | ############################################################################# 51 | # Load these files into neo4j by doing the following: 52 | # 1. cd $NEO4J_HOME/data/databases 53 | # 2. rm -rf * 54 | # 3. cd $NEO4J_HOME 55 | # 4. bin/neo4j-admin import --nodes=/path/to/neo4j_nodes.csv \ 56 | # --relationships=/path/to/neo4j_edges.csv 57 | # 5. 
bin/neo4j start 58 | ############################################################################# 59 | -------------------------------------------------------------------------------- /src/entity-graph/graph-snapshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujitpal/nltk-examples/6fe64f774fe8d2c97f51f4648de6d1be6e6950af/src/entity-graph/graph-snapshot.png -------------------------------------------------------------------------------- /src/genetagger/file_reformatter.py: -------------------------------------------------------------------------------- 1 | # Reformats supplied input file to form parseable by NLTK corpus readers. 2 | def reformat(file_in, file_out, is_tagged): 3 | fin = open(file_in, 'rb') 4 | fout = open(file_out, 'wb') 5 | sent = [] 6 | for line in fin: 7 | line = line.strip() 8 | if len(line) == 0: 9 | if is_tagged: 10 | fout.write(" ".join(["/".join([word, tag]) for word, tag in sent]) + "\n") 11 | else: 12 | fout.write(" ".join([word for word in sent]) + "\n") 13 | sent = [] 14 | continue 15 | if is_tagged: 16 | word, tag = line.split(" ") 17 | sent.append((word, tag)) 18 | else: 19 | sent.append(line) 20 | fin.close() 21 | fout.close() 22 | 23 | def main(): 24 | reformat("gene.train", "gene.train.blog", True) 25 | reformat("gene.key", "gene.validate.blog", True) 26 | reformat("gene.test", "gene.test.blog", False) 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /src/hangman/game.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import operator 3 | from random import random 4 | 5 | MAX_GUESSES_PER_GAME = 11 6 | MAX_WORDS_IN_SUBSET = 25 7 | 8 | ########################### Preprocessing ############################ 9 | 10 | non_ascii_mappings = list([ 11 | ('á', 'a'), ('å', 'a'), ('ä', 'a'), ('â', 'a'), ('Å', 'a'), ('Ã', 'a'), 12 | ('ç', 'c'), ('¢', 'c'), 13 | ('é', 'e'), ('é', 'e'), ('è', 'e'), ('ê', 'e'), 14 | ('í', 'i'), 15 | ('ñ', 'n'), 16 | ('ó', 'o'), ('ó', 'o'), ('ö', 'o'), ('ô', 'o'), 17 | ('ü', 'u'), ('û', 'u'), 18 | ('´', '\''), ('»', '"') 19 | ]) 20 | 21 | def ascii_fold(s): 22 | for x in non_ascii_mappings: 23 | s = s.replace(x[0], x[1]) 24 | return s 25 | 26 | def preprocess(dictfile): 27 | fwords = open(dictfile, 'rb') 28 | wset = set() 29 | for line in fwords: 30 | word = line.strip().lower() 31 | word = ascii_fold(word) 32 | if word.endswith("'s"): 33 | word = word[:-2] 34 | word = word.replace(" ", "").replace("'", "").replace("\"", "") 35 | wset.add(word) 36 | fwords.close() 37 | return list(wset) 38 | 39 | ############################# Proposer Side ############################# 40 | 41 | def select_secret_word(words): 42 | widx = int(random() * len(words)) 43 | return words[widx] 44 | 45 | def find_all_match_positions(secret_word, guess_char): 46 | positions = [] 47 | curr_pos = 0 48 | while curr_pos < len(secret_word): 49 | curr_pos = secret_word.find(guess_char, curr_pos) 50 | if curr_pos < 0: 51 | break 52 | positions.append(curr_pos) 53 | curr_pos += 1 54 | return positions 55 | 56 | def update_guessed_word(guessed_word, matched_positions, guessed_char): 57 | for pos in matched_positions: 58 | guessed_word[pos] = guessed_char 59 | 60 | def is_solved(guessed_word): 61 | chars_remaining = len(filter(lambda x: x == '_', guessed_word)) 62 | return chars_remaining == 0 63 | 64 | 65 | ############################# Solver Side 
############################### 66 | 67 | letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", 68 | "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"] 69 | 70 | def most_frequent_char(words, previously_guessed): 71 | cmap = {x:0 for x in letters} 72 | for word in words: 73 | cset = set() 74 | for c in word: 75 | if c not in previously_guessed: 76 | cset.add(c) 77 | for c in cset: 78 | cmap[c] += 1 79 | return sorted(map(lambda x: (x, cmap[x]), cmap.keys()), 80 | key=operator.itemgetter(1), reverse=True)[0][0] 81 | 82 | def best_guess(words, word_len, bad_guesses, good_guesses, guessed_word): 83 | temp_words = filter(lambda x: len(x) == word_len, words) 84 | if len(bad_guesses) > 0: 85 | for bad_guess in bad_guesses: 86 | temp_words = filter(lambda x: x.find(bad_guess) == -1, temp_words) 87 | if len(good_guesses) > 0: 88 | for good_guess in good_guesses: 89 | temp_words = filter(lambda x: x.find(good_guess) > -1, temp_words) 90 | previously_guessed = set(filter(lambda x: x != '_', guessed_word)) 91 | return temp_words, most_frequent_char(temp_words, previously_guessed) 92 | 93 | def init_guess(wordlen): 94 | initial_guess = [] 95 | for i in range(wordlen): 96 | initial_guess.append("_") 97 | return initial_guess 98 | 99 | def match_words_against_template(words, guessed_word): 100 | if len(words) > MAX_WORDS_IN_SUBSET: 101 | return words 102 | matched_words = [] 103 | for word in words: 104 | word_chars = [c for c in word] 105 | merged = zip(guessed_word, word_chars) 106 | diff = len(filter(lambda x: x[0] != x[1], 107 | filter(lambda x: x[0] != '_', merged))) 108 | if diff == 0: 109 | matched_words.append(word) 110 | if len(matched_words) > 1: 111 | break 112 | return matched_words 113 | 114 | def replace_guessed_word(guessed_word, matched_word): 115 | matched_chars = [c for c in matched_word] 116 | for i in range(len(matched_chars)): 117 | guessed_word[i] = matched_chars[i] 118 | 119 | ################################# Game ################################### 120 | 121 | def single_round(words, debug=False): 122 | solver_wins = False 123 | secret_word = select_secret_word(words) 124 | if debug: 125 | print "secret word:", secret_word 126 | word_len = len(secret_word) 127 | bad_guesses = set() 128 | good_guesses = set() 129 | guessed_word = init_guess(word_len) 130 | for num_guesses in range(MAX_GUESSES_PER_GAME): 131 | filtered_words, guess_char = best_guess(words, word_len, bad_guesses, 132 | good_guesses, guessed_word) 133 | if debug: 134 | print "guessed char:", guess_char 135 | matched_positions = find_all_match_positions(secret_word, guess_char) 136 | if len(matched_positions) == 0: 137 | bad_guesses.add(guess_char) 138 | else: 139 | good_guesses.add(guess_char) 140 | update_guessed_word(guessed_word, matched_positions, guess_char) 141 | matched_words = match_words_against_template(filtered_words, guessed_word) 142 | if len(matched_words) == 1: 143 | replace_guessed_word(guessed_word, matched_words[0]) 144 | if debug: 145 | print "#", num_guesses, "guess:", " ".join(guessed_word) 146 | if is_solved(guessed_word): 147 | solver_wins = True 148 | break 149 | return len(secret_word), solver_wins, num_guesses 150 | 151 | def multiple_rounds(words, num_games, report_file): 152 | fdata = open(report_file, 'wb') 153 | for i in range(num_games): 154 | word_len, solver_wins, num_guesses = single_round(words, False) 155 | fdata.write("%d,%d,%d\n" % (word_len, 1 if solver_wins else 0, num_guesses)) 156 | fdata.close() 157 | 158 | 
################################# Main ################################### 159 | 160 | words = preprocess("/usr/share/dict/words") 161 | 162 | #single_round(words, True) 163 | 164 | multiple_rounds(words, 10000, "hangman.csv") 165 | -------------------------------------------------------------------------------- /src/hangman/gamestats.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | 6 | df = pd.read_csv("hangman.csv", header=None, 7 | names=["WORD_LEN", "SOLVER_WINS", "NUM_GUESSES"]) 8 | 9 | # wins vs losses 10 | nloss = df[df["SOLVER_WINS"] == 0].count()["SOLVER_WINS"] 11 | nwins = df[df["SOLVER_WINS"] == 1].count()["SOLVER_WINS"] 12 | print "probability of winning=", nwins / (nwins + nloss) 13 | print "probability of losing=", nloss / (nwins + nloss) 14 | 15 | ## probability of winning and losing for different word lengths 16 | df2 = df.drop("NUM_GUESSES", 1) 17 | df2_wins = df2[df2["SOLVER_WINS"] == 1].groupby("WORD_LEN").count().reset_index() 18 | df2_losses = df2[df2["SOLVER_WINS"] == 0].groupby("WORD_LEN").count().reset_index() 19 | df2_losses.rename(columns={"SOLVER_WINS": "SOLVER_LOSES"}, inplace=True) 20 | df2_merged = df2_wins.merge(df2_losses, how="inner", on="WORD_LEN") 21 | df2_merged.plot(kind="bar", stacked=True, x="WORD_LEN", 22 | title="Win/Loss Counts by Word Length") 23 | plt.show() 24 | df2_merged["NUM_GAMES"] = df2_merged["SOLVER_WINS"] + df2_merged["SOLVER_LOSES"] 25 | df2_merged["SOLVER_WINS"] = df2_merged["SOLVER_WINS"] / df2_merged["NUM_GAMES"] 26 | df2_merged["SOLVER_LOSES"] = df2_merged["SOLVER_LOSES"] / df2_merged["NUM_GAMES"] 27 | df2_merged.drop("NUM_GAMES", axis=1, inplace=True) 28 | df2_merged.plot(kind="bar", stacked=True, x="WORD_LEN", 29 | title="Win/Loss Probabilities by Word Length") 30 | plt.show() 31 | 32 | # how number of guesses to win varies with word length (winning games only) 33 | df3 = df[df["SOLVER_WINS"] == 1] 34 | df3.drop("SOLVER_WINS", 1) 35 | df3_grouped = df3.drop("SOLVER_WINS", 1).groupby("WORD_LEN").mean().reset_index() 36 | df3_grouped.plot(kind="bar", x="WORD_LEN", 37 | title="Avg Guesses for Different Word Lengths") 38 | plt.show() 39 | -------------------------------------------------------------------------------- /src/langmodel/med_lang_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import math 4 | import os.path 5 | 6 | import cPickle 7 | import glob 8 | import nltk 9 | from nltk.corpus.reader import XMLCorpusReader 10 | 11 | class LangModel: 12 | def __init__(self, order, alpha, sentences): 13 | self.order = order 14 | self.alpha = alpha 15 | if order > 1: 16 | self.backoff = LangModel(order - 1, alpha, sentences) 17 | self.lexicon = None 18 | else: 19 | self.backoff = None 20 | self.n = 0 21 | self.ngramFD = nltk.FreqDist() 22 | lexicon = set() 23 | for sentence in sentences: 24 | words = nltk.word_tokenize(sentence) 25 | wordNGrams = nltk.ngrams(words, order) 26 | for wordNGram in wordNGrams: 27 | self.ngramFD.inc(wordNGram) 28 | if order == 1: 29 | lexicon.add(wordNGram) 30 | self.n += 1 31 | self.v = len(lexicon) 32 | 33 | def logprob(self, ngram): 34 | return math.log(self.prob(ngram)) 35 | 36 | def prob(self, ngram): 37 | if self.backoff != None: 38 | freq = self.ngramFD[ngram] 39 | backoffFreq = self.backoff.ngramFD[ngram[1:]] 40 | if freq == 0: 41 | return self.alpha * 
self.backoff.prob(ngram[1:]) 42 | else: 43 | return freq / backoffFreq 44 | else: 45 | # laplace smoothing to handle unknown unigrams 46 | return ((self.ngramFD[ngram] + 1) / (self.n + self.v)) 47 | 48 | def train(): 49 | if os.path.isfile("lm.bin"): 50 | return 51 | files = glob.glob("data/*.xml") 52 | sentences = [] 53 | i = 0 54 | for file in files: 55 | if i > 0 and i % 500 == 0: 56 | print("%d/%d files loaded, #-sentences: %d" % 57 | (i, len(files), len(sentences))) 58 | dir, file = file.split("/") 59 | reader = XMLCorpusReader(dir, file) 60 | sentences.extend(nltk.sent_tokenize(" ".join(reader.words()))) 61 | i += 1 62 | lm = LangModel(3, 0.4, sentences) 63 | cPickle.dump(lm, open("lm.bin", "wb")) 64 | 65 | def test(): 66 | lm1 = cPickle.load(open("lm.bin", 'rb')) 67 | testFile = open("sentences.test", 'rb') 68 | for line in testFile: 69 | sentence = line.strip() 70 | print "SENTENCE:", sentence, 71 | words = nltk.word_tokenize(sentence) 72 | wordTrigrams = nltk.trigrams(words) 73 | slogprob = 0 74 | for wordTrigram in wordTrigrams: 75 | logprob = lm1.logprob(wordTrigram) 76 | slogprob += logprob 77 | print "(", slogprob / len(words), ")" 78 | 79 | def main(): 80 | train() 81 | test() 82 | 83 | if __name__ == "__main__": 84 | main() -------------------------------------------------------------------------------- /src/langmodel/old_med_lang_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import glob 4 | import nltk 5 | from nltk.corpus.reader import XMLCorpusReader 6 | from nltk.model.ngram import NgramModel 7 | from nltk.probability import LidstoneProbDist 8 | import cPickle 9 | 10 | def train(): 11 | # parse XML and load up words 12 | print("Loading words from XML files...") 13 | sentences = [] 14 | files = glob.glob("data/*.xml") 15 | i = 0 16 | for file in files: 17 | if i > 0 and i % 500 == 0: 18 | print("%d/%d files loaded, #-sentences: %d" % 19 | (i, len(files), len(sentences))) 20 | break 21 | dir, file = file.split("/") 22 | reader = XMLCorpusReader(dir, file) 23 | sentences.extend(nltk.sent_tokenize(" ".join(reader.words()))) 24 | i += 1 25 | words = [] 26 | for sentence in sentences: 27 | words.append(nltk.word_tokenize(sentence)) 28 | # build a trigram Language Model (using default Good-Turing 29 | # smoothing) with the words array 30 | print("Building language model...") 31 | est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) 32 | langModel = NgramModel(3, words, estimator=est) 33 | # langModel = NgramModel(3, words) 34 | # cPickle.dump(langModel, open("lm.bin", 'wb')) 35 | return langModel 36 | 37 | def test(langModel): 38 | testData = open("sentences.test", 'rb') 39 | for line in testData: 40 | sentence = line.strip() 41 | print "SENTENCE:", sentence, 42 | words = nltk.word_tokenize(sentence) 43 | trigrams = nltk.trigrams(words) 44 | slogprob = 0 45 | for trigram in trigrams: 46 | word = trigram[2] 47 | context = list(trigrams[:-1]) 48 | slogprob += langModel.logprob(word, context) 49 | print "(", slogprob, ")" 50 | testData.close() 51 | 52 | def main(): 53 | langModel = train() 54 | test(langModel) 55 | 56 | if __name__ == "__main__": 57 | main() -------------------------------------------------------------------------------- /src/langmodel/sentences.test: -------------------------------------------------------------------------------- 1 | In biology, immunity is the state of having sufficient biological defences to avoid infection, disease, or other unwanted biological invasion. 
2 | Naturally acquired immunity occurs through contact with a disease causing agent, when the contact was not deliberate, whereas artificially acquired immunity develops only through deliberate actions such as vaccination. 3 | Immunity from prosecution occurs when a prosecutor grants immunity, usually to a witness in exchange for testimony or production of other evidence. 4 | Transactional immunity (colloquially known as "blanket" or "total" immunity) completely protects the witness from future prosecution for crimes related to his or her testimony. 5 | Hearing loss is being partly or totally unable to hear sound in one or both ears. 6 | Conductive hearing loss (CHL) occurs because of a mechanical problem in the outer or middle ear. 7 | Sensorineural hearing loss (SNHL) occurs when the tiny hair cells (nerve endings) that detect sound in the ear are injured, diseased, do not work correctly, or have died. 8 | This type of hearing loss often cannot be reversed. 9 | In law, a hearing is a proceeding before a court or other decision-making body or officer, such as a government agency. 10 | Within some criminal justice systems, a preliminary hearing (evidentiary hearing) is a proceeding, after a criminal complaint has been filed by the prosecutor, to determine whether there is enough evidence to require a trial. 11 | -------------------------------------------------------------------------------- /src/medorleg/README.md: -------------------------------------------------------------------------------- 1 | ##INTRODUCTION 2 | 3 | The motivation for this project is to automatically determine if a sentence is from a medical or a legal genre. This is needed to support a medical litigation decision support system which allows a user to search for documents using concepts rather than terms[1]. Concepts are medical and legal named entities, such as diseases, drugs, organizations, jurisdictions, etc. These entities are extracted during document indexing by looking up words and phrases in each sentence against domain specific taxonomies for each genre. Documents are then annotated with the entity IDs, where they can be discovered during concept search. Unfortunately, this can often produce ambiguous concepts that can mean very different things in the two genres - for example, hearing, period and immunity. This creates a bad search experience. 4 | 5 | To address this, we build a classifier that can classify an incoming sentence into one of the two genres. This classifier acts as a preprocessor that will route the sentence into one of two knowledge bases for entity recognition and annotation. 6 | 7 | ##METHODS 8 | 9 | We build two interpolated trigram language models[2], one for each genre. An interpolated trigram language model approximates the probability of a specific trigram as a linear combination of the frequency of the trigram and the associated bigram and unigram. The interpolated score for each trigram (w1,w2,w3) is given by: 10 | 11 | p(w1,w2,w3|c) = α * p(w1,w2,w3) + β * p(w2,w3) + γ * p(w3) ...(1) 12 | where: 13 | p(w1,w2,w3|c) = C(w1,w2,w3) / Number of trigrams in corpus 14 | p(w1,w2,w3) = C(w1,w2,w3) / C(w2,w3) 15 | p(w2,w3) = C(w2,w3) / C(w3) 16 | p(w3) = (C(w3) + 1) / (N + V) 17 | N = number of terms in corpus 18 | V = number of vocabulary terms 19 | 20 | The unigram probability is smoothed using Laplace smoothing so unseen unigrams in the test set don't result in a zero score for the sentence. 
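For concreteness, a purely hypothetical example: if a trigram occurs 4 times in a genre corpus, its trailing bigram (w2,w3) occurs 20 times, its final unigram w3 occurs 500 times, and the corpus has N = 5,000,000 terms and V = 50,000 vocabulary terms, then

    p(w1,w2,w3) = 4 / 20 = 0.2
    p(w2,w3) = 20 / 500 = 0.04
    p(w3) = (500 + 1) / (5,000,000 + 50,000) ≈ 0.0000992

and the interpolated score for that trigram is α * 0.2 + β * 0.04 + γ * 0.0000992.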
The parameters α, β and γ are learned from the training data using a simple linear regression algorithm. 21 | 22 | The coefficients of the linear model should also satisfy this constraint: 23 | 24 | α + β + γ = 1 ...(2) 25 | 26 | Using the models for the two genres as described above, the probability for an unseen sentence is calculated for each genre, as the joint probability (product) of each of its trigrams. The sentence is classified into the genre which is more probable according to the model. 27 | 28 | ##ANALYSIS 29 | 30 | The model described above was generated and tested as follows: 31 | 32 | 1. Our training set consists of a medical corpus (about 10,000 files from the Gale Medical Encyclopedia[3]), and a legal corpus (about 4,000 files from the Australian Case Law Dataset[4]). The first step (preprocess.py[7]) is to parse the XML files in each corpus into a single flat file of sentences. Each sentence is tagged with a source (M or L). We end up with 950,887 medical sentences and 837,393 legal sentences. 33 | 2. We randomly choose 1,000 sentences from each category (testset_splitter.py[7]) to use as a test set for evaluating our classifier. 34 | 3. Use a MapReduce job (ngram_counting_job.py[7]) using mrjob[5] and NLTK to compute the counts for trigrams, bigrams and unigrams for each sentence. This gives us 3,740,646 medical and 5,092,913 legal count records. 35 | 4. Populate a SQLite3 database with the aggregated counts (db_loader.py[7]). We use SQLite3 as a persistent lookup table for doing probability calculations (regression_data.py[7]) for each trigram, ie finding the p(w) values in equation (1) above. The output of this step is a set of X (variable) and y (outcome) values for the trigrams in each genre. 36 | 5. Train a Linear Regression model for each genre (model_params.py[7]). The coefficients of the Linear model correspond to the unnormalized values of α, β, and γ in equation (2). We normalize both models by dividing the coefficients and the intercept by the sum of the coefficients. 37 | 6. Convert each test sentences into trigrams and compute the joint probability of the trigrams against each model (eval_model.py[7]), and report on overall accuracy of the model. 38 | 39 | ##RESULTS 40 | 41 | The overall accuracy for the classifier was 92.7%. The legal model performed better, correctly classifying 997 of 1,000 legal documents, as opposed to the medical model, which correctly classfied only 857 of 1,000 medical documents. The confusion matrix is shown below: 42 | 43 | M L <-- classified as 44 | 857 143 | M 45 | 3 997 | L 46 | 47 | ##CONCLUSION 48 | 49 | An accuracy of 92.7% is adequate for our purposes, so we can consider putting this model into production. However, its performance can probably be improved through the use of more sophisticated regression algorithms. 50 | 51 | ##REFERENCES 52 | 53 | 1. Concept Search [http://en.wikipedia.org/wiki/Concept_Search, downloaded May 30, 2013]. 54 | 2. Interpolated Language Models from Introduction to Information Retrieval Chapter 12: Language Models for Information Retrieval, by Manning, Schultz, et al [http://nlp.stanford.edu/IR-book/pdf/12lmodel.pdf, downloaded May 30, 2013]. 55 | 3. Gale Encyclopedia of Medicine, 4/Ed (EBook Edition) [http://www.gale.cengage.com/servlet/ItemDetailServlet?region=9&imprint=000&cf=e&titleCode=GEME&type=4&id=259611, downloaded May 31, 2013]. 56 | 4. UCI Machine Learning Repository Legal Case Reports Dataset [http://archive.ics.uci.edu/ml/datasets/Legal+Case+Reports, downloaded May 25, 2013]. 57 | 5. 
MRJob Tutorial [http://www.brianweidenbaum.com/mapreduce-python-mrjob-tutorial/, downloaded May 31, 2013]. 58 | 6. R-Squared Coefficient of Determination [http://en.wikipedia.org/wiki/Coefficient_of_determination, downloaded June 3, 2013]. 59 | 7. Code for the analysis [https://github.com/sujitpal/nltk-examples/tree/master/src/medorleg, uploaded June 3, 2013]. 60 | 8. Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011. 61 | -------------------------------------------------------------------------------- /src/medorleg/db_loader.py: -------------------------------------------------------------------------------- 1 | import sqlite3 as sql 2 | 3 | def is_empty(conn): 4 | cur = conn.cursor() 5 | cur.execute("select name from sqlite_master where type='table'") 6 | rows = cur.fetchall() 7 | return len(rows) == 0 8 | 9 | def create_tables(conn): 10 | if not is_empty(conn): 11 | return 12 | cur = conn.cursor() 13 | cur.executescript(""" 14 | create table m3 (w1 text, w2 text, w3 text, freq integer); 15 | create table m2 (w2 text, w3 text, freq integer); 16 | create table m1 (w3 text, freq integer); 17 | create table l3 (w1 text, w2 text, w3 text, freq integer); 18 | create table l2 (w2 text, w3 text, freq integer); 19 | create table l1 (w3 text, freq integer); 20 | """) 21 | conn.commit() 22 | 23 | def gram_to_list(gram): 24 | return [x[1:-1] for x in gram[1:-1].split(", ")] 25 | 26 | def populate_tables(conn, infn, t3n, t2n, t1n): 27 | cur = conn.cursor() 28 | infile = open(infn, 'rb') 29 | i = 0 30 | for line in infile: 31 | if i % 1000 == 0: 32 | print "Processing %s, line: %d" % (infn, i) 33 | gram, count = line.strip().split("\t") 34 | gramlist = gram_to_list(gram) 35 | if len(gramlist) == 3: 36 | cur.execute("insert into %s(w1,w2,w3,freq)values(?,?,?,?)" % (t3n), 37 | (gramlist[0], gramlist[1], gramlist[2], int(count))) 38 | elif len(gramlist) == 2: 39 | cur.execute("insert into %s(w2,w3,freq)values(?,?,?)" % (t2n), 40 | (gramlist[0], gramlist[1], int(count))) 41 | else: 42 | cur.execute("insert into %s(w3,freq)values(?,?)" % (t1n), 43 | (gramlist[0], int(count))) 44 | i += 1 45 | infile.close() 46 | conn.commit() 47 | 48 | def build_indexes(conn): 49 | print "Building indexes..." 
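    # The unique indexes on the n-gram columns are what make the per-trigram
    # frequency lookups in regression_data.py and eval_model.py practical;
    # without them each probability calculation would require a full table scan.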
50 | cur = conn.cursor() 51 | cur.executescript(""" 52 | create unique index ix_m3 on m3(w1,w2,w3); 53 | create unique index ix_m2 on m2(w2,w3); 54 | create unique index ix_m1 on m1(w3); 55 | create unique index ix_l3 on l3(w1,w2,w3); 56 | create unique index ix_l2 on l2(w2,w3); 57 | create unique index ix_l1 on l1(w3); 58 | """) 59 | conn.commit() 60 | 61 | def main(): 62 | conn = sql.connect("data/db/ngram_freqs.db") 63 | create_tables(conn) 64 | populate_tables(conn, "data/counts/medical_count.txt", "m3", "m2", "m1") 65 | populate_tables(conn, "data/counts/legal_count.txt", "l3", "l2", "l1") 66 | build_indexes(conn) 67 | conn.close() 68 | 69 | if __name__ == "__main__": 70 | main() -------------------------------------------------------------------------------- /src/medorleg/eval_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import math 4 | import sqlite3 as sql 5 | 6 | import cPickle as pickle 7 | import nltk 8 | import numpy as np 9 | import string 10 | 11 | def normalize_numeric(x): 12 | xc = x.translate(string.maketrans("", ""), string.punctuation) 13 | return "_NNN_" if xc.isdigit() else x 14 | 15 | def normalize_stopword(x, stopwords): 16 | return "_SSS_" if str(x) in stopwords else x 17 | 18 | def get_trigrams(sentence, stopwords, porter): 19 | words = nltk.word_tokenize(sentence) 20 | words = [word.lower() for word in words] 21 | words = [normalize_numeric(word) for word in words] 22 | words = [normalize_stopword(word, stopwords) for word in words] 23 | words = [porter.stem(word) for word in words] 24 | return nltk.trigrams(words) 25 | 26 | def get_base_counts(conn, morl): 27 | cur = conn.cursor() 28 | cur.execute("select count(*), sum(freq) from %s1" % (morl)) 29 | v, n = cur.fetchall()[0] 30 | return n, v 31 | 32 | def load_model_coeffs(model): 33 | norm = np.sum(model.coef_) 34 | return np.array([model.coef_[0] / norm, 35 | model.coef_[1] / norm, 36 | model.coef_[2] / norm, 37 | model.intercept_ / norm]) 38 | 39 | def calc_prob(trigrams, conn, coeffs, morl, n, v): 40 | joint_log_prob = 0.0 41 | cur = conn.cursor() 42 | for trigram in trigrams: 43 | cur.execute("select freq from %s3 where w1 = ? and w2 = ? and w3 = ?" 44 | % (morl), trigram) 45 | rows = cur.fetchall() 46 | freq3 = 0 if len(rows) == 0 else rows[0][0] 47 | cur.execute("select freq from %s2 where w2 = ? and w3 = ?" % 48 | (morl), trigram[1:]) 49 | rows = cur.fetchall() 50 | freq2 = 0 if len(rows) == 0 else rows[0][0] 51 | cur.execute("select freq from %s1 where w3 = ?" % (morl), trigram[2:]) 52 | rows = cur.fetchall() 53 | freq1 = 0 if len(rows) == 0 else rows[0][0] 54 | freqs = np.array([ 55 | 0 if freq3 == 0 else freq3 / freq2, 56 | 0 if freq2 == 0 else freq2 / freq1, 57 | 0 if freq1 == 0 else (freq1 + 1) / (n + v), 58 | 1.0]) 59 | joint_log_prob += math.log(1 + np.dot(coeffs, freqs)) 60 | return joint_log_prob 61 | 62 | def eval_model(medmodelfn, legmodelfn, testfn, stopwords, porter, conn): 63 | pos = {"M": 0, "L": 1} 64 | stats = np.zeros((2, 2)) 65 | med_params = load_model_coeffs(pickle.load(open(medmodelfn, 'rb'))) 66 | leg_params = load_model_coeffs(pickle.load(open(legmodelfn, 'rb'))) 67 | mn, mv = get_base_counts(conn, "m") 68 | ln, lv = get_base_counts(conn, "l") 69 | testfile = open(testfn, 'rb') 70 | i = 0 71 | for line in testfile: 72 | if i % 100 == 0: 73 | print "Tested %d/1000 test cases..." 
% (i) 74 | i += 1 75 | cols = line.strip().split("|") 76 | trigrams = get_trigrams(cols[1], stopwords, porter) 77 | med_prob = calc_prob(trigrams, conn, med_params, "m", mn, mv) 78 | leg_prob = calc_prob(trigrams, conn, leg_params, "l", ln, lv) 79 | ytruth = cols[0] 80 | ypred = "M" if med_prob > leg_prob else "L" 81 | print "...", i, ytruth, ypred 82 | stats[pos[ytruth], pos[ypred]] += 1 83 | return stats 84 | 85 | def calc_acc(stats): 86 | return np.sum(np.diag(stats)) / np.sum(stats) 87 | 88 | def main(): 89 | stopwords = nltk.corpus.stopwords.words("english") 90 | porter = nltk.PorterStemmer() 91 | conn = sql.connect("data/db/ngram_freqs.db") 92 | med_stats = eval_model("data/regdata/medical.pkl", 93 | "data/regdata/legal.pkl", "data/sentences/medical_test.txt", 94 | stopwords, porter, conn) 95 | print "confusion matrix (med), acc=", calc_acc(med_stats) 96 | print med_stats 97 | leg_stats = eval_model("data/regdata/medical.pkl", 98 | "data/regdata/legal.pkl", "data/sentences/legal_test.txt", 99 | stopwords, porter, conn) 100 | print "confusion matrix (leg), acc=", calc_acc(leg_stats) 101 | print leg_stats 102 | merged_stats = med_stats + leg_stats 103 | print "confusion matrix (merged), acc=", calc_acc(merged_stats) 104 | print merged_stats 105 | conn.close() 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /src/medorleg/model_params.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import cPickle as pickle 4 | import numpy as np 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.metrics import r2_score 7 | 8 | def train(prefix): 9 | X = np.exp(np.loadtxt(prefix + "_X.txt")) 10 | y = np.exp(np.loadtxt(prefix + "_y.txt")) 11 | model = LinearRegression(fit_intercept=True) 12 | model.fit(X, y) 13 | ypred = model.predict(X) 14 | print prefix, model.coef_, model.intercept_, r2_score(y, ypred) 15 | pickle.dump(model, open(prefix + ".pkl", 'wb')) 16 | 17 | def main(): 18 | train("data/regdata/medical") 19 | train("data/regdata/legal") 20 | 21 | if __name__ == "__main__": 22 | main() 23 | -------------------------------------------------------------------------------- /src/medorleg/ngram_counting_job.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from mrjob.job import MRJob 3 | import nltk 4 | import string 5 | 6 | class NGramCountingJob(MRJob): 7 | 8 | def mapper_init(self): 9 | # self.stopwords = nltk.corpus.stopwords.words("english") 10 | self.stopwords = set(['i', 'me', 'my', 'myself', 'we', 11 | 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 12 | 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 13 | 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 14 | 'they', 'them', 'their', 'theirs', 'themselves', 'what', 15 | 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 16 | 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 17 | 'have', 'has', 'had', 'having', 'do', 'does', 'did', 18 | 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 19 | 'because', 'as', 'until', 'while', 'of', 'at', 'by', 20 | 'for', 'with', 'about', 'against', 'between', 'into', 21 | 'through', 'during', 'before', 'after', 'above', 'below', 22 | 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 23 | 'over', 'under', 'again', 'further', 'then', 'once', 24 | 'here', 'there', 'when', 'where', 'why', 'how', 'all', 25 | 'any', 
'both', 'each', 'few', 'more', 'most', 'other', 26 | 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 27 | 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 28 | 'just', 'don', 'should', 'now']) 29 | self.porter = nltk.PorterStemmer() 30 | 31 | def mapper(self, key, value): 32 | 33 | def normalize_numeric(x): 34 | xc = x.translate(string.maketrans("", ""), string.punctuation) 35 | return "_NNN_" if xc.isdigit() else x 36 | 37 | def normalize_stopword(x): 38 | return "_SSS_" if str(x) in self.stopwords else x 39 | 40 | cols = value.split("|") 41 | words = nltk.word_tokenize(cols[1]) 42 | # normalize number and stopwords and stem remaining words 43 | words = [word.lower() for word in words] 44 | words = [normalize_numeric(word) for word in words] 45 | words = [normalize_stopword(word) for word in words] 46 | words = [self.porter.stem(word) for word in words] 47 | trigrams = nltk.trigrams(words) 48 | for trigram in trigrams: 49 | yield (trigram, 1) 50 | bigram = trigram[1:] 51 | yield (bigram, 1) 52 | unigram = bigram[1:] 53 | yield (unigram, 1) 54 | 55 | def reducer(self, key, values): 56 | yield (key, sum([value for value in values])) 57 | 58 | if __name__ == "__main__": 59 | NGramCountingJob.run() 60 | -------------------------------------------------------------------------------- /src/medorleg/preprocess.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujitpal/nltk-examples/6fe64f774fe8d2c97f51f4648de6d1be6e6950af/src/medorleg/preprocess.py -------------------------------------------------------------------------------- /src/medorleg/regression_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import math 4 | import sqlite3 as sql 5 | 6 | def get_base_counts(conn, morl): 7 | cur = conn.cursor() 8 | cur.execute("select count(*), sum(freq) from %s1" % (morl)) 9 | v, n = cur.fetchall()[0] 10 | return n, v 11 | 12 | def gram_to_list(gram): 13 | return [x[1:-1] for x in gram[1:-1].split(", ")] 14 | 15 | def build_regdata(conn, morl, infn, outX, outY): 16 | cur = conn.cursor() 17 | infile = open(infn, 'rb') 18 | xfile = open(outX, 'wb') 19 | yfile = open(outY, 'wb') 20 | n, v = get_base_counts(conn, morl) 21 | i = 0 22 | for line in infile: 23 | gram, freq = line.strip().split("\t") 24 | gramlist = gram_to_list(gram) 25 | if len(gramlist) == 3: 26 | cur.execute("select freq from %s3 where w1 = ? and w2 = ? and w3 = ?" 27 | % (morl), gramlist) 28 | rows = cur.fetchall() 29 | freq3 = 0 if len(rows) == 0 else rows[0][0] 30 | cur.execute("select freq from %s2 where w2 = ? and w3 = ?" % 31 | (morl), gramlist[1:]) 32 | rows = cur.fetchall() 33 | freq2 = 0 if len(rows) == 0 else rows[0][0] 34 | cur.execute("select freq from %s1 where w3 = ?" 
% (morl), gramlist[2:]) 35 | rows = cur.fetchall() 36 | freq1 = 0 if len(rows) == 0 else rows[0][0] 37 | y = math.log(freq3) - math.log(n) 38 | x0 = math.log(freq3) - math.log(freq2) 39 | x1 = math.log(freq2) - math.log(freq1) 40 | x2 = math.log(freq1 + 1) / math.log(n + v) 41 | print morl, x0, x1, x2, y 42 | xfile.write("%s %s %s\n" % (x0, x1, x2)) 43 | yfile.write("%s\n" % (y)) 44 | infile.close() 45 | xfile.close() 46 | yfile.close() 47 | 48 | def main(): 49 | conn = sql.connect("data/db/ngram_freqs.db") 50 | build_regdata(conn, "m", "data/counts/medical_count.txt", 51 | "data/regdata/medical_X.txt", "data/regdata/medical_y.txt") 52 | build_regdata(conn, "l", "data/counts/legal_count.txt", 53 | "data/regdata/legal_X.txt", "data/regdata/legal_y.txt") 54 | conn.close() 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /src/medorleg/testset_splitter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def main(): 4 | filelines = { 5 | "data/sentences/medical.txt": 950887, 6 | "data/sentences/legal.txt": 837393 7 | } 8 | for file, lines in filelines.items(): 9 | test_ids = sorted([int(lines * x) for x in np.random.random((1000,))]) 10 | fn = file.split(".")[0] 11 | input_file = open(file, 'rb') 12 | train_file = open(fn + "_train.txt", 'wb') 13 | test_file = open(fn + "_test.txt", 'wb') 14 | curr_line = 0 15 | curr_pos = 0 16 | for line in input_file: 17 | if curr_pos < 1000 and curr_line == test_ids[curr_pos]: 18 | test_file.write(line) 19 | curr_pos += 1 20 | else: 21 | train_file.write(line) 22 | curr_line += 1 23 | input_file.close() 24 | train_file.close() 25 | test_file.close() 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /src/medorleg2/arffwriter.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import numpy as np 3 | import operator 4 | 5 | def qq(s): 6 | return "'" + s + "'" 7 | 8 | def save_arff(X, y, vocab, fname): 9 | aout = open(fname, 'wb') 10 | # header 11 | aout.write("@relation %s\n\n" % 12 | (os.path.basename(fname).split(".")[0])) 13 | # input variables 14 | for term in vocab: 15 | aout.write("@attribute \"%s\" numeric\n" % (term)) 16 | # target variable 17 | aout.write("@attribute target_var {%s}\n" % 18 | (",".join([qq(str(int(e))) for e in list(np.unique(y))]))) 19 | # data 20 | aout.write("\n@data\n") 21 | for row in range(0, X.shape[0]): 22 | rdata = X.getrow(row) 23 | idps = sorted(zip(rdata.indices, rdata.data), key=operator.itemgetter(0)) 24 | if len(idps) > 0: 25 | aout.write("{%s,%d '%d'}\n" % ( 26 | ",".join([" ".join([str(idx), str(dat)]) for (idx,dat) in idps]), 27 | X.shape[1], int(y[row]))) 28 | aout.close() 29 | -------------------------------------------------------------------------------- /src/medorleg2/arffwriter_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import operator 3 | 4 | from arffwriter import save_arff 5 | import datetime 6 | import numpy as np 7 | from sklearn.cross_validation import train_test_split 8 | from sklearn.feature_extraction.text import CountVectorizer 9 | from sklearn.feature_extraction.text import TfidfTransformer 10 | from sklearn.pipeline import Pipeline 11 | 12 | def load_xy(xfile, yfile): 13 | pipeline = Pipeline([ 14 | ("count", CountVectorizer(stop_words='english', 
min_df=0.0, 15 | binary=False)), 16 | ("tfidf", TfidfTransformer(norm="l2")) 17 | ]) 18 | xin = open(xfile, 'rb') 19 | X = pipeline.fit_transform(xin) 20 | xin.close() 21 | yin = open(yfile, 'rb') 22 | y = np.loadtxt(yin) 23 | yin.close() 24 | vocab_map = pipeline.steps[0][1].vocabulary_ 25 | vocab = [x[0] for x in sorted([(x, vocab_map[x]) 26 | for x in vocab_map], 27 | key=operator.itemgetter(1))] 28 | return X, y, vocab 29 | 30 | def print_timestamp(message): 31 | print message, datetime.datetime.now() 32 | 33 | def main(): 34 | if len(sys.argv) != 5: 35 | print "Usage: arffwriter_test Xfile yfile trainARFF testARFF" 36 | sys.exit(-1) 37 | print_timestamp("started:") 38 | X, y, vocab = load_xy(sys.argv[1], sys.argv[2]) 39 | Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, 40 | test_size=0.1, random_state=42) 41 | save_arff(Xtrain, ytrain, vocab, sys.argv[3]) 42 | save_arff(Xtest, ytest, vocab, sys.argv[4]) 43 | print_timestamp("finished:") 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /src/medorleg2/classify.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | from __future__ import division 3 | 4 | import sys 5 | 6 | import cPickle as pickle 7 | import datetime 8 | import nltk 9 | import numpy as np 10 | from sklearn.cross_validation import KFold 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.feature_extraction.text import CountVectorizer 13 | from sklearn.feature_extraction.text import TfidfTransformer 14 | from sklearn.metrics import accuracy_score 15 | from sklearn.metrics import classification_report 16 | from sklearn.metrics import confusion_matrix 17 | from sklearn.pipeline import Pipeline 18 | from sklearn.svm import LinearSVC 19 | 20 | # total number of sentences (combined) 21 | NTOTAL = 1788280 22 | 23 | def calc_ngrams(line): 24 | """ Converts line into a list of trigram tokens """ 25 | words = nltk.word_tokenize(line.lower()) 26 | word_str = " ".join(words) 27 | bigrams = nltk.bigrams(words) 28 | bigram_str = " ".join(["0".join(bigram) for bigram in bigrams]) 29 | trigrams = nltk.trigrams(words) 30 | trigram_str = " ".join(["0".join(trigram) for trigram in trigrams]) 31 | return " ".join([word_str, bigram_str, trigram_str]) 32 | 33 | def generate_xy(texts, labels): 34 | ftext = open(texts, 'rb') 35 | pipeline = Pipeline([ 36 | ("count", CountVectorizer(stop_words='english', min_df=0.0, 37 | # max_features=10000, 38 | binary=False)), 39 | ("tfidf", TfidfTransformer(norm="l2")) 40 | ]) 41 | # X = pipeline.fit_transform(map(lambda line: calc_ngrams(line), ftext)) 42 | X = pipeline.fit_transform(ftext) 43 | ftext.close() 44 | flabel = open(labels, 'rb') 45 | y = np.loadtxt(flabel) 46 | flabel.close() 47 | return X, y 48 | 49 | def crossvalidate_model(X, y, nfolds): 50 | kfold = KFold(X.shape[0], n_folds=nfolds) 51 | avg_accuracy = 0 52 | for train, test in kfold: 53 | Xtrain, Xtest, ytrain, ytest = X[train], X[test], y[train], y[test] 54 | clf = LinearSVC() 55 | clf.fit(Xtrain, ytrain) 56 | ypred = clf.predict(Xtest) 57 | accuracy = accuracy_score(ytest, ypred) 58 | print "...accuracy = ", accuracy 59 | avg_accuracy += accuracy 60 | print "Average Accuracy: ", (avg_accuracy / nfolds) 61 | 62 | def train_model(X, y, binmodel): 63 | model = LinearSVC() 64 | model.fit(X, y) 65 | # reports 66 | ypred = model.predict(X) 67 | print "Confusion Matrix (Train):" 68 | print confusion_matrix(y, ypred) 69 | 
print "Classification Report (Train)" 70 | print classification_report(y, ypred) 71 | pickle.dump(model, open(binmodel, 'wb')) 72 | 73 | def test_model(X, y, binmodel): 74 | model = pickle.load(open(binmodel, 'rb')) 75 | if y is not None: 76 | # reports 77 | ypred = model.predict(X) 78 | print "Confusion Matrix (Test)" 79 | print confusion_matrix(y, ypred) 80 | print "Classification Report (Test)" 81 | print classification_report(y, ypred) 82 | 83 | def print_timestamp(message): 84 | print message, datetime.datetime.now() 85 | 86 | def usage(): 87 | print "Usage: python classify.py [xval|test|train]" 88 | sys.exit(-1) 89 | 90 | def main(): 91 | if len(sys.argv) != 2: 92 | usage() 93 | print_timestamp("started:") 94 | X, y = generate_xy("data/sentences.txt", "data/labels.txt") 95 | if sys.argv[1] == "xval": 96 | crossvalidate_model(X, y, 10) 97 | elif sys.argv[1] == "run": 98 | Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, 99 | test_size=0.1, random_state=42) 100 | train_model(Xtrain, ytrain, "data/model.bin") 101 | test_model(Xtest, ytest, "data/model.bin") 102 | else: 103 | usage() 104 | print_timestamp("finished:") 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /src/medorleg2/preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Code to convert from XML format to a file of sentences for 3 | # each genre, one sentence per line. 4 | from __future__ import division 5 | import glob 6 | import nltk 7 | import re 8 | import unicodedata 9 | from xml.dom.minidom import Node 10 | from xml.dom.minidom import parseString 11 | 12 | def medical_plaintext(fn): 13 | print "processing", fn 14 | if not (fn.startswith("data/medical/eph_") or 15 | fn.startswith("data/medical/gemd_") or 16 | fn.startswith("data/medical/gesu_") or 17 | fn.startswith("data/medical/gea2_") or 18 | fn.startswith("data/medical/gem_") or 19 | fn.startswith("data/medical/gech_") or 20 | fn.startswith("data/medical/geca_") or 21 | fn.startswith("data/medical/gecd_") or 22 | fn.startswith("data/medical/gegd_") or 23 | fn.startswith("data/medical/gend_") or 24 | fn.startswith("data/medical/gec_") or 25 | fn.startswith("data/medical/genh_") or 26 | fn.startswith("data/medical/nwaz_")): 27 | return "" 28 | file = open(fn, 'rb') 29 | data = file.read() 30 | file.close() 31 | # remove gale: namespace from attributes 32 | data = re.sub("gale:", "", data) 33 | dom = parseString(data) 34 | text = "" 35 | paragraphs = dom.getElementsByTagName("p") 36 | for paragraph in paragraphs: 37 | xml = paragraph.toxml() 38 | xml = re.sub("\n", " ", xml) 39 | xml = re.sub("<.*?>", "", xml) 40 | text = text + " " + xml 41 | text = re.sub("\\s+", " ", text) 42 | text = text.strip() 43 | text = text.encode("ascii", "ignore") 44 | return text 45 | 46 | def legal_plaintext(fn): 47 | print "processing", fn 48 | file = open(fn, 'rb') 49 | data = file.read() 50 | data = re.sub("é", "e", data) 51 | data = re.sub("á", "a", data) 52 | data = re.sub("ý", "y", data) 53 | data = re.sub(" ", " ", data) 54 | data = re.sub("&tm;", "(TM)", data) 55 | data = re.sub("®", "(R)", data) 56 | data = re.sub("à", "a", data) 57 | data = re.sub("è", "e", data) 58 | data = re.sub("ì", "i", data) 59 | data = re.sub("ê", "e", data) 60 | data = re.sub("ô", "o", data) 61 | data = re.sub("î", "i", data) 62 | data = re.sub("ç", "c", data) 63 | data = re.sub("&", "and", data) 64 | data = re.sub("ä", "a", data) 65 | 
data = re.sub("ß", "ss", data) 66 | data = re.sub("æ", "e", data) 67 | data = re.sub("ï", "i", data) 68 | data = re.sub("ë", "e", data) 69 | data = re.sub("ö", "o", data) 70 | data = re.sub("ü", "u", data) 71 | data = re.sub("â", "a", data) 72 | data = re.sub("ø", "o", data) 73 | data = re.sub("ñ", "n", data) 74 | data = re.sub("É", "E", data) 75 | data = re.sub("Å", "A", data) 76 | data = re.sub("Ö", "O", data) 77 | data = unicodedata.normalize("NFKD", 78 | unicode(data, 'iso-8859-1')).encode("ascii", "ignore") 79 | # fix "id=xxx" pattern, causes XML parsing to fail 80 | data = re.sub("\"id=", "id=\"", data) 81 | file.close() 82 | text = "" 83 | dom = parseString(data) 84 | sentencesEl = dom.getElementsByTagName("sentences")[0] 85 | for sentenceEl in sentencesEl.childNodes: 86 | if sentenceEl.nodeType == Node.ELEMENT_NODE: 87 | stext = sentenceEl.firstChild.data 88 | if len(stext.strip()) == 0: 89 | continue 90 | text = text + " " + re.sub("\n", " ", stext) 91 | text = re.sub("\\s+", " ", text) 92 | text = text.strip() 93 | text = text.encode("ascii", "ignore") 94 | return text 95 | 96 | def parse_to_plaintext(dirs, labels, funcs, sent_file, label_file): 97 | fsent = open(sent_file, 'wb') 98 | flabs = open(label_file, 'wb') 99 | idx = 0 100 | for dir in dirs: 101 | files = glob.glob("/".join([dir, "*.xml"])) 102 | for file in files: 103 | text = funcs[idx](file) 104 | if len(text.strip()) > 0: 105 | for sentence in nltk.sent_tokenize(text): 106 | fsent.write("%s\n" % sentence) 107 | flabs.write("%d\n" % labels[idx]) 108 | idx += 1 109 | fsent.close() 110 | flabs.close() 111 | 112 | def main(): 113 | parse_to_plaintext(["data/medical", "data/legal"], 114 | [1, 0], [medical_plaintext, legal_plaintext], 115 | "data/sentences.txt", "data/labels.txt") 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /src/phrases/interesting_phrases.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import operator 3 | import nltk 4 | import numpy as np 5 | from scipy.stats import binom 6 | import string 7 | 8 | def isValid(word): 9 | if word.startswith("#"): 10 | return False # no hashtag 11 | else: 12 | vword = word.translate(string.maketrans("", ""), string.punctuation) 13 | return len(vword) == len(word) 14 | 15 | def llr(c1, c2, c12, n): 16 | # H0: Independence p(w1,w2) = p(w1,~w2) = c2/N 17 | p0 = c2 / n 18 | # H1: Dependence, p(w1,w2) = c12/N 19 | p10 = c12 / n 20 | # H1: p(~w1,w2) = (c2-c12)/N 21 | p11 = (c2 - c12) / n 22 | # binomial probabilities 23 | # H0: b(c12; c1, p0), b(c2-c12; N-c1, p0) 24 | # H1: b(c12, c1, p10), b(c2-c12; N-c1, p11) 25 | probs = np.matrix([ 26 | [binom(c1, p0).logpmf(c12), binom(n - c1, p0).logpmf(c2 - c12)], 27 | [binom(c1, p10).logpmf(c12), binom(n - c1, p11).logpmf(c2 - c12)]]) 28 | # LLR = p(H1) / p(H0) 29 | return np.sum(probs[1, :]) - np.sum(probs[0, :]) 30 | 31 | def isLikelyNGram(ngram, phrases): 32 | if len(ngram) == 2: 33 | return True 34 | prevGram = ngram[:-1] 35 | return phrases.has_key(prevGram) 36 | 37 | def main(): 38 | # accumulate words and word frequency distributions 39 | lines = [] 40 | unigramFD = nltk.FreqDist() 41 | fin = open("twitter_messages.txt", 'rb') 42 | i = 0 43 | for line in fin: 44 | i += 1 45 | words = nltk.word_tokenize(line.strip().lower()) 46 | words = filter(lambda x: isValid(x), words) 47 | [unigramFD.inc(x) for x in words] 48 | lines.append(words) 49 | if i > 1000: 50 | break 
51 | fin.close() 52 | # identify likely phrases using a multi-pass algorithm based 53 | # on the LLR approach described in the Building Search Applications 54 | # Lucene, LingPipe and GATE book, except that we treat n-gram 55 | # collocations beyond 2 as n-1 gram plus a unigram. 56 | phrases = nltk.defaultdict(float) 57 | prevGramFD = None 58 | for i in range(2, 5): 59 | ngramFD = nltk.FreqDist() 60 | for words in lines: 61 | nextGrams = nltk.ngrams(words, i) 62 | nextGrams = filter(lambda x: isLikelyNGram(x, phrases), nextGrams) 63 | [ngramFD.inc(x) for x in nextGrams] 64 | for k, v in ngramFD.iteritems(): 65 | if v > 1: 66 | c1 = unigramFD[k[0]] if prevGramFD == None else prevGramFD[k[:-1]] 67 | c2 = unigramFD[k[1]] if prevGramFD == None else unigramFD[k[len(k) - 1]] 68 | c12 = ngramFD[k] 69 | n = unigramFD.N() if prevGramFD == None else prevGramFD.N() 70 | phrases[k] = llr(c1, c2, c12, n) 71 | # only consider bigrams where LLR > 0, ie P(H1) > P(H0) 72 | likelyPhrases = nltk.defaultdict(float) 73 | likelyPhrases.update([(k, v) for (k, v) 74 | in phrases.iteritems() if len(k) == i and v > 0]) 75 | print "==== #-grams = %d ====" % (i) 76 | sortedPhrases = sorted(likelyPhrases.items(), 77 | key=operator.itemgetter(1), reverse=True) 78 | for k, v in sortedPhrases: 79 | print k, v 80 | prevGramFD = ngramFD 81 | 82 | if __name__ == "__main__": 83 | main() -------------------------------------------------------------------------------- /src/phrases/preprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import json 4 | import sys 5 | 6 | def main(): 7 | if len(sys.argv) != 2: 8 | print "Usage: %s /path/to/twitter/json/list.txt" 9 | sys.exit(-1) 10 | fin = open(sys.argv[1], 'rb') 11 | fout = open("twitter_messages.txt", 'wb') 12 | for line in fin: 13 | try: 14 | data = json.loads(line.strip()) 15 | lang = data["lang"] 16 | if lang == "en": 17 | tweet = data["text"] 18 | tweet = tweet.replace("\n", " ").replace("\\s+", " ") 19 | tweet = tweet.encode("ascii", "ignore") 20 | if len(tweet) == 0: 21 | continue 22 | fout.write("%s\n" % (tweet)) 23 | except KeyError: 24 | continue 25 | fin.close() 26 | fout.close() 27 | 28 | 29 | if __name__ == "__main__": 30 | main() -------------------------------------------------------------------------------- /src/sameword/same_word_finder.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from nltk.corpus import wordnet as wn 3 | import sys 4 | 5 | def similarity(w1, w2, sim=wn.path_similarity): 6 | synsets1 = wn.synsets(w1) 7 | synsets2 = wn.synsets(w2) 8 | sim_scores = [] 9 | for synset1 in synsets1: 10 | for synset2 in synsets2: 11 | sim_scores.append(sim(synset1, synset2)) 12 | if len(sim_scores) == 0: 13 | return 0 14 | else: 15 | return max(sim_scores) 16 | 17 | def main(): 18 | f = open(sys.argv[1], 'rb') 19 | for line in f: 20 | (word1, word2) = line.strip().split("\t") 21 | if similarity(word1, word2) != 1.0: 22 | print word1 23 | f.close() 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /src/sameword/test.dat: -------------------------------------------------------------------------------- 1 | favour favor 2 | favourite favorite 3 | colour color 4 | four for 5 | humeri humerus 6 | femora femur 7 | insomnia insomnium 8 | media medium 9 | 
-------------------------------------------------------------------------------- /src/similar-tweets-nmslib/01-load-sqlite3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sqlite3 3 | 4 | ################################################################## 5 | # Script to read and parse multiple tweet files and load them 6 | # into a SQLite3 DB for later retrieval. 7 | ################################################################## 8 | 9 | DATA_DIR = "../data" 10 | INPUT_DIR = os.path.join(DATA_DIR, "Health-Tweets") 11 | DB_FILE = os.path.join(DATA_DIR, "tweets.db") 12 | 13 | # create database 14 | conn = sqlite3.connect(DB_FILE) 15 | 16 | # create table if not exists 17 | cur = conn.cursor() 18 | try: 19 | cur.execute(""" 20 | CREATE TABLE IF NOT EXISTS tweets ( 21 | t_id VARCHAR(32) NOT NULL, 22 | t_dttm VARCHAR(50) NOT NULL, 23 | t_text VARCHAR(255) NOT NULL 24 | ) 25 | """) 26 | cur.execute(""" 27 | CREATE UNIQUE INDEX IF NOT EXISTS ix_tweets ON tweets(t_id) 28 | """) 29 | except sqlite3.Error as e: 30 | print("Failed to create table tweets and unique index") 31 | raise e 32 | finally: 33 | if cur: cur.close() 34 | 35 | 36 | num_written = 0 37 | insert_sql = """ 38 | INSERT INTO tweets(t_id, t_dttm, t_text) VALUES (?, ?, ?) 39 | """ 40 | for filename in os.listdir(INPUT_DIR): 41 | print("Now processing: {:s}".format(filename)) 42 | fin = open(os.path.join(INPUT_DIR, filename), "r", encoding="utf8") 43 | for line in fin: 44 | cols = line.strip().split('|') 45 | if len(cols) != 3: 46 | continue 47 | if num_written % 1000 == 0: 48 | print("{:d} rows added".format(num_written)) 49 | conn.commit() 50 | t_id, t_dttm, t_text = cols 51 | t_text = " ".join([w for w in t_text.split() if not w.startswith("http://")]) 52 | # print(t_id, t_dttm, t_text) 53 | try: 54 | cur = conn.cursor() 55 | cur.execute(insert_sql, (t_id, t_dttm, t_text)) 56 | except sqlite3.Error as e: 57 | print("Error inserting data") 58 | raise e 59 | finally: 60 | if cur: cur.close() 61 | num_written += 1 62 | fin.close() 63 | 64 | print("{:d} rows added, COMPLETE".format(num_written)) 65 | conn.commit() 66 | -------------------------------------------------------------------------------- /src/similar-tweets-nmslib/02-generate-vectors.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import sqlite3 5 | 6 | from bert_serving.client import BertClient 7 | 8 | ################################################################## 9 | # Script to generate BERT vectors for all the tweet texts in 10 | # SQLite3 database. 
11 | ################################################################## 12 | 13 | bc = BertClient() 14 | 15 | conn = sqlite3.connect("tweets.db") 16 | 17 | fout = open("vectors.tsv", "w") 18 | 19 | num_processed = 0 20 | select_sql = """SELECT t_id, t_dttm, t_text FROM tweets""" 21 | cur = conn.cursor() 22 | cur.execute(select_sql) 23 | for row in cur.fetchall(): 24 | if num_processed % 1000 == 0: 25 | print("{:d} rows processed".format(num_processed)) 26 | try: 27 | embeddings = bc.encode([row[2]]) 28 | except ValueError: 29 | continue 30 | t_vec = ",".join(["{:3e}".format(e) for e in embeddings[0].tolist()]) 31 | fout.write("{:s}\t{:s}\n".format(row[0], t_vec)) 32 | num_processed += 1 33 | 34 | print("{:d} rows processed, COMPLETE".format(num_processed)) 35 | 36 | fout.close() 37 | cur.close() 38 | conn.close() 39 | 40 | 41 | -------------------------------------------------------------------------------- /src/similar-tweets-nmslib/03-query-times.py: -------------------------------------------------------------------------------- 1 | import nmslib 2 | import numpy as np 3 | import os 4 | import sqlite3 5 | import time 6 | 7 | DATA_DIR = "../data" 8 | 9 | TWEET_DB = os.path.join(DATA_DIR, "tweets.db") 10 | VECTORS_FILE = os.path.join(DATA_DIR, "vectors.tsv") 11 | RESULTS_FILE = os.path.join(DATA_DIR, "results.tsv") 12 | NMS_INDEX = os.path.join(DATA_DIR, "tweet-vectors.index") 13 | 14 | MAX_NUM_VECTORS = 63111 15 | INDEX_SAMPLES = 63111 16 | QUERY_SAMPLES = 50 17 | 18 | 19 | def lookup_tweet_by_id(tweet_id): 20 | try: 21 | conn = sqlite3.connect(TWEET_DB) 22 | cur = conn.cursor() 23 | cur.execute("""SELECT t_text FROM tweets WHERE t_id = '%s' """ % (tweet_id)) 24 | row = cur.fetchone() 25 | return row[0] 26 | except sqlite3.Error as e: 27 | raise e 28 | finally: 29 | if cur: cur.close() 30 | if conn: conn.close() 31 | 32 | 33 | # build vector data for required number of samples 34 | index_positions = set( 35 | np.random.random_integers(low=0, high=MAX_NUM_VECTORS, size=INDEX_SAMPLES) 36 | .tolist()) 37 | query_positions = set( 38 | np.random.random_integers(low=0, high=MAX_NUM_VECTORS, size=QUERY_SAMPLES) 39 | .tolist()) 40 | 41 | index_vecs = np.empty((INDEX_SAMPLES, 768)) 42 | query_vecs = np.empty((QUERY_SAMPLES, 768)) 43 | index_pos2id, query_pos2id = {}, {} 44 | 45 | fvec = open(VECTORS_FILE, "r") 46 | curr_index_position, curr_query_position = 0, 0 47 | for lid, line in enumerate(fvec): 48 | if lid in index_positions or lid in query_positions: 49 | t_id, t_vec = line.strip().split('\t') 50 | t_vec_arr = np.array([float(v) for v in t_vec.split(',')]) 51 | if lid in index_positions: 52 | index_vecs[curr_index_position] = t_vec_arr 53 | index_pos2id[curr_index_position] = t_id 54 | curr_index_position += 1 55 | else: # lid in query_positions: 56 | query_vecs[curr_query_position] = t_vec_arr 57 | query_pos2id[curr_query_position] = t_id 58 | curr_query_position += 1 59 | else: 60 | continue 61 | 62 | fvec.close() 63 | 64 | # load 65 | start_tm = time.time() 66 | index = nmslib.init(method='hnsw', space='cosinesimil') 67 | index.addDataPointBatch(index_vecs) 68 | index.createIndex({'post': 2}, print_progress=True) 69 | elapsed_tm = time.time() - start_tm 70 | print("load elapsed time (s): {:.3f}".format(elapsed_tm)) 71 | 72 | index.saveIndex(NMS_INDEX, save_data=True) 73 | 74 | fout = open(RESULTS_FILE, "w") 75 | query_times = [] 76 | for i in range(query_vecs.shape[0]): 77 | try: 78 | start_tm = time.time() 79 | q_tid = query_pos2id[i] 80 | q_text = lookup_tweet_by_id(q_tid) 81 | 
fout.write("query: {:s} ({:s})\n".format(q_text, q_tid)) 82 | rids, distances = index.knnQuery(query_vecs[i], k=10) 83 | for rid, distance in zip(rids, distances): 84 | r_tid = index_pos2id[rid] 85 | r_text = lookup_tweet_by_id(r_tid) 86 | fout.write("{:.3f} {:s} {:s}\n".format(distance, r_tid, r_text)) 87 | query_times.append(time.time() - start_tm) 88 | except KeyError: 89 | continue 90 | 91 | fout.close() 92 | print("average query elapsed time (s): {:.3f}".format(sum(query_times) / len(query_times))) -------------------------------------------------------------------------------- /src/similar-tweets-nmslib/04-chart-times.py: -------------------------------------------------------------------------------- 1 | 2 | #%% 3 | import matplotlib.pyplot as plt 4 | 5 | num_recs = [1000, 5000, 10000, 20000, 40000, 63111] 6 | index_load_times = [0.163, 1.471, 4.655, 11.402, 28.614, 53.969] 7 | # query_times = [0.018, 0.013, 0.020, 0.051, 0.033, 0.054] 8 | query_times = [0.018, 0.020, 0.028, 0.045, 0.051, 0.054] 9 | 10 | plt.plot(num_recs, index_load_times, marker="o", color="r") 11 | plt.title("Index Load times") 12 | plt.xlabel("number of records") 13 | plt.ylabel("load time (s)") 14 | plt.show() 15 | 16 | # plt.plot(num_recs, query_times, marker="o", color="r") 17 | # plt.title("Query times") 18 | # plt.xlabel("number of records") 19 | # plt.ylabel("query time (s)") 20 | # plt.show() 21 | 22 | 23 | # %% 24 | -------------------------------------------------------------------------------- /src/similar-tweets-nmslib/README.md: -------------------------------------------------------------------------------- 1 | Supporting code for my blog post [Finding Similar Tweets with BERT and NMSLib](https://sujitpal.blogspot.com/2019/12/finding-similar-tweets-with-bert-and.html). 
2 | -------------------------------------------------------------------------------- /src/stlclust/cluster_titles.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from sklearn.cluster import DBSCAN 4 | from scipy.io import mmread 5 | 6 | OUTPUT_DIR = "../../data/stlclust" 7 | 8 | X = mmread(os.path.join(OUTPUT_DIR, "stitles.mtx")) 9 | clust = DBSCAN(eps=0.1, min_samples=5, metric="precomputed") 10 | clust.fit(X) 11 | 12 | # print cluster report 13 | stitles = [] 14 | ftitles = open(os.path.join(OUTPUT_DIR, "stitles.txt"), 'rb') 15 | for line in ftitles: 16 | stitles.append(line.strip().split("\t")[0]) 17 | ftitles.close() 18 | 19 | preds = clust.labels_ 20 | clabels = np.unique(preds) 21 | for i in range(clabels.shape[0]): 22 | if clabels[i] < 0: 23 | continue 24 | cmem_ids = np.where(preds == clabels[i])[0] 25 | cmembers = [] 26 | for cmem_id in cmem_ids: 27 | cmembers.append(stitles[cmem_id]) 28 | print "Cluster#%d: %s" % (i, ", ".join(cmembers)) 29 | 30 | -------------------------------------------------------------------------------- /src/stlclust/extract_stl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import nltk 3 | import string 4 | import re 5 | from operator import itemgetter 6 | 7 | INPUT_DIR = "/home/sujit/Projects/med_data/mtcrawler/texts" 8 | OUTPUT_DIR = "../../data/stlclust" 9 | PUNCTUATIONS = set([c for c in string.punctuation]) 10 | DIGITS = set([c for c in string.digits]) 11 | BULLETS = re.compile("[0-9IVXA-Za-z]{0,3}\.") 12 | PUNCTS = re.compile(r"[" + string.punctuation + "]") 13 | 14 | def find_first(line, cs): 15 | idxs = [] 16 | for c in cs: 17 | c_index = line.find(c) 18 | if c_index > -1: 19 | # if this occurs after an existing punctuation, then discard 20 | prev_chars = set([pc for pc in line[0:c_index - 1]]) 21 | if len(PUNCTUATIONS.intersection(prev_chars)) > 0: 22 | return -1 23 | # make sure this position is either EOL or followed by space 24 | if c_index + 1 == len(line) or line[c_index + 1] == ' ': 25 | idxs.append(c_index) 26 | if len(idxs) == 0: 27 | return -1 28 | else: 29 | return min(idxs) 30 | 31 | stfd = nltk.FreqDist() 32 | for filename in os.listdir(INPUT_DIR): 33 | f = open(os.path.join(INPUT_DIR, filename), 'rb') 34 | for line in f: 35 | line = line.strip() 36 | if len(line) == 0: 37 | continue 38 | # Isolate section titles from text. Titles are leading phrases 39 | # terminated by colon or hyphen. 
Usually all-caps but can be in 40 | # mixed-case also 41 | sec_title = None 42 | corh = find_first(line, [":", "-"]) 43 | if corh > -1: 44 | sec_title = line[0:corh] 45 | # Alternatively, if the line is all caps, then it is also a 46 | # section title 47 | if sec_title is None and line.upper() == line: 48 | sec_title = line 49 | if sec_title is not None: 50 | # Remove retrieved titles with leading arabic number, roman number 51 | # and alpha bullets (allow max 3) bullets 52 | if re.match(BULLETS, sec_title) is not None: 53 | continue 54 | # Remove sections that look like dates (all numbers once puncts) 55 | # are removed 56 | if re.sub(PUNCTS, "", sec_title).isdigit(): 57 | continue 58 | # if retrieved title is mixed case remove any that have > 4 words 59 | if sec_title != sec_title.upper() and len(sec_title.split()) > 4: 60 | continue 61 | # if retrieved title contains special chars, remove 62 | if "," in sec_title: 63 | continue 64 | # replace "diagnoses" with "diagnosis" 65 | sec_title = re.sub("DIAGNOSES", "DIAGNOSIS", sec_title) 66 | stfd[sec_title] += 1 67 | f.close() 68 | 69 | # output the frequency distribution 70 | fout = open(os.path.join(OUTPUT_DIR, "stitles.txt"), 'wb') 71 | for k, v in sorted(stfd.items(), key=itemgetter(1), reverse=True): 72 | fout.write("%s\t%d\n" % (k, v)) 73 | fout.close() 74 | -------------------------------------------------------------------------------- /src/stlclust/fuzz_similarity.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import os 3 | import numpy as np 4 | from scipy.io import mmwrite 5 | from fuzzywuzzy import fuzz 6 | 7 | OUTPUT_DIR = "../../data/stlclust" 8 | 9 | def compute_similarity(s1, s2): 10 | return 1.0 - (0.01 * max( 11 | fuzz.ratio(s1, s2), 12 | fuzz.token_sort_ratio(s1, s2), 13 | fuzz.token_set_ratio(s1, s2))) 14 | 15 | 16 | cutoff = 2 17 | stitles = [] 18 | fin = open(os.path.join(OUTPUT_DIR, "stitles.txt"), 'rb') 19 | for line in fin: 20 | stitle, count = line.strip().split("\t") 21 | if int(count) < cutoff: 22 | continue 23 | stitles.append(stitle) 24 | fin.close() 25 | 26 | X = np.zeros((len(stitles), len(stitles))) 27 | for i in range(len(stitles)): 28 | if i > 0 and i % 10 == 0: 29 | print "Processed %d/%d rows of data" % (i, X.shape[0]) 30 | for j in range(len(stitles)): 31 | if X[i, j] == 0.0: 32 | X[i, j] = compute_similarity(stitles[i].lower(), stitles[j].lower()) 33 | X[j, i] = X[i, j] 34 | 35 | # write to Matrix Market format for passing to DBSCAN 36 | mmwrite(os.path.join(OUTPUT_DIR, "stitles.mtx"), X) 37 | 38 | -------------------------------------------------------------------------------- /src/topicmodel/bok_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import os 4 | import gensim 5 | 6 | def iter_docs(topdir): 7 | for f in os.listdir(topdir): 8 | fin = open(os.path.join(topdir, f), 'rb') 9 | text = fin.read() 10 | fin.close() 11 | yield (x for x in text.split(" ")) 12 | 13 | class MyBokCorpus(object): 14 | 15 | def __init__(self, topdir): 16 | self.topdir = topdir 17 | self.dictionary = gensim.corpora.Dictionary(iter_docs(topdir)) 18 | 19 | def __iter__(self): 20 | for tokens in iter_docs(self.topdir): 21 | yield self.dictionary.doc2bow(tokens) 22 | 23 | logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", 24 | level=logging.INFO) 25 | BOK_DIR = "/Users/palsujit/Projects/med_data/mtcrawler/kea_keys" 26 | MODELS_DIR = 
"models" 27 | 28 | corpus = MyBokCorpus(BOK_DIR) 29 | tfidf = gensim.models.TfidfModel(corpus, normalize=True) 30 | corpus_tfidf = tfidf[corpus] 31 | 32 | corpus.dictionary.save(os.path.join(MODELS_DIR, "bok.dict")) 33 | gensim.corpora.MmCorpus.serialize(os.path.join(MODELS_DIR, "bok.mm"), 34 | corpus_tfidf) -------------------------------------------------------------------------------- /src/topicmodel/bow_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import nltk 4 | import gensim 5 | 6 | def iter_docs(topdir, stoplist): 7 | for fn in os.listdir(topdir): 8 | fin = open(os.path.join(topdir, fn), 'rb') 9 | text = fin.read() 10 | fin.close() 11 | yield (x for x in 12 | gensim.utils.tokenize(text, lowercase=True, deacc=True, errors="ignore") 13 | if x not in stoplist) 14 | 15 | class MyCorpus(object): 16 | 17 | def __init__(self, topdir, stoplist): 18 | self.topdir = topdir 19 | self.stoplist = stoplist 20 | self.dictionary = gensim.corpora.Dictionary(iter_docs(topdir, stoplist)) 21 | 22 | def __iter__(self): 23 | for tokens in iter_docs(self.topdir, self.stoplist): 24 | yield self.dictionary.doc2bow(tokens) 25 | 26 | 27 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 28 | 29 | TEXTS_DIR = "/home/sujit/Projects/mlia-examples/data/mtcrawler/gensim" 30 | MODELS_DIR = "models" 31 | 32 | stoplist = set(nltk.corpus.stopwords.words("english")) 33 | corpus = MyCorpus(TEXTS_DIR, stoplist) 34 | 35 | corpus.dictionary.save(os.path.join(MODELS_DIR, "mtsamples.dict")) 36 | gensim.corpora.MmCorpus.serialize(os.path.join(MODELS_DIR, "mtsamples.mm"), corpus) 37 | -------------------------------------------------------------------------------- /src/topicmodel/gensim_preprocess.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | JSONS_DIR = "/home/sujit/Projects/mlia-examples/data/mtcrawler/jsons" 5 | TEXTS_DIR = "/home/sujit/Projects/mlia-examples/data/mtcrawler/gensim" 6 | 7 | for fn in os.listdir(JSONS_DIR): 8 | print "Converting JSON: %s" % (fn) 9 | fjson = open(os.path.join(JSONS_DIR, fn), 'rb') 10 | data = json.load(fjson) 11 | fjson.close() 12 | tfn = os.path.splitext(fn)[0] + ".txt" 13 | ftext = open(os.path.join(TEXTS_DIR, tfn), 'wb') 14 | ftext.write(data["text"].encode("utf-8")) 15 | ftext.close() 16 | -------------------------------------------------------------------------------- /src/topicmodel/gensim_word2vec.py: -------------------------------------------------------------------------------- 1 | import string 2 | import nltk 3 | import numpy as np 4 | from cStringIO import StringIO 5 | from gensim.models import word2vec 6 | import logging 7 | logging.basicConfig(format="%(asctime)s: %(levelname)s : %(message)s", 8 | level=logging.INFO) 9 | 10 | # load data 11 | fin = open("/home/sujit/Projects/scalcium/src/main/resources/langmodel/raw_sentences.txt", 'rb') 12 | puncts = set([c for c in string.punctuation]) 13 | sentences = [] 14 | for line in fin: 15 | # each sentence is a list of words, we lowercase and remove punctuations 16 | # same as the Scala code 17 | sentences.append([w for w in nltk.word_tokenize(line.strip().lower()) 18 | if w not in puncts]) 19 | fin.close() 20 | 21 | # train word2vec with sentences 22 | model = word2vec.Word2Vec(sentences, size=100, window=4, min_count=1, workers=4) 23 | model.init_sims(replace=True) 24 | 25 | # find 10 words closest to "day" 26 | print "words most 
similar to 'day':" 27 | print model.most_similar(positive=["day"], topn=10) 28 | 29 | # find closest word to "he" 30 | print "words most similar to 'he':" 31 | print model.most_similar(positive=["he"], topn=1) 32 | 33 | # for each word in the vocabulary, write out the word vectors to a file 34 | fvec = open("/tmp/word_vectors.txt", 'wb') 35 | for word in model.vocab.keys(): 36 | vec = model[word] 37 | for i in range(vec.shape[0]): 38 | s = StringIO() 39 | np.savetxt(s, vec, fmt="%.5f", newline=",") 40 | fvec.write("%s%s\n" % (s.getvalue(), word)) 41 | fvec.close() 42 | -------------------------------------------------------------------------------- /src/topicmodel/kea_preprocess.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | import shutil 5 | 6 | JSONS_DIR = "/Users/palsujit/Projects/med_data/mtcrawler/jsons" 7 | KEA_TRAIN_DIR = "/Users/palsujit/Projects/med_data/mtcrawler/kea/train" 8 | KEA_TEST_DIR = "/Users/palsujit/Projects/med_data/mtcrawler/kea/test" 9 | 10 | shutil.rmtree(KEA_TRAIN_DIR) 11 | shutil.rmtree(KEA_TEST_DIR) 12 | os.mkdir(KEA_TRAIN_DIR) 13 | os.mkdir(KEA_TEST_DIR) 14 | os.mkdir(os.path.join(KEA_TEST_DIR, "keys")) 15 | 16 | for filename in os.listdir(JSONS_DIR): 17 | print "Converting %s..." % (filename) 18 | fjson = open(os.path.join(JSONS_DIR, filename), 'rb') 19 | data = json.load(fjson) 20 | fjson.close() 21 | basename = os.path.splitext(filename)[0] 22 | # do a 30/70 split for training vs test 23 | train = random.uniform(0, 1) <= 0.1 24 | txtdir = KEA_TRAIN_DIR if train else KEA_TEST_DIR 25 | ftxt = open(os.path.join(txtdir, basename + ".txt"), 'wb') 26 | ftxt.write(data["text"].encode("utf-8")) 27 | ftxt.close() 28 | # write keywords 29 | keydir = KEA_TRAIN_DIR if train else os.path.join(KEA_TEST_DIR, "keys") 30 | fkey = open(os.path.join(keydir, basename + ".key"), 'wb') 31 | keywords = data["keywords"] 32 | for keyword in keywords: 33 | fkey.write("%s\n" % (keyword.encode("utf-8"))) 34 | fkey.close() 35 | 36 | -------------------------------------------------------------------------------- /src/topicmodel/keywords_merge.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import nltk 3 | import os 4 | 5 | USER_KEYWORDS_DIR = "/Users/palsujit/Projects/med_data/mtcrawler/kea/test" 6 | KEA_KEYWORDS_DIR = "/Users/palsujit/Projects/med_data/mtcrawler/kea/test/keys" 7 | KEYWORDS_FILE = "/Users/palsujit/Projects/med_data/mtcrawler/kea/merged_keys.txt" 8 | CUSTOM_STOPWORDS = ["patient", "normal", "mg"] 9 | 10 | def main(): 11 | 12 | # get set of english keywords from NLTK 13 | stopwords = set(nltk.corpus.stopwords.words("english")) 14 | # add own corpus-based stopwords based on high IDF words 15 | for custom_stopword in CUSTOM_STOPWORDS: 16 | stopwords.add(custom_stopword) 17 | 18 | keywords = set() 19 | for f in os.listdir(USER_KEYWORDS_DIR): 20 | # only select the .key files 21 | if f.endswith(".txt") or f == "keys": 22 | continue 23 | fusr = open(os.path.join(USER_KEYWORDS_DIR, f), 'rb') 24 | for line in fusr: 25 | line = line.strip().lower() 26 | if line in keywords: 27 | continue 28 | keywords.add(line) 29 | fusr.close() 30 | for f in os.listdir(KEA_KEYWORDS_DIR): 31 | fkea = open(os.path.join(KEA_KEYWORDS_DIR, f), 'rb') 32 | for line in fkea: 33 | keywords.add(line.strip()) 34 | fkea.close() 35 | fmrg = open(KEYWORDS_FILE, 'wb') 36 | for keyword in keywords: 37 | if keyword in stopwords: 38 | continue 39 | 
fmrg.write("%s\n" % (keyword)) 40 | fmrg.close() 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /src/topicmodel/lda_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import gensim 4 | 5 | MODELS_DIR = "models" 6 | NUM_TOPICS = 4 7 | 8 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 9 | 10 | dictionary = gensim.corpora.Dictionary.load(os.path.join(MODELS_DIR, "bok.dict")) 11 | corpus = gensim.corpora.MmCorpus(os.path.join(MODELS_DIR, "bok.mm")) 12 | 13 | # Project to LDA space 14 | lda = gensim.models.LdaModel(corpus, id2word=dictionary, 15 | iterations=300, 16 | num_topics=NUM_TOPICS) 17 | 18 | ftt = open(os.path.join(MODELS_DIR, "topic_terms.csv"), 'wb') 19 | for topic_id in range(NUM_TOPICS): 20 | term_probs = lda.show_topic(topic_id, topn=50) 21 | for prob, term in term_probs: 22 | ftt.write("%d\t%s\t%.3f\n" % (topic_id, term.replace("_", " "), prob)) 23 | ftt.close() 24 | 25 | fdt = open(os.path.join(MODELS_DIR, "doc_topics.csv"), 'wb') 26 | for doc_id in range(len(corpus)): 27 | docbok = corpus[doc_id] 28 | doc_topics = lda.get_document_topics(docbok) 29 | for topic_id, topic_prob in doc_topics: 30 | fdt.write("%d\t%d\t%.3f\n" % (doc_id, topic_id, topic_prob)) 31 | fdt.close() 32 | -------------------------------------------------------------------------------- /src/topicmodel/lsi_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import gensim 4 | 5 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 6 | 7 | MODELS_DIR = "models" 8 | 9 | dictionary = gensim.corpora.Dictionary.load(os.path.join(MODELS_DIR, "bok.dict")) 10 | corpus = gensim.corpora.MmCorpus(os.path.join(MODELS_DIR, "bok.mm")) 11 | 12 | tfidf = gensim.models.TfidfModel(corpus, normalize=True) 13 | corpus_tfidf = tfidf[corpus] 14 | 15 | # project to 2 dimensions for visualization 16 | lsi = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) 17 | 18 | # write out coordinates to file 19 | fcoords = open(os.path.join(MODELS_DIR, "coords.csv"), 'wb') 20 | for vector in lsi[corpus]: 21 | if len(vector) != 2: 22 | continue 23 | fcoords.write("%6.4f\t%6.4f\n" % (vector[0][1], vector[1][1])) 24 | fcoords.close() 25 | 26 | -------------------------------------------------------------------------------- /src/topicmodel/num_topics.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.cluster import KMeans 5 | 6 | MODELS_DIR = "models" 7 | MAX_K = 10 8 | 9 | X = np.loadtxt(os.path.join(MODELS_DIR, "coords.csv"), delimiter="\t") 10 | ks = range(1, MAX_K + 1) 11 | 12 | inertias = np.zeros(MAX_K) 13 | diff = np.zeros(MAX_K) 14 | diff2 = np.zeros(MAX_K) 15 | diff3 = np.zeros(MAX_K) 16 | for k in ks: 17 | kmeans = KMeans(k).fit(X) 18 | inertias[k - 1] = kmeans.inertia_ 19 | # first difference 20 | if k > 1: 21 | diff[k - 1] = inertias[k - 1] - inertias[k - 2] 22 | # second difference 23 | if k > 2: 24 | diff2[k - 1] = diff[k - 1] - diff[k - 2] 25 | # third difference 26 | if k > 3: 27 | diff3[k - 1] = diff2[k - 1] - diff2[k - 2] 28 | 29 | elbow = np.argmin(diff3[3:]) + 3 30 | 31 | plt.plot(ks, inertias, "b*-") 32 | plt.plot(ks[elbow], inertias[elbow], marker='o', 
markersize=12, 33 | markeredgewidth=2, markeredgecolor='r', markerfacecolor=None) 34 | plt.ylabel("Inertia") 35 | plt.xlabel("K") 36 | plt.show() 37 | -------------------------------------------------------------------------------- /src/topicmodel/viz_doctopic_distrib.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | import matplotlib.pyplot as plt 4 | import pandas as pd 5 | import os 6 | MODELS_DIR = "models" 7 | NUM_TOPICS = 4 8 | 9 | dtdf = pd.read_csv(os.path.join(MODELS_DIR, "doc_topics.csv"), sep="\t", 10 | names=["doc_id", "topic_id", "topic_prob"], 11 | skiprows=0) 12 | # Choose 6 documents randomly for analysis 13 | max_doc_id = dtdf["doc_id"].max() 14 | doc_ids = [] 15 | for i in range(6): 16 | doc_ids.append(int(random.random() * max_doc_id)) 17 | 18 | for doc_id in doc_ids: 19 | filt = dtdf[dtdf["doc_id"] == doc_id] 20 | topic_ids = filt["topic_id"].tolist() 21 | topic_probs = filt["topic_prob"].tolist() 22 | prob_dict = dict(zip(topic_ids, topic_probs)) 23 | ys = [] 24 | for i in range(NUM_TOPICS): 25 | if prob_dict.has_key(i): 26 | ys.append(prob_dict[i]) 27 | else: 28 | ys.append(0.0) 29 | plt.title("Document #%d" % (doc_id)) 30 | plt.ylabel("P(topic)") 31 | plt.ylim(0.0, 1.0) 32 | plt.xticks(range(NUM_TOPICS), 33 | ["Topic#%d" % (x) for x in range(NUM_TOPICS)]) 34 | plt.grid(True) 35 | plt.bar(range(NUM_TOPICS), ys, align="center") 36 | plt.show() 37 | 38 | -------------------------------------------------------------------------------- /src/topicmodel/viz_topics_scatter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.cluster import KMeans 5 | 6 | MODELS_DIR = "models" 7 | NUM_TOPICS = 4 8 | 9 | X = np.loadtxt(os.path.join(MODELS_DIR, "coords.csv"), delimiter="\t") 10 | kmeans = KMeans(NUM_TOPICS).fit(X) 11 | y = kmeans.labels_ 12 | 13 | colors = ["b", "g", "r", "m"] 14 | for i in range(X.shape[0]): 15 | plt.scatter(X[i][0], X[i][1], c=colors[y[i]], s=10) 16 | plt.show() 17 | -------------------------------------------------------------------------------- /src/topicmodel/viz_topics_wordcloud.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import os 3 | import pandas as pd 4 | import wordcloud 5 | 6 | MODELS_DIR = "models" 7 | 8 | ttdf = pd.read_csv(os.path.join(MODELS_DIR, "topic_terms.csv"), 9 | sep="\t", skiprows=0, names=["topic_id", "term", "prob"]) 10 | topics = ttdf.groupby("topic_id").groups 11 | for topic in topics.keys(): 12 | row_ids = topics[topic] 13 | freqs = [] 14 | for row_id in row_ids: 15 | row = ttdf.ix[row_id] 16 | freqs.append((row["term"], row["prob"])) 17 | wc = wordcloud.WordCloud() 18 | elements = wc.fit_words(freqs) 19 | plt.figure(figsize=(5, 5)) 20 | plt.imshow(wc) 21 | plt.axis("off") 22 | plt.show() -------------------------------------------------------------------------------- /src/topicmodel/word2vec_cluster_plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | fig = plt.figure() 4 | ax = fig.add_subplot(111) 5 | fin = open("../../data/word_som_online.txt", 'rb') 6 | for line in fin: 7 | word, x, y = line.strip().split("\t") 8 | ax.text(int(x), int(y), word) 9 | fin.close() 10 | ax.axis([0, 50, 0, 50]) 11 | plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1) 12 | plt.title("Word
Clusters (Online Training)") 13 | plt.grid() 14 | plt.show() 15 | --------------------------------------------------------------------------------