├── 1_spacy_intro.ipynb
├── 2_spacy_pubmed.ipynb
├── 3_spacy_pubmed_model.ipynb
├── 4_SparkNLP_intro.ipynb
├── 5_SparkNLP_pubmed_model.ipynb
├── Dockerfile
├── Installation instructions.pdf
├── NLU at Scale with spaCy and Spark NLP - Feb 2018.pptx
└── README.md
/1_spacy_intro.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from __future__ import print_function"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "
'\n",
279 | "# return html"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {
286 | "collapsed": false,
287 | "scrolled": true
288 | },
289 | "outputs": [],
290 | "source": [
291 | "def process_document(id,txt,tagger):\n",
292 | " stpwds = set()\n",
293 | " #run the document through the NLP pipeline\n",
294 | " doc = tagger.nlp(txt)\n",
295 | " \n",
296 | " #create a list with the indeces of the stop words\n",
297 | " for token in doc:\n",
298 | " if token.is_stop:\n",
299 | " stpwds.add(token.idx)\n",
300 | " #run the UMLS tagger \n",
301 | " matches= tagger.match(txt, best_match=True, ignore_syntax=True)\n",
302 | " data = []\n",
303 | "\n",
304 | " for match in matches:\n",
305 | " semtypes = set()\n",
306 | " term = ''\n",
307 | " cui = ''\n",
308 | " ngram = ''\n",
309 | " mi=0\n",
310 | " #for every match collect all the semantic types\n",
311 | " #keep only the term with the highest macthing score (similarity)\n",
312 | " \n",
313 | " for m in match:\n",
314 | " for s in m['semtypes']:\n",
315 | " semtypes.add(s)\n",
316 | " if m['similarity']>mi:\n",
317 | " term = m['term']\n",
318 | " cui = m['cui']\n",
319 | " mi=m['similarity']\n",
320 | " ngram = m['ngram']\n",
321 | " \n",
322 | " #filter out terms shorter than 3 chars\n",
323 | " if len(term)<=2:\n",
324 | " continue\n",
325 | " #filter out stop words\n",
326 | " if match[0]['start'] in stpwds:\n",
327 | " continue\n",
328 | " \n",
329 | " tmp=[]\n",
330 | " tmp.append(id)\n",
331 | " tmp.append(match[0]['start'])\n",
332 | " tmp.append(match[0]['end'])\n",
333 | " tmp.append(term.lower())\n",
334 | " tmp.append(cui)\n",
335 | " tmp.append(mi)\n",
336 | " stypes = set()\n",
337 | " for sem in semtypes:\n",
338 | " stypes.add(sems[sem][1])\n",
339 | " tmp.append(stypes)\n",
340 | " data.append(tmp)\n",
341 | " return data\n",
342 | "\n",
343 | "\n",
344 | "#configure the UMLS tagger to anly accept certian Semantic Types (per our problem)\n",
345 | "tagger.accepted_semtypes=set()\n",
346 | "tagger.accepted_semtypes.add('T047') #Disease or Syndrome\n",
347 | "tagger.accepted_semtypes.add('T184') # Sign or Symptom\n",
348 | "\n",
349 | "for sem in sems:\n",
350 | " if sems[sem][0]=='Anatomy':\n",
351 | " tagger.accepted_semtypes.add(sem)\n",
352 | "for sem in sems:\n",
353 | " if sems[sem][0]=='Chemicals & Drugs':\n",
354 | " tagger.accepted_semtypes.add(sem)\n",
355 | "\n",
356 | "#Iterate over every document and extract the concepts\n",
357 | "i=-1 \n",
358 | "result = []\n",
359 | "for idx,row in df.iterrows():\n",
360 | " try:\n",
361 | " i+=1\n",
362 | " if row['Abstract'] is None:\n",
363 | " continue\n",
364 | " annotations = process_document(i,str(row['Abstract']),tagger)\n",
365 | " result.extend(annotations)\n",
366 | " except Exception as e:\n",
367 | " print(e)\n",
368 | " \n",
369 | "df_matches = pd.DataFrame(data=result, columns =['document','start','end','term','cui','similarity','semtypes'])\n",
370 | "df_matches.sort_values(by=['document','start'],inplace=True)\n",
371 | "df_matches"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": null,
377 | "metadata": {
378 | "collapsed": true
379 | },
380 | "outputs": [],
381 | "source": [
382 | "# df_matches.to_pickle('df_matches_pubmed_diabetes_100.data')\n",
383 | "# import pickle\n",
384 | "# with open('df_matches_pubmed_diabetes_100.data','r') as f:\n",
385 | "# df_matches = pickle.load(f)"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": null,
391 | "metadata": {
392 | "collapsed": false
393 | },
394 | "outputs": [],
395 | "source": [
396 | "#construct documents using only the extracted concepts\n",
397 | "data = []\n",
398 | "for enc,items in df_matches.groupby(['document']):\n",
399 | " data.append((enc,'|'.join(items['term'].values)))\n",
400 | "new_arts = pd.DataFrame(data = data,columns=['document','content'])\n",
401 | "new_arts.head(5)"
402 | ]
403 | },
404 | {
405 | "cell_type": "code",
406 | "execution_count": null,
407 | "metadata": {
408 | "collapsed": false
409 | },
410 | "outputs": [],
411 | "source": [
412 | "most_frequent_terms=[]\n",
413 | "for term,items in df_matches.groupby(['term']):\n",
414 | " most_frequent_terms.append((term,len(items)))\n",
415 | "most_frequent_terms.sort(key=lambda tup:tup[1],reverse=True)\n",
416 | "print('Most frequent terms: ',most_frequent_terms[:10])\n",
417 | "print('Vocabulary size: ',len(most_frequent_terms))"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": null,
423 | "metadata": {
424 | "collapsed": false
425 | },
426 | "outputs": [],
427 | "source": [
428 | "#We're using Spark's MLlib implementaiton of Word2Vec\n",
429 | "\n",
430 | "#initialize the Spark context\n",
431 | "from pyspark import SparkContext\n",
432 | "from pyspark.sql import HiveContext, SQLContext\n",
433 | "sc = SparkContext('local','example_notebook')\n",
434 | "ssc = SQLContext(sc)"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": null,
440 | "metadata": {
441 | "collapsed": false
442 | },
443 | "outputs": [],
444 | "source": [
445 | "#create a SparkSql table containg the newly created documents\n",
446 | "df_new_arts = ssc.createDataFrame(new_arts[['document','content']])\n",
447 | "ssc.registerDataFrameAsTable(df_new_arts,tableName='new_arts')"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {
454 | "collapsed": false
455 | },
456 | "outputs": [],
457 | "source": [
458 | "#Train Word2Vec on this data \n",
459 | "new_notes = ssc.sql('select * from new_arts')\n",
460 | "from pyspark.mllib.feature import Word2Vec\n",
461 | "word2vec = Word2Vec()\n",
462 | "word2vec.setVectorSize(10) #embeddig vector size is 10\n",
463 | "model = word2vec.fit(new_notes.rdd.map(lambda x: x.content.split('|')))"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": null,
469 | "metadata": {
470 | "collapsed": false
471 | },
472 | "outputs": [],
473 | "source": [
474 | "%matplotlib inline\n",
475 | "import numpy as np\n",
476 | "from pylab import *\n",
477 | "from matplotlib import pyplot as plt\n",
478 | "from matplotlib.ticker import MultipleLocator, FormatStrFormatter\n",
479 | "\n",
480 | "def get_relevant_terms(term,cnt):\n",
481 | " synonyms=[]\n",
482 | " try:\n",
483 | " synonyms = model.findSynonyms(term, cnt) #get the most relevant cnt terms by computing vector distance\n",
484 | " except Exception as e:\n",
485 | " pass\n",
486 | " return synonyms\n",
487 | "\n",
488 | "# construct the represeantion of the graph of concepts relationships derived from related terms obtained from word2vec\n",
489 | "# by selecting the top 5 most relevant concepts where the releavncy is >0.5 for the most frequent diseases and drugs concepts\n",
490 | "# having a frequency > 100 in the input 100 notes\n",
491 | "\n",
492 | "import networkx as nx\n",
493 | "G=nx.Graph()\n",
494 | "d_d=set()\n",
495 | "\n",
496 | "for termf in most_frequent_terms:\n",
497 | " term = termf[0]\n",
498 | " if ' ' not in term or termf[1]<2:\n",
499 | " continue\n",
500 | " rel = get_relevant_terms(term,3)\n",
501 | " for tup in rel:\n",
502 | " if tup[1]>0.1:\n",
503 | " G.add_edge(term.replace(' ','_'),tup[0].replace(' ','_'),weight=round(tup[1],2))\n",
504 | " d_d.add(term.replace(' ','_'))\n",
505 | " \n",
506 | "d_d_l=list(d_d)\n",
507 | "try: \n",
508 | " pos = nx.spring_layout(G,iterations=100)\n",
509 | " plt.figure(figsize=(20,20))\n",
510 | " edgewidth = [ d['weight'] for (u,v,d) in G.edges(data=True)]\n",
511 | " nx.draw_networkx_nodes (G,pos,alpha=0.2,node_color='red',node_size=400)\n",
512 | " nx.draw_networkx_nodes (G,pos,alpha=0.6,nodelist=d_d_l,node_color='yellow',node_size=500)\n",
513 | " nx.draw_networkx_labels(G,pos,alpha=0.4,label_color='grey',font_size=10)\n",
514 | " nx.draw_networkx_edges(G,pos,edge_color='orange',width = 1)\n",
515 | " plt.savefig(\"disease_drugs.png\")\n",
516 | " plt.show()\n",
517 | "except Exception as e: # matplotlib not available\n",
518 | " print(e)"
519 | ]
520 | }
521 | ],
522 | "metadata": {
523 | "kernelspec": {
524 | "display_name": "Python 2",
525 | "language": "python",
526 | "name": "python2"
527 | },
528 | "language_info": {
529 | "codemirror_mode": {
530 | "name": "ipython",
531 | "version": 2
532 | },
533 | "file_extension": ".py",
534 | "mimetype": "text/x-python",
535 | "name": "python",
536 | "nbconvert_exporter": "python",
537 | "pygments_lexer": "ipython2",
538 | "version": "2.7.10"
539 | }
540 | },
541 | "nbformat": 4,
542 | "nbformat_minor": 1
543 | }
544 |
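Note on the cells above: they reference a tagger (a QuickUMLS concept matcher whose internal spaCy pipeline is used as tagger.nlp), a sems dictionary mapping UMLS semantic-type IDs to (semantic group, type name) pairs, and a DataFrame df of PubMed abstracts; all of these are created in earlier cells of the notebook that are not reproduced in this excerpt. Below is a minimal setup sketch, assuming the QuickUMLS package shipped with the tutorial's Docker image and a local UMLS installation; the import path, data path, thresholds, and example entries are illustrative assumptions, not the tutorial's actual configuration.

# Hypothetical setup sketch (not part of the original notebook).
from quickumls import QuickUMLS  # assumed import path; depends on how QuickUMLS is installed

# Placeholder path to a built QuickUMLS/UMLS index; threshold and window are illustrative.
tagger = QuickUMLS('/home/jovyan/QuickUMLS/data', threshold=0.7, window=5)
# tagger.nlp is the spaCy pipeline QuickUMLS builds internally (used above to flag stop words)

# sems maps a UMLS semantic-type ID to a (semantic group, type name) pair, for example:
sems = {
    'T047': ('Disorders', 'Disease or Syndrome'),
    'T184': ('Disorders', 'Sign or Symptom'),
    # ...remaining types would be loaded from the UMLS semantic-group definitions
}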
--------------------------------------------------------------------------------
/3_spacy_pubmed_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
How to build a document classifier ?
\n",
8 | "Problem:\n",
9 | "\n",
10 | "We want to build a model that is able to tell if a Pubmed article is refering to child or adult patient(s).\n",
11 | "\n",
12 | "Solution:\n",
13 | "\n",
14 | "We construct a training/validation set out of English only Pubmed articles and use the keywords associated with these articles to assign the labels.\n",
15 | "\n",
16 | "Using Keras with Tensorflow backend, we train a convolutional neural network on the training data. We show very good accuracy and f1-score can be obtained in 5 Epochs."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "http://biopython.org/DIST/docs/tutorial/Tutorial.html"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "collapsed": false,
31 | "scrolled": false
32 | },
33 | "outputs": [],
34 | "source": [
35 | "!pip install --upgrade numpy\n",
36 | "from __future__ import print_function\n",
37 | "search_term = 'type+1+diabetes[MH]'\n",
38 | "max_articles = 10000\n",
39 | "\n",
40 | "from Bio import Entrez\n",
41 | "print('Searching PubMed abstracts for documents containing term: ',search_term)\n",
42 | "handle = Entrez.esearch(db=\"pubmed\", term=search_term, retmax=max_articles)\n",
43 | "record = Entrez.read(handle)\n",
44 | "handle.close()\n",
45 | "idlist = record[\"IdList\"]\n",
46 | "\n",
47 | "print('Found:',len(idlist),' documents')"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {
54 | "collapsed": false
55 | },
56 | "outputs": [],
57 | "source": [
58 | "#fetching the previously found documents\n",
59 | "#select only English articles \n",
60 | "#assign labels based on the keywords assoociated with the articles\n",
61 | "from Bio import Medline\n",
62 | "handle = Entrez.efetch(db=\"pubmed\", id=idlist, rettype=\"medline\",retmode=\"text\")\n",
63 | "records = Medline.parse(handle)\n",
64 | "data = []\n",
65 | "adults =0\n",
66 | "child =0\n",
67 | "for record in records:\n",
68 | " if 'AB' not in record or record['AB'] is None:\n",
69 | " continue\n",
70 | " if len(record['LA'])==1 and record['LA'][0]=='eng':\n",
71 | " is_adult = False\n",
72 | " is_child = False\n",
73 | " for val in record['MH']:\n",
74 | " if val =='Adult':\n",
75 | " is_adult=True\n",
76 | " break\n",
77 | " if val =='Adolescent' or val=='Child':\n",
78 | " is_child=True\n",
79 | " if is_adult and is_child:\n",
80 | " continue\n",
81 | " if is_adult:\n",
82 | " adults+=1\n",
83 | " data.append((record,1))\n",
84 | " if is_child:\n",
85 | " data.append((record,0))\n",
86 | " child+=1\n",
87 | "print ('Articles about adults:',adults,' Articles about children/child:',child)"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {
94 | "collapsed": false,
95 | "scrolled": true
96 | },
97 | "outputs": [],
98 | "source": [
99 | "# #Dump the obtained data to disk in case we need to repeat the process from here on.\n",
100 | "# import dill as pickle\n",
101 | "# with open('pubmed_records.tmp','w') as f:\n",
102 | "# pickle.dump(data,f)\n",
103 | "# # pickle.load(f)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {
110 | "collapsed": false
111 | },
112 | "outputs": [],
113 | "source": [
114 | "#Split the data into trianing/test sets\n",
115 | "split=0.8\n",
116 | "train_set = data[:int(split*len(data))]\n",
117 | "test_set = data[int(split*len(data)):]"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {
124 | "collapsed": false
125 | },
126 | "outputs": [],
127 | "source": [
128 | "#separate labels \n",
129 | "x_train = [record[0].get('AB') for record in train_set]\n",
130 | "y_train = [record[1] for record in train_set]\n",
131 | "x_test = [record[0].get('AB') for record in test_set]\n",
132 | "y_test = [record[1] for record in test_set]\n",
133 | "\n",
134 | "from keras.preprocessing.text import hashing_trick\n",
135 | "\n",
136 | "max_features = 5000\n",
137 | "#Transform the input articles into number sequences by replacing each word\n",
138 | "#with it's index in a frequency list\n",
139 | "\n",
140 | "x_train = [hashing_trick(record,max_features) for record in x_train]\n",
141 | "x_test = [hashing_trick(record,max_features) for record in x_test]\n"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {
148 | "collapsed": false
149 | },
150 | "outputs": [],
151 | "source": [
152 | "from keras.preprocessing import sequence\n",
153 | "from keras.models import Sequential\n",
154 | "from keras.layers import Dense, Dropout, Activation\n",
155 | "from keras.layers import Embedding\n",
156 | "from keras.layers import Conv1D, GlobalMaxPooling1D\n",
157 | "\n",
158 | "# set parameters for our model:\n",
159 | "maxlen = 1000 #max 1000 words per article\n",
160 | "batch_size = 32 #size of the batch \n",
161 | "embedding_dims = 50 # size of the embedding vectors for each word\n",
162 | "filters = 250 #dimension of filters for the convolutional layer\n",
163 | "kernel_size = 3 #size of the kernel used in the convolutional layer\n",
164 | "hidden_dims = 250 #dimension of the hidden layer\n",
165 | "epochs = 5 #number of training epochs"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {
172 | "collapsed": true
173 | },
174 | "outputs": [],
175 | "source": []
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {
181 | "collapsed": false
182 | },
183 | "outputs": [],
184 | "source": [
185 | "print(len(x_train), 'train sequences')\n",
186 | "print(len(x_test), 'test sequences')\n",
187 | "\n",
188 | "print('Pad sequences (samples x time)')\n",
189 | "x_train = sequence.pad_sequences(x_train, maxlen=maxlen)\n",
190 | "x_test = sequence.pad_sequences(x_test, maxlen=maxlen)\n",
191 | "print('x_train shape:', x_train.shape)\n",
192 | "print('x_test shape:', x_test.shape)"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {
199 | "collapsed": false
200 | },
201 | "outputs": [],
202 | "source": [
203 | "print('Build model...')\n",
204 | "model = Sequential()\n",
205 | "\n",
206 | "# we start off with an efficient embedding layer which maps\n",
207 | "# our indices into embedding_dims dimensions\n",
208 | "model.add(Embedding(max_features,\n",
209 | " embedding_dims,\n",
210 | " input_length=maxlen\n",
211 | " ))\n",
212 | "model.add(Dropout(0.2))\n",
213 | "\n",
214 | "# we add a Convolution1D, which will learn filters\n",
215 | "# word group filters of size filter_length:\n",
216 | "model.add(Conv1D(filters,\n",
217 | " kernel_size,\n",
218 | " padding='valid',\n",
219 | " activation='relu',\n",
220 | " strides=1))\n",
221 | "# we use max pooling:\n",
222 | "model.add(GlobalMaxPooling1D())\n",
223 | "\n",
224 | "# We add a vanilla hidden layer:\n",
225 | "model.add(Dense(hidden_dims))\n",
226 | "model.add(Dropout(0.2))\n",
227 | "model.add(Activation('relu'))\n",
228 | "\n",
229 | "# We project onto a single unit output layer, and squash it with a sigmoid:\n",
230 | "model.add(Dense(1))\n",
231 | "model.add(Activation('sigmoid'))\n",
232 | "\n",
233 | "from keras import backend as K\n",
234 | "\n",
235 | "def f1(y_true, y_pred):\n",
236 | " def recall(y_true, y_pred):\n",
237 | " \"\"\"Recall metric.\n",
238 | "\n",
239 | " Only computes a batch-wise average of recall.\n",
240 | "\n",
241 | " Computes the recall, a metric for multi-label classification of\n",
242 | " how many relevant items are selected.\n",
243 | " \"\"\"\n",
244 | " true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))\n",
245 | " possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))\n",
246 | " recall = true_positives / (possible_positives + K.epsilon())\n",
247 | " return recall\n",
248 | "\n",
249 | " def precision(y_true, y_pred):\n",
250 | " \"\"\"Precision metric.\n",
251 | "\n",
252 | " Only computes a batch-wise average of precision.\n",
253 | "\n",
254 | " Computes the precision, a metric for multi-label classification of\n",
255 | " how many selected items are relevant.\n",
256 | " \"\"\"\n",
257 | " true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))\n",
258 | " predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))\n",
259 | " precision = true_positives / (predicted_positives + K.epsilon())\n",
260 | " return precision\n",
261 | " precision = precision(y_true, y_pred)\n",
262 | " recall = recall(y_true, y_pred)\n",
263 | " return 2*((precision*recall)/(precision+recall))\n",
264 | "\n",
265 | "\n",
266 | "model.compile(loss='binary_crossentropy',\n",
267 | " optimizer='adam',\n",
268 | " metrics=['accuracy',f1])\n",
269 | "\n",
270 | "model.fit(x_train, y_train,\n",
271 | " batch_size=batch_size,\n",
272 | " epochs=epochs,\n",
273 | " validation_data=(x_test, y_test))"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {
280 | "collapsed": false
281 | },
282 | "outputs": [],
283 | "source": [
284 | "# serialize model to JSON\n",
285 | "model_json = model.to_json()\n",
286 | "with open(\"model.json\", \"w\") as json_file:\n",
287 | " json_file.write(model_json)\n",
288 | "# serialize weights to HDF5\n",
289 | "model.save_weights(\"model.h5\")\n",
290 | "print(\"Saved model to disk\")"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": null,
296 | "metadata": {
297 | "collapsed": false
298 | },
299 | "outputs": [],
300 | "source": [
301 | "from keras.models import model_from_json\n",
302 | "# load json and create model\n",
303 | "json_file = open('model.json', 'r')\n",
304 | "loaded_model_json = json_file.read()\n",
305 | "json_file.close()\n",
306 | "loaded_model = model_from_json(loaded_model_json)\n",
307 | "# load weights into new model\n",
308 | "loaded_model.load_weights(\"model.h5\")\n",
309 | "print(\"Loaded model from disk\")"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": null,
315 | "metadata": {
316 | "collapsed": false
317 | },
318 | "outputs": [],
319 | "source": [
320 | "# evaluate loaded model on test data\n",
321 | "loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy',f1])\n",
322 | "score = loaded_model.evaluate(x_test, y_test, verbose=0)\n",
323 | "print('Performance of loaded model on the test-set:')\n",
324 | "print(\"%s: %.2f%%\" % (loaded_model.metrics_names[1], score[1]*100))\n",
325 | "print(\"%s: %.2f%%\" % (loaded_model.metrics_names[2], score[2]*100))"
326 | ]
327 | }
328 | ],
329 | "metadata": {
330 | "kernelspec": {
331 | "display_name": "Python 2",
332 | "language": "python",
333 | "name": "python2"
334 | },
335 | "language_info": {
336 | "codemirror_mode": {
337 | "name": "ipython",
338 | "version": 2
339 | },
340 | "file_extension": ".py",
341 | "mimetype": "text/x-python",
342 | "name": "python",
343 | "nbconvert_exporter": "python",
344 | "pygments_lexer": "ipython2",
345 | "version": "2.7.10"
346 | }
347 | },
348 | "nbformat": 4,
349 | "nbformat_minor": 1
350 | }
351 |
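A short usage sketch for the classifier trained above: with loaded_model, max_features, and maxlen still in scope (as defined in this notebook), a new abstract is hashed and padded exactly like the training data and then scored. Per the labeling cell above, 1 corresponds to articles about adults and 0 to articles about children/adolescents; the abstract text below is made up.

# Hypothetical scoring sketch (not part of the original notebook).
from keras.preprocessing.text import hashing_trick
from keras.preprocessing import sequence

new_abstract = "We followed a cohort of adolescent patients with type 1 diabetes ..."  # made-up text
seq = hashing_trick(new_abstract, max_features)   # words -> hashed integer indices
x = sequence.pad_sequences([seq], maxlen=maxlen)  # pad/truncate to the training length
prob_adult = loaded_model.predict(x)[0][0]        # sigmoid output: estimated P(adult)
print('adult' if prob_adult > 0.5 else 'child/adolescent', prob_adult)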
--------------------------------------------------------------------------------
/5_SparkNLP_pubmed_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from __future__ import division, print_function\n",
10 | "\n",
11 | "from Bio import Entrez, Medline\n",
12 | "\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "import numpy as np\n",
15 | "import pandas as pd\n",
16 | "import pyspark\n",
17 | "from pyspark.ml import Pipeline, feature as spark_ft, classification as spark_cls\n",
18 | "from sklearn import metrics as skmetrics\n",
19 | "import wordcloud\n",
20 | "\n",
21 | "%matplotlib inline"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "packages = [\n",
31 | " 'com.databricks:spark-xml_2.11:0.4.1',\n",
32 | " 'JohnSnowLabs:spark-nlp:1.4.1'\n",
33 | "]\n",
34 | "\n",
35 | "spark = pyspark.sql.SparkSession.builder \\\n",
36 | " .master('local[4]') \\\n",
37 | " .appName('notebook') \\\n",
38 | " .config('spark.jars', 'pysparknlp-1.0.0/lib/sparknlp.jar') \\\n",
39 | " .config('spark.jars.packages', ','.join(packages)) \\\n",
40 | " .getOrCreate()"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 3,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "from sparknlp.annotator import *\n",
50 | "from sparknlp.common import *\n",
51 | "from sparknlp.base import *"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 4,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "def query(terms, num_docs=1000):\n",
61 | " search_term = '+'.join(terms)\n",
62 | " print('Searching PubMed abstracts for documents containing term: ',search_term)\n",
63 | " handle = Entrez.esearch(db=\"pubmed\", term=search_term, retmax=num_docs)\n",
64 | " record = Entrez.read(handle)\n",
65 | " handle.close()\n",
66 | " idlist = record[\"IdList\"]\n",
67 | " \n",
68 | " handle = Entrez.efetch(db=\"pubmed\", id=idlist, rettype=\"medline\",retmode=\"text\")\n",
69 | " records = Medline.parse(handle)\n",
70 | " data = []\n",
71 | " for record in records:\n",
72 | " data.append((record.get(\"TI\", \"?\"),record.get(\"AU\", \"?\"),record.get(\"SO\", \"?\"),record.get(\"AB\",\"?\")))\n",
73 | "\n",
74 | " df = pd.DataFrame(data=data, columns=['Title','Authors','Source','Abstract'])\n",
75 | " df.head(10)\n",
76 | "\n",
77 | " df.replace(r'^\\?$', np.nan, regex=True, inplace=True)\n",
78 | " df['Authors'] = df['Authors'].apply(lambda x: x if isinstance(x, list) else [])\n",
79 | " df.fillna('', inplace=True)\n",
80 | " df['Topic'] = search_term\n",
81 | " \n",
82 | " return spark.createDataFrame(df)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 5,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "topics = [\n",
92 | " ['type', '1', 'diabetes'], \n",
93 | " ['creutzfeldt', 'jakob', 'disease'], \n",
94 | " ['post', 'traumatic', 'stress', 'disorder'],\n",
95 | " ['heart', 'disease']\n",
96 | "]"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 6,
102 | "metadata": {},
103 | "outputs": [
104 | {
105 | "name": "stdout",
106 | "output_type": "stream",
107 | "text": [
108 | "terms ['type', '1', 'diabetes'] num_docs 710\n",
109 | "Searching PubMed abstracts for documents containing term: type+1+diabetes\n"
110 | ]
111 | },
112 | {
113 | "name": "stderr",
114 | "output_type": "stream",
115 | "text": [
116 | "/opt/conda/lib/python3.6/site-packages/Bio/Entrez/__init__.py:564: UserWarning: \n",
117 | "Email address is not specified.\n",
118 | "\n",
119 | "To make use of NCBI's E-utilities, NCBI requires you to specify your\n",
120 | "email address with each request. As an example, if your email address\n",
121 | "is A.N.Other@example.com, you can specify it as follows:\n",
122 | " from Bio import Entrez\n",
123 | " Entrez.email = 'A.N.Other@example.com'\n",
124 | "In case of excessive usage of the E-utilities, NCBI will attempt to contact\n",
125 | "a user at the email address provided before blocking access to the\n",
126 | "E-utilities.\n",
127 | " E-utilities.\"\"\", UserWarning)\n"
128 | ]
129 | },
130 | {
131 | "name": "stdout",
132 | "output_type": "stream",
133 | "text": [
134 | "terms ['creutzfeldt', 'jakob', 'disease'] num_docs 565\n",
135 | "Searching PubMed abstracts for documents containing term: creutzfeldt+jakob+disease\n",
136 | "terms ['post', 'traumatic', 'stress', 'disorder'] num_docs 582\n",
137 | "Searching PubMed abstracts for documents containing term: post+traumatic+stress+disorder\n",
138 | "terms ['heart', 'disease'] num_docs 522\n",
139 | "Searching PubMed abstracts for documents containing term: heart+disease\n"
140 | ]
141 | }
142 | ],
143 | "source": [
144 | "texts = None\n",
145 | "\n",
146 | "np.random.seed(123)\n",
147 | "for terms in topics:\n",
148 | " num_docs = np.random.randint(200, 1000)\n",
149 | " print('terms', terms, 'num_docs', num_docs)\n",
150 | " if texts is None:\n",
151 | " texts = query(terms, num_docs)\n",
152 | " else:\n",
153 | " texts = texts.union(query(terms, num_docs))"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 7,
159 | "metadata": {},
160 | "outputs": [
161 | {
162 | "data": {
163 | "text/plain": [
164 | "2379"
165 | ]
166 | },
167 | "execution_count": 7,
168 | "metadata": {},
169 | "output_type": "execute_result"
170 | }
171 | ],
172 | "source": [
173 | "texts.count()"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 8,
179 | "metadata": {},
180 | "outputs": [
181 | {
182 | "name": "stdout",
183 | "output_type": "stream",
184 | "text": [
185 | "+--------------------+--------------------+--------------------+--------------------+---------------+\n",
186 | "| Title| Authors| Source| Abstract| Topic|\n",
187 | "+--------------------+--------------------+--------------------+--------------------+---------------+\n",
188 | "|A luciferase immu...|[Ling Y, Jiang P,...|Clin Biochem. 201...|AIM: Luciferase i...|type+1+diabetes|\n",
189 | "|Type 1 diabetes m...|[Singh RM, Howart...|Mol Cell Biochem....|There is much evi...|type+1+diabetes|\n",
190 | "|Erratum. Validati...|[Sosenko JM, Skyl...|Diabetes Care. 20...| |type+1+diabetes|\n",
191 | "|Macrovascular dis...|[Bjornstad P, Don...|Lancet Diabetes E...|Cardiovascular di...|type+1+diabetes|\n",
192 | "|Insufficient evid...|[Brignardello-Pet...|J Am Dent Assoc. ...| |type+1+diabetes|\n",
193 | "|Genetic risk scor...|[Thomas NJ, Jones...|Lancet Diabetes E...| |type+1+diabetes|\n",
194 | "|Genetic risk scor...|[Leslie RD, Lernm...|Lancet Diabetes E...| |type+1+diabetes|\n",
195 | "|Association betwe...|[Ahola AJ, Forsbl...|Diabetes Res Clin...|AIMS: Depressive ...|type+1+diabetes|\n",
196 | "|Alpha-1 antitryps...|[Weir GC, Ehlers ...|Pediatr Diabetes....|OBJECTIVE: To det...|type+1+diabetes|\n",
197 | "|Considering Cultu...|[Rose M, Aronow L...|Curr Diab Rep. 20...|PURPOSE OF REVIEW...|type+1+diabetes|\n",
198 | "|Improved Murine-M...|[Racine JJ, Stewa...|Diabetes. 2018 Fe...|Improved mouse mo...|type+1+diabetes|\n",
199 | "|ANNALS EXPRESS: C...|[Shimizu I, Hiram...|Ann Clin Biochem....|BackgroundTo clar...|type+1+diabetes|\n",
200 | "|Prevalence of ear...|[Adar A, Shalitin...|Diabetes Metab Re...|BACKGROUND: The i...|type+1+diabetes|\n",
201 | "|The Effect of Aro...|[Jeon YD, Kang SH...|J Med Food. 2018 ...|The number of dia...|type+1+diabetes|\n",
202 | "|Human Subcutaneou...|[Rigla M, Pons B,...|Diabetes Technol ...|BACKGROUND: Subcu...|type+1+diabetes|\n",
203 | "|Evaluation of Pan...|[Naganawa M, Lim ...|Mol Imaging Biol....|PURPOSE: Previous...|type+1+diabetes|\n",
204 | "|BMX-001, a novel ...|[Bruni A, Pepper ...|Am J Transplant. ...|Islet transplanta...|type+1+diabetes|\n",
205 | "|Continuous Glucos...|[Wood A, O'Neal D...|Intern Med J. 201...|The advent of dev...|type+1+diabetes|\n",
206 | "|Factitious hypogl...|[Bauman V, Sturke...|Pediatr Diabetes....|BACKGROUND: Facti...|type+1+diabetes|\n",
207 | "|Intensive remote ...|[Gandrud L, Altan...|Pediatr Diabetes....|OBJECTIVE: While ...|type+1+diabetes|\n",
208 | "+--------------------+--------------------+--------------------+--------------------+---------------+\n",
209 | "only showing top 20 rows\n",
210 | "\n"
211 | ]
212 | }
213 | ],
214 | "source": [
215 | "texts.show()"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 9,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "non_empty_texts = texts.where('Abstract != \"\"') \\\n",
225 | " .withColumn('id', pyspark.sql.functions.monotonically_increasing_id())"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 10,
231 | "metadata": {},
232 | "outputs": [
233 | {
234 | "data": {
235 | "text/plain": [
236 | "2180"
237 | ]
238 | },
239 | "execution_count": 10,
240 | "metadata": {},
241 | "output_type": "execute_result"
242 | }
243 | ],
244 | "source": [
245 | "non_empty_texts.count()"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 11,
251 | "metadata": {},
252 | "outputs": [
253 | {
254 | "name": "stdout",
255 | "output_type": "stream",
256 | "text": [
257 | "+--------------------+--------------------+--------------------+--------------------+---------------+---+\n",
258 | "| Title| Authors| Source| Abstract| Topic| id|\n",
259 | "+--------------------+--------------------+--------------------+--------------------+---------------+---+\n",
260 | "|A luciferase immu...|[Ling Y, Jiang P,...|Clin Biochem. 201...|AIM: Luciferase i...|type+1+diabetes| 0|\n",
261 | "|Type 1 diabetes m...|[Singh RM, Howart...|Mol Cell Biochem....|There is much evi...|type+1+diabetes| 1|\n",
262 | "|Macrovascular dis...|[Bjornstad P, Don...|Lancet Diabetes E...|Cardiovascular di...|type+1+diabetes| 2|\n",
263 | "|Association betwe...|[Ahola AJ, Forsbl...|Diabetes Res Clin...|AIMS: Depressive ...|type+1+diabetes| 3|\n",
264 | "|Alpha-1 antitryps...|[Weir GC, Ehlers ...|Pediatr Diabetes....|OBJECTIVE: To det...|type+1+diabetes| 4|\n",
265 | "|Considering Cultu...|[Rose M, Aronow L...|Curr Diab Rep. 20...|PURPOSE OF REVIEW...|type+1+diabetes| 5|\n",
266 | "|Improved Murine-M...|[Racine JJ, Stewa...|Diabetes. 2018 Fe...|Improved mouse mo...|type+1+diabetes| 6|\n",
267 | "|ANNALS EXPRESS: C...|[Shimizu I, Hiram...|Ann Clin Biochem....|BackgroundTo clar...|type+1+diabetes| 7|\n",
268 | "|Prevalence of ear...|[Adar A, Shalitin...|Diabetes Metab Re...|BACKGROUND: The i...|type+1+diabetes| 8|\n",
269 | "|The Effect of Aro...|[Jeon YD, Kang SH...|J Med Food. 2018 ...|The number of dia...|type+1+diabetes| 9|\n",
270 | "|Human Subcutaneou...|[Rigla M, Pons B,...|Diabetes Technol ...|BACKGROUND: Subcu...|type+1+diabetes| 10|\n",
271 | "|Evaluation of Pan...|[Naganawa M, Lim ...|Mol Imaging Biol....|PURPOSE: Previous...|type+1+diabetes| 11|\n",
272 | "|BMX-001, a novel ...|[Bruni A, Pepper ...|Am J Transplant. ...|Islet transplanta...|type+1+diabetes| 12|\n",
273 | "|Continuous Glucos...|[Wood A, O'Neal D...|Intern Med J. 201...|The advent of dev...|type+1+diabetes| 13|\n",
274 | "|Factitious hypogl...|[Bauman V, Sturke...|Pediatr Diabetes....|BACKGROUND: Facti...|type+1+diabetes| 14|\n",
275 | "|Intensive remote ...|[Gandrud L, Altan...|Pediatr Diabetes....|OBJECTIVE: While ...|type+1+diabetes| 15|\n",
276 | "|Lysosomal Exoglyc...|[Maciejczyk M, Ko...|J Diabetes Res. 2...|Before this study...|type+1+diabetes| 16|\n",
277 | "|MHC-mismatched mi...|[Zhang M, Racine ...|Proc Natl Acad Sc...|Autoimmune type 1...|type+1+diabetes| 17|\n",
278 | "|Characteristics a...|[Ritsinger V, Her...|Diabetes Care. 20...|OBJECTIVE: To des...|type+1+diabetes| 18|\n",
279 | "|Parenting Stress ...|[Limbers CA, Teas...|Fam Community Hea...|The purpose of th...|type+1+diabetes| 19|\n",
280 | "+--------------------+--------------------+--------------------+--------------------+---------------+---+\n",
281 | "only showing top 20 rows\n",
282 | "\n"
283 | ]
284 | }
285 | ],
286 | "source": [
287 | "non_empty_texts.show()"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 12,
293 | "metadata": {},
294 | "outputs": [],
295 | "source": [
296 | "label_indexer = spark_ft.StringIndexer(inputCol='Topic', outputCol='label')"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 13,
302 | "metadata": {},
303 | "outputs": [],
304 | "source": [
305 | "label_indexer_model = label_indexer.fit(non_empty_texts)"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": 14,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "label_deindexer = spark_ft.IndexToString(inputCol='prediction', outputCol='pred_label', \n",
315 | " labels=label_indexer_model.labels)"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 15,
321 | "metadata": {},
322 | "outputs": [],
323 | "source": [
324 | "train, test = label_indexer_model.transform(non_empty_texts).randomSplit(weights=[0.8, 0.2], seed=123)"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 16,
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "abstract_assembler = DocumentAssembler() \\\n",
334 | " .setInputCol(\"Abstract\") \\\n",
335 | " .setOutputCol(\"document\")\n",
336 | " \n",
337 | "title_assembler = DocumentAssembler() \\\n",
338 | " .setInputCol(\"Title\") \\\n",
339 | " .setOutputCol(\"document\")"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 17,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "sentence_detector = SentenceDetector() \\\n",
349 | " .setInputCols([\"document\"]) \\\n",
350 | " .setOutputCol(\"sentence\") \\\n",
351 | " .setUseAbbreviations(True)\n",
352 | " \n",
353 | "tokenizer = Tokenizer() \\\n",
354 | " .setInputCols([\"sentence\"]) \\\n",
355 | " .setOutputCol(\"token\")\n",
356 | "\n",
357 | "stemmer = Stemmer() \\\n",
358 | " .setInputCols([\"token\"]) \\\n",
359 | " .setOutputCol(\"stem\")\n",
360 | " \n",
361 | "normalizer = Normalizer() \\\n",
362 | " .setInputCols([\"stem\"]) \\\n",
363 | " .setOutputCol(\"normalized\")\n",
364 | "\n",
365 | "nlp_pipeline = Pipeline(stages=[sentence_detector, tokenizer, stemmer, normalizer])"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 18,
371 | "metadata": {},
372 | "outputs": [],
373 | "source": [
374 | "abstract_finisher = Finisher() \\\n",
375 | " .setInputCols([\"normalized\"]) \\\n",
376 | " .setOutputCols([\"ntokens\"]) \\\n",
377 | " .setOutputAsArray(True) \\\n",
378 | " .setCleanAnnotations(True)\n",
379 | " \n",
380 | "title_finisher = Finisher() \\\n",
381 | " .setInputCols([\"normalized\"]) \\\n",
382 | " .setOutputCols([\"title\"]) \\\n",
383 | " .setOutputAsArray(True) \\\n",
384 | " .setCleanAnnotations(True)"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 19,
390 | "metadata": {},
391 | "outputs": [],
392 | "source": [
393 | "stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')\n",
394 | "sw_remover = spark_ft.StopWordsRemover(inputCol='ntokens', outputCol='text', stopWords=stopWords)"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 20,
400 | "metadata": {},
401 | "outputs": [],
402 | "source": [
403 | "abstract_pipeline = Pipeline(stages=[abstract_assembler, nlp_pipeline, abstract_finisher, sw_remover])\n",
404 | "\n",
405 | "title_pipeline = Pipeline(stages=[title_assembler, nlp_pipeline, title_finisher])\n",
406 | "\n",
407 | "preproc_pipeline = Pipeline(stages=[abstract_pipeline, title_pipeline])"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 21,
413 | "metadata": {},
414 | "outputs": [],
415 | "source": [
416 | "preproc_model = preproc_pipeline.fit(train)\n",
417 | "processed = preproc_model.transform(train).select('id', 'topic', 'title', 'text', 'label')"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": 22,
423 | "metadata": {},
424 | "outputs": [
425 | {
426 | "name": "stdout",
427 | "output_type": "stream",
428 | "text": [
429 | "+---+---------------+--------------------+--------------------+-----+\n",
430 | "| id| topic| title| text|label|\n",
431 | "+---+---------------+--------------------+--------------------+-----+\n",
432 | "| 0|type+1+diabetes|[a, luciferas, im...|[aim, luciferas, ...| 0.0|\n",
433 | "|108|type+1+diabetes|[a, typ, diabet, ...|[aimshypothesi, i...| 0.0|\n",
434 | "| 7|type+1+diabetes|[annal, express, ...|[backgroundto, cl...| 0.0|\n",
435 | "| 66|type+1+diabetes|[adipos, impact, ...|[object, central,...| 0.0|\n",
436 | "| 94|type+1+diabetes|[administr, of, v...|[object, two, cas...| 0.0|\n",
437 | "|122|type+1+diabetes|[alpha, cell, dys...|[typ, diabet, cha...| 0.0|\n",
438 | "| 77|type+1+diabetes|[an, effect, trea...|[nanotechnologi, ...| 0.0|\n",
439 | "| 91|type+1+diabetes|[analysi, of, pan...|[background, decr...| 0.0|\n",
440 | "| 63|type+1+diabetes|[assess, the, nut...|[object, lowcarbo...| 0.0|\n",
441 | "| 61|type+1+diabetes|[associ, between,...|[aim, investig, a...| 0.0|\n",
442 | "| 3|type+1+diabetes|[associ, between,...|[aim, depress, mo...| 0.0|\n",
443 | "|137|type+1+diabetes|[automat, detect,...|[background, auto...| 0.0|\n",
444 | "| 12|type+1+diabetes|[bmx, a, novel, r...|[islet, transplan...| 0.0|\n",
445 | "|143|type+1+diabetes|[basal, subnuclea...|[hypothermia, dia...| 0.0|\n",
446 | "|105|type+1+diabetes|[beta, cell, func...|[background, aim,...| 0.0|\n",
447 | "| 51|type+1+diabetes|[beta, cell, extr...|[aimshypothesi, i...| 0.0|\n",
448 | "|116|type+1+diabetes|[bodi, mass, inde...|[object, object, ...| 0.0|\n",
449 | "|141|type+1+diabetes|[cd, +, t, helper...|[autoreact, cd, +...| 0.0|\n",
450 | "| 18|type+1+diabetes|[characterist, an...|[object, describ,...| 0.0|\n",
451 | "| 36|type+1+diabetes|[circul, mirna, p...|[investig, plasma...| 0.0|\n",
452 | "+---+---------------+--------------------+--------------------+-----+\n",
453 | "only showing top 20 rows\n",
454 | "\n"
455 | ]
456 | }
457 | ],
458 | "source": [
459 | "processed.show()"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 23,
465 | "metadata": {},
466 | "outputs": [],
467 | "source": [
468 | "text2vec = spark_ft.Word2Vec(\n",
469 | " vectorSize=100, minCount=5, seed=123, \n",
470 | " inputCol='text', outputCol='text_vec', \n",
471 | " windowSize=5, maxSentenceLength=30\n",
472 | ")\n",
473 | "\n",
474 | "title2vec = spark_ft.Word2Vec(\n",
475 | " vectorSize=50, minCount=3, seed=123, \n",
476 | " inputCol='title', outputCol='title_vec', \n",
477 | " windowSize=5, maxSentenceLength=10\n",
478 | ")\n",
479 | "\n",
480 | "assembler = spark_ft.VectorAssembler(inputCols=['text_vec', 'title_vec'], outputCol='features')\n",
481 | "\n",
482 | "feature_pipeline = Pipeline(stages=[text2vec, title2vec, assembler])"
483 | ]
484 | },
485 | {
486 | "cell_type": "code",
487 | "execution_count": 24,
488 | "metadata": {},
489 | "outputs": [],
490 | "source": [
491 | "feature_model = feature_pipeline.fit(processed)\n",
492 | "features = feature_model.transform(processed)"
493 | ]
494 | },
495 | {
496 | "cell_type": "code",
497 | "execution_count": 25,
498 | "metadata": {},
499 | "outputs": [
500 | {
501 | "name": "stdout",
502 | "output_type": "stream",
503 | "text": [
504 | "+---+---------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+\n",
505 | "| id| topic| title| text|label| text_vec| title_vec| features|\n",
506 | "+---+---------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+\n",
507 | "| 0|type+1+diabetes|[a, luciferas, im...|[aim, luciferas, ...| 0.0|[0.00135010731583...|[-0.0093724036589...|[0.00135010731583...|\n",
508 | "|108|type+1+diabetes|[a, typ, diabet, ...|[aimshypothesi, i...| 0.0|[-0.0172051572149...|[-0.0485772364307...|[-0.0172051572149...|\n",
509 | "| 7|type+1+diabetes|[annal, express, ...|[backgroundto, cl...| 0.0|[0.03553665489388...|[-0.0297967683523...|[0.03553665489388...|\n",
510 | "| 66|type+1+diabetes|[adipos, impact, ...|[object, central,...| 0.0|[0.01813366204482...|[-0.0657454569635...|[0.01813366204482...|\n",
511 | "| 94|type+1+diabetes|[administr, of, v...|[object, two, cas...| 0.0|[0.00166694424089...|[-0.0769326825315...|[0.00166694424089...|\n",
512 | "|122|type+1+diabetes|[alpha, cell, dys...|[typ, diabet, cha...| 0.0|[-0.0136437712689...|[-0.0979562591140...|[-0.0136437712689...|\n",
513 | "| 77|type+1+diabetes|[an, effect, trea...|[nanotechnologi, ...| 0.0|[0.00717100351251...|[-0.0362102882388...|[0.00717100351251...|\n",
514 | "| 91|type+1+diabetes|[analysi, of, pan...|[background, decr...| 0.0|[0.03914943991343...|[-0.0596927608285...|[0.03914943991343...|\n",
515 | "| 63|type+1+diabetes|[assess, the, nut...|[object, lowcarbo...| 0.0|[-0.0071556649848...|[-0.0019804228601...|[-0.0071556649848...|\n",
516 | "| 61|type+1+diabetes|[associ, between,...|[aim, investig, a...| 0.0|[0.04105319693723...|[-0.0671359592815...|[0.04105319693723...|\n",
517 | "| 3|type+1+diabetes|[associ, between,...|[aim, depress, mo...| 0.0|[0.00475204453800...|[-0.0542618287727...|[0.00475204453800...|\n",
518 | "|137|type+1+diabetes|[automat, detect,...|[background, auto...| 0.0|[-0.0111896677184...|[-0.0113201303950...|[-0.0111896677184...|\n",
519 | "| 12|type+1+diabetes|[bmx, a, novel, r...|[islet, transplan...| 0.0|[0.00678624764814...|[-0.0057002936489...|[0.00678624764814...|\n",
520 | "|143|type+1+diabetes|[basal, subnuclea...|[hypothermia, dia...| 0.0|[-0.0133420593804...|[-0.0176167050696...|[-0.0133420593804...|\n",
521 | "|105|type+1+diabetes|[beta, cell, func...|[background, aim,...| 0.0|[0.05711436714045...|[-0.0244967469901...|[0.05711436714045...|\n",
522 | "| 51|type+1+diabetes|[beta, cell, extr...|[aimshypothesi, i...| 0.0|[-0.0042026805076...|[-0.0420467860364...|[-0.0042026805076...|\n",
523 | "|116|type+1+diabetes|[bodi, mass, inde...|[object, object, ...| 0.0|[0.01275832407116...|[-0.0246792081130...|[0.01275832407116...|\n",
524 | "|141|type+1+diabetes|[cd, +, t, helper...|[autoreact, cd, +...| 0.0|[-0.0025502139021...|[-0.0012579757429...|[-0.0025502139021...|\n",
525 | "| 18|type+1+diabetes|[characterist, an...|[object, describ,...| 0.0|[0.05452627044602...|[-0.0370577549465...|[0.05452627044602...|\n",
526 | "| 36|type+1+diabetes|[circul, mirna, p...|[investig, plasma...| 0.0|[-0.0033266883919...|[-0.0668783794778...|[-0.0033266883919...|\n",
527 | "+---+---------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+\n",
528 | "only showing top 20 rows\n",
529 | "\n"
530 | ]
531 | }
532 | ],
533 | "source": [
534 | "features.show()"
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 26,
540 | "metadata": {},
541 | "outputs": [],
542 | "source": [
543 | "text2vec_model = text2vec.fit(processed)"
544 | ]
545 | },
546 | {
547 | "cell_type": "code",
548 | "execution_count": 27,
549 | "metadata": {},
550 | "outputs": [
551 | {
552 | "name": "stdout",
553 | "output_type": "stream",
554 | "text": [
555 | "+-----------+------------------+\n",
556 | "| word| similarity|\n",
557 | "+-----------+------------------+\n",
558 | "| girl|0.8909512758255005|\n",
559 | "| leukemia|0.8899913430213928|\n",
560 | "| foot|0.8795618414878845|\n",
561 | "| dr|0.8758804798126221|\n",
562 | "|recentonset|0.8757311701774597|\n",
563 | "|osteoporosi|0.8751940131187439|\n",
564 | "| gestat|0.8743801116943359|\n",
565 | "| neuropathi|0.8684095144271851|\n",
566 | "| cgl|0.8683326840400696|\n",
567 | "| modi|0.8677959442138672|\n",
568 | "+-----------+------------------+\n",
569 | "\n"
570 | ]
571 | }
572 | ],
573 | "source": [
574 | "text2vec_model.findSynonyms('obes', 10).show()"
575 | ]
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": 28,
580 | "metadata": {},
581 | "outputs": [
582 | {
583 | "name": "stdout",
584 | "output_type": "stream",
585 | "text": [
586 | "+------------+------------------+\n",
587 | "| word| similarity|\n",
588 | "+------------+------------------+\n",
589 | "| combat| 0.904567301273346|\n",
590 | "| experi|0.8929694890975952|\n",
591 | "| postdeploy|0.8746237754821777|\n",
592 | "| catastroph| 0.864286482334137|\n",
593 | "|relationship|0.8641922473907471|\n",
594 | "| tbi|0.8591167330741882|\n",
595 | "| sud|0.8578217029571533|\n",
596 | "| symptom| 0.85660320520401|\n",
597 | "| ptss|0.8558170199394226|\n",
598 | "| buffer|0.8549688458442688|\n",
599 | "+------------+------------------+\n",
600 | "\n"
601 | ]
602 | }
603 | ],
604 | "source": [
605 | "text2vec_model.findSynonyms('trauma', 10).show()"
606 | ]
607 | },
608 | {
609 | "cell_type": "code",
610 | "execution_count": 29,
611 | "metadata": {},
612 | "outputs": [],
613 | "source": [
614 | "mlpc = spark_cls.MultilayerPerceptronClassifier(\n",
615 | " maxIter=100, seed=123, layers=[150, 75, 4]\n",
616 | ")\n",
617 | "\n",
618 | "model_pipeline = Pipeline(stages=[mlpc, label_deindexer])"
619 | ]
620 | },
621 | {
622 | "cell_type": "code",
623 | "execution_count": 30,
624 | "metadata": {},
625 | "outputs": [],
626 | "source": [
627 | "model = model_pipeline.fit(features)"
628 | ]
629 | },
630 | {
631 | "cell_type": "code",
632 | "execution_count": 31,
633 | "metadata": {},
634 | "outputs": [],
635 | "source": [
636 | "test_processed = preproc_model.transform(test).select('id', 'topic', 'title', 'text', 'label')\n",
637 | "\n",
638 | "test_features = feature_model.transform(test_processed)\n",
639 | "\n",
640 | "preds = model.transform(test_features)"
641 | ]
642 | },
643 | {
644 | "cell_type": "code",
645 | "execution_count": 32,
646 | "metadata": {},
647 | "outputs": [
648 | {
649 | "name": "stdout",
650 | "output_type": "stream",
651 | "text": [
652 | "+---+---------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+----------+---------------+\n",
653 | "| id| topic| title| text|label| text_vec| title_vec| features|prediction| pred_label|\n",
654 | "+---+---------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+----------+---------------+\n",
655 | "|115|type+1+diabetes|[a, physic, activ...|[background, thi,...| 0.0|[-0.0181661611564...|[-0.0456972203346...|[-0.0181661611564...| 0.0|type+1+diabetes|\n",
656 | "| 47|type+1+diabetes|[a, plateau, in, ...|[object, describ,...| 0.0|[0.02216673717203...|[-0.0611507280264...|[0.02216673717203...| 0.0|type+1+diabetes|\n",
657 | "| 73|type+1+diabetes|[a, quarter, of, ...|[individu, typ, d...| 0.0|[0.01657626631090...|[-0.0464318374563...|[0.01657626631090...| 0.0|type+1+diabetes|\n",
658 | "| 69|type+1+diabetes|[allcaus, mortal,...|[object, estim, a...| 0.0|[0.03344887619467...|[-0.0427703198716...|[0.03344887619467...| 0.0|type+1+diabetes|\n",
659 | "| 4|type+1+diabetes|[alpha, antitryps...|[object, determin...| 0.0|[0.00590007096664...|[-0.0369152471110...|[0.00590007096664...| 0.0|type+1+diabetes|\n",
660 | "|147|type+1+diabetes|[associ, between,...|[aim, advanc, gly...| 0.0|[0.02942380872589...|[-0.0447189292826...|[0.02942380872589...| 0.0|type+1+diabetes|\n",
661 | "|127|type+1+diabetes|[cish, promot, po...|[background, impa...| 0.0|[0.02050852430258...|[-0.0283943034716...|[0.02050852430258...| 0.0|type+1+diabetes|\n",
662 | "| 49|type+1+diabetes|[clinic, profil, ...|[background, diab...| 0.0|[-0.0015870097104...|[-0.0781120069324...|[-0.0015870097104...| 0.0|type+1+diabetes|\n",
663 | "| 87|type+1+diabetes|[concert, redox, ...|[diabet, cardiome...| 0.0|[-0.0198416023941...|[-0.0224614477949...|[-0.0198416023941...| 0.0|type+1+diabetes|\n",
664 | "| 90|type+1+diabetes|[construct, of, e...|[pancreat, islet,...| 0.0|[-0.0284984559506...|[-0.0215841557714...|[-0.0284984559506...| 0.0|type+1+diabetes|\n",
665 | "|114|type+1+diabetes|[differenti, meth...|[diabet, mellitu,...| 0.0|[-0.0276550344460...|[0.01172700314054...|[-0.0276550344460...| 0.0|type+1+diabetes|\n",
666 | "| 60|type+1+diabetes|[effect, of, resi...|[background, bodi...| 0.0|[0.02043139191001...|[-0.0462926091548...|[0.02043139191001...| 0.0|type+1+diabetes|\n",
667 | "|107|type+1+diabetes|[evalu, of, vitam...|[background, vita...| 0.0|[0.05376429138559...|[-0.0255385008973...|[0.05376429138559...| 2.0| heart+disease|\n",
668 | "| 89|type+1+diabetes|[food, insecur, i...|[aim, household, ...| 0.0|[0.01205506821111...|[-0.0370869886299...|[0.01205506821111...| 0.0|type+1+diabetes|\n",
669 | "| 37|type+1+diabetes|[genet, of, typ, ...|[typ, diabet, com...| 0.0|[-0.0394111465742...|[-0.1558791380375...|[-0.0394111465742...| 0.0|type+1+diabetes|\n",
670 | "|152|type+1+diabetes|[hemoglobin, ac, ...|[typ, diabet, td,...| 0.0|[0.03471668307523...|[-0.0538453668954...|[0.03471668307523...| 0.0|type+1+diabetes|\n",
671 | "| 74|type+1+diabetes|[ildrfc, i, a, no...|[ildr, member, ig...| 0.0|[-0.0393003275928...|[-0.0267670435963...|[-0.0393003275928...| 0.0|type+1+diabetes|\n",
672 | "| 59|type+1+diabetes|[impact, of, typ,...|[background, neur...| 0.0|[-0.0047581135744...|[-0.0638369887601...|[-0.0047581135744...| 0.0|type+1+diabetes|\n",
673 | "|109|type+1+diabetes|[impair, hypoglyc...|[hypoglycaemia, r...| 0.0|[-0.0340725541569...|[-0.0500555422157...|[-0.0340725541569...| 0.0|type+1+diabetes|\n",
674 | "| 40|type+1+diabetes|[influenc, of, hy...|[background, aim,...| 0.0|[0.04033954996631...|[-0.0641401749337...|[0.04033954996631...| 0.0|type+1+diabetes|\n",
675 | "+---+---------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+----------+---------------+\n",
676 | "only showing top 20 rows\n",
677 | "\n"
678 | ]
679 | }
680 | ],
681 | "source": [
682 | "preds.show()"
683 | ]
684 | },
685 | {
686 | "cell_type": "code",
687 | "execution_count": 33,
688 | "metadata": {},
689 | "outputs": [],
690 | "source": [
691 | "pred_df = preds.select('title', 'text', 'label', 'prediction').toPandas()"
692 | ]
693 | },
694 | {
695 | "cell_type": "code",
696 | "execution_count": 34,
697 | "metadata": {},
698 | "outputs": [
699 | {
700 | "data": {
701 | "text/html": [
702 | "
\n",
703 | "
\n",
704 | " \n",
705 | " \n",
706 | " | \n",
707 | " title | \n",
708 | " text | \n",
709 | " label | \n",
710 | " prediction | \n",
711 | "
\n",
712 | " \n",
713 | " \n",
714 | " \n",
715 | " 0 | \n",
716 | " [a, physic, activ, intervent, for, children, w... | \n",
717 | " [background, thi, studi, describ, develop, fea... | \n",
718 | " 0.0 | \n",
719 | " 0.0 | \n",
720 | "
\n",
721 | " \n",
722 | " 1 | \n",
723 | " [a, plateau, in, new, onset, typ, diabet, inci... | \n",
724 | " [object, describ, incid, preval, typ, diabet, ... | \n",
725 | " 0.0 | \n",
726 | " 0.0 | \n",
727 | "
\n",
728 | " \n",
729 | " 2 | \n",
730 | " [a, quarter, of, patient, with, typ, diabet, h... | \n",
731 | " [individu, typ, diabet, td, ar, increas, risk,... | \n",
732 | " 0.0 | \n",
733 | " 0.0 | \n",
734 | "
\n",
735 | " \n",
736 | " 3 | \n",
737 | " [allcaus, mortal, in, adult, with, and, withou... | \n",
738 | " [object, estim, agespecif, sexspecif, allcaus,... | \n",
739 | " 0.0 | \n",
740 | " 0.0 | \n",
741 | "
\n",
742 | " \n",
743 | " 4 | \n",
744 | " [alpha, antitrypsin, treatment, of, newonset, ... | \n",
745 | " [object, determin, safeti, pharmacokinet, alph... | \n",
746 | " 0.0 | \n",
747 | " 0.0 | \n",
748 | "
\n",
749 | " \n",
750 | "
\n",
751 | "
"
752 | ],
753 | "text/plain": [
754 | " title \\\n",
755 | "0 [a, physic, activ, intervent, for, children, w... \n",
756 | "1 [a, plateau, in, new, onset, typ, diabet, inci... \n",
757 | "2 [a, quarter, of, patient, with, typ, diabet, h... \n",
758 | "3 [allcaus, mortal, in, adult, with, and, withou... \n",
759 | "4 [alpha, antitrypsin, treatment, of, newonset, ... \n",
760 | "\n",
761 | " text label prediction \n",
762 | "0 [background, thi, studi, describ, develop, fea... 0.0 0.0 \n",
763 | "1 [object, describ, incid, preval, typ, diabet, ... 0.0 0.0 \n",
764 | "2 [individu, typ, diabet, td, ar, increas, risk,... 0.0 0.0 \n",
765 | "3 [object, estim, agespecif, sexspecif, allcaus,... 0.0 0.0 \n",
766 | "4 [object, determin, safeti, pharmacokinet, alph... 0.0 0.0 "
767 | ]
768 | },
769 | "execution_count": 34,
770 | "metadata": {},
771 | "output_type": "execute_result"
772 | }
773 | ],
774 | "source": [
775 | "pred_df.head()"
776 | ]
777 | },
778 | {
779 | "cell_type": "code",
780 | "execution_count": 35,
781 | "metadata": {},
782 | "outputs": [
783 | {
784 | "data": {
785 | "text/plain": [
786 | "[(0, 'type+1+diabetes'),\n",
787 | " (1, 'post+traumatic+stress+disorder'),\n",
788 | " (2, 'heart+disease'),\n",
789 | " (3, 'creutzfeldt+jakob+disease')]"
790 | ]
791 | },
792 | "execution_count": 35,
793 | "metadata": {},
794 | "output_type": "execute_result"
795 | }
796 | ],
797 | "source": [
798 | "list(enumerate(label_indexer_model.labels))"
799 | ]
800 | },
801 | {
802 | "cell_type": "code",
803 | "execution_count": 36,
804 | "metadata": {},
805 | "outputs": [
806 | {
807 | "data": {
808 | "text/html": [
809 | "
\n",
810 | "
\n",
811 | " \n",
812 | " \n",
813 | " | \n",
814 | " pred type+1+diabetes | \n",
815 | " pred post+traumatic+stress+disorder | \n",
816 | " pred heart+disease | \n",
817 | " pred creutzfeldt+jakob+disease | \n",
818 | "
\n",
819 | " \n",
820 | " \n",
821 | " \n",
822 | " true type+1+diabetes | \n",
823 | " 115 | \n",
824 | " 0 | \n",
825 | " 9 | \n",
826 | " 4 | \n",
827 | "
\n",
828 | " \n",
829 | " true post+traumatic+stress+disorder | \n",
830 | " 1 | \n",
831 | " 98 | \n",
832 | " 5 | \n",
833 | " 0 | \n",
834 | "
\n",
835 | " \n",
836 | " true heart+disease | \n",
837 | " 9 | \n",
838 | " 1 | \n",
839 | " 63 | \n",
840 | " 1 | \n",
841 | "
\n",
842 | " \n",
843 | " true creutzfeldt+jakob+disease | \n",
844 | " 0 | \n",
845 | " 0 | \n",
846 | " 3 | \n",
847 | " 92 | \n",
848 | "
\n",
849 | " \n",
850 | "
\n",
851 | "
"
852 | ],
853 | "text/plain": [
854 | " pred type+1+diabetes \\\n",
855 | "true type+1+diabetes 115 \n",
856 | "true post+traumatic+stress+disorder 1 \n",
857 | "true heart+disease 9 \n",
858 | "true creutzfeldt+jakob+disease 0 \n",
859 | "\n",
860 | " pred post+traumatic+stress+disorder \\\n",
861 | "true type+1+diabetes 0 \n",
862 | "true post+traumatic+stress+disorder 98 \n",
863 | "true heart+disease 1 \n",
864 | "true creutzfeldt+jakob+disease 0 \n",
865 | "\n",
866 | " pred heart+disease \\\n",
867 | "true type+1+diabetes 9 \n",
868 | "true post+traumatic+stress+disorder 5 \n",
869 | "true heart+disease 63 \n",
870 | "true creutzfeldt+jakob+disease 3 \n",
871 | "\n",
872 | " pred creutzfeldt+jakob+disease \n",
873 | "true type+1+diabetes 4 \n",
874 | "true post+traumatic+stress+disorder 0 \n",
875 | "true heart+disease 1 \n",
876 | "true creutzfeldt+jakob+disease 92 "
877 | ]
878 | },
879 | "execution_count": 36,
880 | "metadata": {},
881 | "output_type": "execute_result"
882 | }
883 | ],
884 | "source": [
885 | "pd.DataFrame(\n",
886 | " data=skmetrics.confusion_matrix(pred_df['label'], pred_df['prediction']),\n",
887 | " columns=['pred ' + l for l in label_indexer_model.labels],\n",
888 | " index=['true ' + l for l in label_indexer_model.labels]\n",
889 | ")"
890 | ]
891 | },
892 | {
893 | "cell_type": "code",
894 | "execution_count": 37,
895 | "metadata": {},
896 | "outputs": [
897 | {
898 | "name": "stdout",
899 | "output_type": "stream",
900 | "text": [
901 | " precision recall f1-score support\n",
902 | "\n",
903 | " type+1+diabetes 0.92 0.90 0.91 128\n",
904 | "post+traumatic+stress+disorder 0.99 0.94 0.97 104\n",
905 | " heart+disease 0.79 0.85 0.82 74\n",
906 | " creutzfeldt+jakob+disease 0.95 0.97 0.96 95\n",
907 | "\n",
908 | " avg / total 0.92 0.92 0.92 401\n",
909 | "\n"
910 | ]
911 | }
912 | ],
913 | "source": [
914 | "print(skmetrics.classification_report(pred_df['label'], pred_df['prediction'], \n",
915 | " target_names=label_indexer_model.labels))"
916 | ]
917 | }
918 | ],
919 | "metadata": {
920 | "kernelspec": {
921 | "display_name": "Python 3",
922 | "language": "python",
923 | "name": "python3"
924 | },
925 | "language_info": {
926 | "codemirror_mode": {
927 | "name": "ipython",
928 | "version": 3
929 | },
930 | "file_extension": ".py",
931 | "mimetype": "text/x-python",
932 | "name": "python",
933 | "nbconvert_exporter": "python",
934 | "pygments_lexer": "ipython3",
935 | "version": "3.6.3"
936 | }
937 | },
938 | "nbformat": 4,
939 | "nbformat_minor": 2
940 | }
941 |
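The three fitted stages above (preproc_model, feature_model, and model) can be chained to classify previously unseen documents. A minimal sketch, assuming the Spark session and fitted models from this notebook are still in scope; the title and abstract are made up, and the Topic/label placeholders are included only because the same select() used during training expects those columns.

# Hypothetical scoring sketch (not part of the original notebook).
new_docs = spark.createDataFrame(
    [(0,
      'Screening for ischaemic heart disease in a population cohort',    # made-up title
      'We assessed coronary artery disease risk factors in adults ...',  # made-up abstract
      'unknown',  # Topic placeholder (not used at prediction time)
      0.0)],      # label placeholder, kept only so the select below succeeds
    ['id', 'Title', 'Abstract', 'Topic', 'label'])

scored = model.transform(
    feature_model.transform(
        preproc_model.transform(new_docs).select('id', 'topic', 'title', 'text', 'label')))
scored.select('id', 'pred_label').show(truncate=False)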
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM jupyter/pyspark-notebook
2 |
3 |
4 | USER root
5 |
6 | # RSpark config
7 | ENV R_LIBS_USER $SPARK_HOME/R/lib
8 |
9 | # R pre-requisites
10 | RUN apt-get update && \
11 | apt-get install -y --no-install-recommends \
12 | fonts-dejavu \
13 | gfortran \
14 | gcc && apt-get clean && \
15 | rm -rf /var/lib/apt/lists/*
16 |
17 | USER $NB_USER
18 |
19 | RUN conda config --add channels conda-forge
20 | # R packages and nlu libraries
21 | RUN conda install --quiet --yes \
22 | 'r-base=3.3.2' \
23 | 'r-irkernel=0.7*' \
24 | 'r-ggplot2=2.2*' \
25 | 'r-sparklyr=0.5*' \
26 | 'networkx=1.11' \
27 | 'biopython=1.70' \
28 | 'unidecode' \
29 | 'leveldb' \
30 | 'spacy' \
31 | 'tensorflow' \
32 | 'keras' \
33 | 'r-rcurl=1.95*' && conda clean -tipsy
34 | USER root
35 | #download spaCy's English language model
36 | RUN python -m spacy download en
37 |
38 | USER $NB_USER
39 |
40 | # Apache Toree kernel
41 | RUN pip --no-cache-dir install https://dist.apache.org/repos/dist/dev/incubator/toree/0.2.0/snapshots/dev1/toree-pip/toree-0.2.0.dev1.tar.gz
42 | RUN jupyter toree install --sys-prefix
43 |
44 | # Spylon-kernel
45 | RUN conda install --quiet --yes 'spylon-kernel=0.4*' && \
46 | conda clean -tipsy
47 | RUN python -m spylon_kernel install --sys-prefix
48 |
49 | RUN rm -r /home/$NB_USER/*
50 | ADD *.* /home/$NB_USER/
51 | ADD Solutions/* /home/$NB_USER/Solutions/
52 | ADD QuickUMLS /home/$NB_USER/QuickUMLS
53 |
54 | RUN bash setup_simstring.sh 3
55 |
56 | USER root
57 | RUN chown jovyan -R .
58 | RUN pip install leveldb
59 | # RUN pip install unidecode
60 | USER $NB_USER
61 | RUN cp -r simstring/ ~/QuickUMLS/
62 |
63 |
64 |
65 | USER root
66 |
67 | COPY pysparknlp-1.0.0.tar.gz /home/jovyan/
68 | COPY demo-data /home/jovyan/demo-data
69 | COPY strata-requirements.txt /home/jovyan/
70 | COPY strata_notebooks/*.ipynb /home/jovyan/
71 | RUN ls -l /home/jovyan
72 | RUN sudo chown -R jovyan:users /home/jovyan
73 | RUN ls -l /home/jovyan
74 |
75 | USER $NB_USER
76 |
77 | WORKDIR /home/jovyan/
78 |
79 | RUN pip install -r strata-requirements.txt
80 | RUN python -m nltk.downloader popular
81 |
82 | RUN tar -xzf pysparknlp-1.0.0.tar.gz
83 | #RUN cd demo-data/ && for f in *.tar.gz; do tar -xzf $f; done
84 |
85 |
86 |
87 | #RUN python3 ~/QuickUMLS/install.py ~/QuickUMLS/ ~/QuickUMLS/data
88 |
89 | # RUN rm ~QuickUMLS/*.RRF
90 |
91 | # docker tag cb9258ec4e02 melcutz/nlu
92 | # docker login --username=melcutz
93 | # docker push melcutz/nlu-demo
94 |
95 |
96 | # docker build -t nlu-demo:latest .
97 | # docker run -it --rm -p 8888:8888 nlu-demo
98 |
99 | # docker images
100 | # docker rmi --force imk
101 | # docker exec --user root -it bdff5651bbc8 bash
--------------------------------------------------------------------------------
/Installation instructions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melcutz/NLU_tutorial/5e44d66f1f8354221b23618e10fd0059843dbbb2/Installation instructions.pdf
--------------------------------------------------------------------------------
/NLU at Scale with spaCy and Spark NLP - Feb 2018.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melcutz/NLU_tutorial/5e44d66f1f8354221b23618e10fd0059843dbbb2/NLU at Scale with spaCy and Spark NLP - Feb 2018.pptx
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NLU_tutorial
2 | These are the notebooks covered during the tutorial.
3 | To run these notebooks, follow the provided installation instructions to set up a local copy of the referenced Docker container
4 | and open the notebooks from inside the Docker image.
5 |
--------------------------------------------------------------------------------