├── Chapter01 ├── butterfly.svg ├── displacy-ent-basic-1.4.2.py ├── displacy-ents-colored-1.4.6.py ├── displacy-generate-html-1.4.4.py ├── displacy-jupyter-render-1.4.3.ipynb ├── displacy-save-as-image-1-4-5.py ├── displacy-styled-colored-1.4.5.py ├── displacy-within-python-1.4.1.py ├── first-spacy-code-1.3.2.py ├── install-language-models-1.3.1.sh ├── install-spacy-1.2.1.sh └── update-spacy-1.2.2.sh ├── Chapter02 ├── container-objects-doc-2.4.1.py ├── container-objects-span-2.4.3.py ├── container-objects-token-2.4.2.py ├── lemmatization-2.3.1.py ├── more-spacy-features-2.5.1.py ├── sentence-segment-2.2.4.py ├── tokenization-basic-2.2.1.py ├── tokenization-customization-2.2.3.py └── tokenization-puncted-2.2.2.py ├── Chapter03 ├── dependency-3.2.1.py ├── merging-splitting-merge-3-4-1.py ├── merging-splitting-split-3-4-2.py ├── ner-3.3.1.py ├── pos-tagging-basics-3.1.1.py ├── pos-tagging-fish-3.1.3.py ├── pos-tagging-num-sym-3.1.4.py └── pos-tagging-verb-3.1.2.py ├── Chapter04 ├── entity_ruler_basic.py ├── extract_IBAN-nums.py ├── match_two_greetings.py ├── matcher_basic.py ├── matcher_capitals.py ├── matcher_ops.py ├── matcher_regex_basic.py ├── matcher_short_strings.py └── phrase_matcher_basic.py ├── Chapter05 ├── .gitkeep ├── has_vec_basic.py ├── noun-chunsk.py ├── similarity_basic.py ├── wv_basic.py └── wv_visualize.py ├── Chapter06 ├── ATIS_dataset_exploration.ipynb ├── calculate_ent_frequencies.py ├── data │ ├── atis_intents.csv │ └── atis_utterances.txt ├── extract_abbrevs.py └── extract_loc_names.py ├── Chapter07 ├── data │ └── corona.json └── train_on_cord19.py ├── Chapter08 ├── Keras_train.ipynb ├── Train_Food_reviews.ipynb └── data │ └── Reviews.zip ├── Chapter09 ├── BERT_spam.ipynb ├── bert_vectors.py ├── data │ └── spam.csv ├── tokenizer_basic.py ├── tokenizer_encode.py ├── tokenizer_encode_plus.py ├── transformer_pipe.py └── transformer_pipe_que.py ├── Chapter10 ├── Intent-classifier-char-LSTM.ipynb ├── data │ ├── restaurants.json │ └── utterances.txt ├── extract_city_ents.py └── extract_date_times.py ├── LICENSE └── README.md /Chapter01/butterfly.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | I 4 | PRON 5 | 6 | 7 | 8 | 'm 9 | AUX 10 | 11 | 12 | 13 | a 14 | DET 15 | 16 | 17 | 18 | butterfly. 19 | NOUN 20 | 21 | 22 | 23 | 24 | 25 | nsubj 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | det 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | attr 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /Chapter01/displacy-ent-basic-1.4.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | from spacy import displacy 5 | nlp = spacy.load("en_core_web_md") 6 | doc = nlp("Bill Gates is the CEO of Microsoft.") 7 | displacy.serve(doc, style="ent") 8 | 9 | -------------------------------------------------------------------------------- /Chapter01/displacy-ents-colored-1.4.6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | from spacy import displacy 5 | nlp = spacy.load("en_core_web_md") 6 | 7 | sentence = "Sony was leading the consumer music devices sector not so long ago before he lost it to Apple. By birth of music platforms such SoundCloud and Spotify, Sony lost the music battle completely. Over the last quarter, Apple sold 20.000 iPods for a profit of $5 million. Whereas Sony was able to sell only 5.000 Walkman music players." 
8 | 9 | doc = nlp(sentence) 10 | 11 | colors = {"ORG": "linear-gradient(326deg, #a4508b, #5f0a87)", "PRODUCT": "radial-gradient(yellow, green)"} 12 | 13 | options = {"ents": ["ORG", "PRODUCT"], "colors": colors} 14 | displacy.serve(doc, style="ent", options=options) 15 | 16 | -------------------------------------------------------------------------------- /Chapter01/displacy-generate-html-1.4.4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | from spacy import displacy 5 | nlp = spacy.load("en_core_web_md") 6 | doc1 = nlp("I own a ginger cat.") 7 | doc2 = nlp("He is very pretty.") 8 | html = displacy.render([doc1, doc2], style="dep", page=True) 9 | print(html) 10 | 11 | -------------------------------------------------------------------------------- /Chapter01/displacy-jupyter-render-1.4.3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import spacy" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from spacy import displacy " 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 6, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "nlp = spacy.load(\"en_core_web_md\") " 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 8, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "doc= nlp(\"Bill Gates is the CEO of Microsoft.\") " 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 9, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/html": [ 47 | "\n", 48 | "\n", 49 | " Bill\n", 50 | " PROPN\n", 51 | "\n", 52 | "\n", 53 | "\n", 54 | " Gates\n", 55 | " PROPN\n", 56 | "\n", 57 | "\n", 58 | "\n", 59 | " is\n", 60 | " AUX\n", 61 | "\n", 62 | "\n", 63 | "\n", 64 | " the\n", 65 | " DET\n", 66 | "\n", 67 | "\n", 68 | "\n", 69 | " CEO\n", 70 | " NOUN\n", 71 | "\n", 72 | "\n", 73 | "\n", 74 | " of\n", 75 | " ADP\n", 76 | "\n", 77 | "\n", 78 | "\n", 79 | " Microsoft.\n", 80 | " PROPN\n", 81 | "\n", 82 | "\n", 83 | "\n", 84 | " \n", 85 | " \n", 86 | " compound\n", 87 | " \n", 88 | " \n", 89 | "\n", 90 | "\n", 91 | "\n", 92 | " \n", 93 | " \n", 94 | " nsubj\n", 95 | " \n", 96 | " \n", 97 | "\n", 98 | "\n", 99 | "\n", 100 | " \n", 101 | " \n", 102 | " det\n", 103 | " \n", 104 | " \n", 105 | "\n", 106 | "\n", 107 | "\n", 108 | " \n", 109 | " \n", 110 | " attr\n", 111 | " \n", 112 | " \n", 113 | "\n", 114 | "\n", 115 | "\n", 116 | " \n", 117 | " \n", 118 | " prep\n", 119 | " \n", 120 | " \n", 121 | "\n", 122 | "\n", 123 | "\n", 124 | " \n", 125 | " \n", 126 | " pobj\n", 127 | " \n", 128 | " \n", 129 | "\n", 130 | "" 131 | ], 132 | "text/plain": [ 133 | "" 134 | ] 135 | }, 136 | "metadata": {}, 137 | "output_type": "display_data" 138 | } 139 | ], 140 | "source": [ 141 | "displacy.render(doc, style=\"dep\") " 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Python 3", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": 
"text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.6.9" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 2 173 | } 174 | -------------------------------------------------------------------------------- /Chapter01/displacy-save-as-image-1-4-5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | from spacy import displacy 5 | from pathlib import Path 6 | nlp = spacy.load("en_core_web_md") 7 | 8 | doc = nlp("I'm a butterfly.") 9 | svg = displacy.render(doc, style="dep", jupyter=False) 10 | filename = "butterfly.svg" 11 | output_path = Path(filename) 12 | output_path.open("w", encoding="utf-8").write(svg) 13 | 14 | -------------------------------------------------------------------------------- /Chapter01/displacy-styled-colored-1.4.5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | from spacy import displacy 5 | nlp = spacy.load("en_core_web_md") 6 | 7 | doc = nlp("This is a sentence in compact mode with custom styles.") 8 | options = {"compact": True, "bg": "#09d5d4", "color": "orange", "font": "verdana"} 9 | displacy.serve(doc, style="dep", options=options) 10 | 11 | -------------------------------------------------------------------------------- /Chapter01/displacy-within-python-1.4.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | from spacy import displacy 5 | nlp = spacy.load("en_core_web_md") 6 | doc = nlp("I own a ginger cat.") 7 | displacy.serve(doc, style="dep") 8 | 9 | -------------------------------------------------------------------------------- /Chapter01/first-spacy-code-1.3.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | doc = nlp("I have a ginger cat") 6 | -------------------------------------------------------------------------------- /Chapter01/install-language-models-1.3.1.sh: -------------------------------------------------------------------------------- 1 | sudo python3 -m spacy download en 2 | -------------------------------------------------------------------------------- /Chapter01/install-spacy-1.2.1.sh: -------------------------------------------------------------------------------- 1 | sudo pip3 install spacy 2 | python3 -m spacy info 3 | -------------------------------------------------------------------------------- /Chapter01/update-spacy-1.2.2.sh: -------------------------------------------------------------------------------- 1 | sudo pip3 install -U spacy 2 | -------------------------------------------------------------------------------- /Chapter02/container-objects-doc-2.4.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | doc = nlp("I like cats.") 7 | print(doc.text) 8 | for token in doc: 9 | print(token.text) 10 | print(doc[1]) 11 | print(len(doc)) 12 | 13 | doc2 = nlp("This is a sentence. 
This is the second sentence.") 14 | sentences = list(doc2.sents) 15 | print(sentences) 16 | 17 | 18 | doc3 = nlp("I flied to New York with Ashley") 19 | print(doc3.ents) 20 | 21 | doc4 = nlp("Sweet brown fox jumped over the fence.") 22 | print(list(doc4.noun_chunks)) 23 | print(doc4.lang_) 24 | 25 | 26 | -------------------------------------------------------------------------------- /Chapter02/container-objects-span-2.4.3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | doc = nlp("I know that you have been to USA.") 7 | print(doc[2:4]) 8 | 9 | 10 | doc2 = nlp("President Trump visited Mexico City.") 11 | print(doc2[4:]) 12 | print(doc2[3:-1]) 13 | 14 | 15 | doc3 = nlp("You love Atlanta since you're 20.") 16 | print(doc3.char_span(4, 16)) 17 | 18 | 19 | doc4 = nlp("You went there after you saw me.") 20 | span = doc4[2:4] 21 | print(span) 22 | for token in span: 23 | print(token) 24 | print(len(span)) 25 | print(span.sent) 26 | print(span.doc) 27 | print(span.start) 28 | print(span.end) 29 | print(span.start_char) 30 | print(span.end_char) 31 | 32 | 33 | -------------------------------------------------------------------------------- /Chapter02/container-objects-token-2.4.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | doc = nlp("Hello Madam!") 7 | print(doc[0]) 8 | print(doc[0].text) 9 | print(doc[0].text_with_ws) 10 | print(doc[2].text_with_ws) 11 | print(len(doc[0])) 12 | 13 | token = doc[2] 14 | print(token.i) 15 | print(token.idx) 16 | print(token.doc) 17 | 18 | doc1 = nlp("He entered the room.
Then he nodded.") 19 | print(doc1[0].is_sent_start) 20 | print(doc1[5].is_sent_start) 21 | print(doc1[6].is_sent_start) 22 | 23 | doc2 = nlp("President Trump visited Mexico City.") 24 | print(doc2.ents) 25 | print(doc2[1].ent_type_) 26 | print(doc2[3].ent_type_) 27 | print(doc2[4].ent_type_) 28 | print(doc2[0].ent_type_) 29 | 30 | -------------------------------------------------------------------------------- /Chapter02/lemmatization-2.3.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | doc = nlp("I went for working and worked for 3 years.") 6 | for token in doc: 7 | print(token.text, token.lemma_) 8 | 9 | -------------------------------------------------------------------------------- /Chapter02/more-spacy-features-2.5.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import spacy 3 | nlp = spacy.load("en_core_web_md") 4 | doc = nlp("Hello, hi!") 5 | print(doc[0].lower_) 6 | doc1 = nlp("HELLO, Hello, hello, hEllO") 7 | print(doc1[0].is_upper) 8 | print(doc1[0].is_lower) 9 | print(doc1[1].is_upper) 10 | print(doc1[1].is_lower) 11 | 12 | 13 | doc2 = nlp("Cat and Cat123") 14 | print(doc2[0].is_alpha) 15 | print(doc2[2].is_alpha) 16 | 17 | 18 | doc3 = nlp("Hamburg and Göttingen") 19 | print(doc3[0].is_ascii) 20 | print(doc3[2].is_ascii) 21 | 22 | 23 | doc4 = nlp("Cat Cat123 123") 24 | print(doc4[0].is_digit) 25 | print(doc4[1].is_digit) 26 | print(doc4[2].is_digit) 27 | 28 | 29 | 30 | doc5 = nlp("You, him and Sally") 31 | print(doc5[1]) 32 | print(doc5[1].is_punct) 33 | 34 | 35 | doc6 = nlp(" ") 36 | print(doc6[0]) 37 | print(len(doc6[0])) 38 | print(doc6[0].is_space) 39 | 40 | doc7 = nlp("I paid 12$ for the t-shirt.") 41 | print(doc7[3]) 42 | print(doc7[3].is_currency) 43 | 44 | 45 | doc8 = nlp("I emailed you at least 100 times") 46 | print(doc8[-2]) 47 | print(doc8[-2].like_num) 48 | 49 | doc9 = nlp("I emailed you at least hundred times") 50 | print(doc9[-2]) 51 | print(doc9[-2].like_num) 52 | 53 | doc10 = nlp("My email is duygu@packt.com and you can visit me under https://duygua.github.io any time you want.") 54 | print(doc10[3]) 55 | print(doc10[3].like_email) 56 | print(doc10[10]) 57 | print(doc10[10].like_url) 58 | 59 | doc11 = nlp("Girl called Kathy has a nickname Cat123.") 60 | for token in doc11: 61 | print(token.text, token.shape_) 62 | 63 | -------------------------------------------------------------------------------- /Chapter02/sentence-segment-2.2.4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | text = "I flied to N.Y yesterday. It was around 5 pm."
7 | doc = nlp(text) 8 | 9 | for sent in doc.sents: 10 | print(sent.text) 11 | -------------------------------------------------------------------------------- /Chapter02/tokenization-basic-2.2.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | doc = nlp("I own a ginger cat") 6 | print([token.text for token in doc]) 7 | -------------------------------------------------------------------------------- /Chapter02/tokenization-customization-2.2.3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | from spacy.symbols import ORTH 5 | nlp = spacy.load("en_core_web_md") 6 | doc = nlp("lemme that") 7 | print([w.text for w in doc]) 8 | 9 | special_case = [{ORTH: "lem"}, {ORTH: "me"}] 10 | nlp.tokenizer.add_special_case("lemme", special_case) 11 | print([w.text for w in nlp("lemme that")]) 12 | -------------------------------------------------------------------------------- /Chapter02/tokenization-puncted-2.2.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | doc = nlp("It's been a crazy week!!!") 6 | print([token.text for token in doc]) 7 | -------------------------------------------------------------------------------- /Chapter03/dependency-3.2.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | doc = nlp("I counted white sheep.") 7 | for token in doc: 8 | print(token.text, token.pos_, token.dep_) 9 | 10 | for token in doc: 11 | print(token.text, token.tag_, token.dep_, token.head) 12 | -------------------------------------------------------------------------------- /Chapter03/merging-splitting-merge-3-4-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | 7 | doc = nlp("She lived in New Hampshire.") 8 | print(doc.ents) 9 | print([(token.text, token.i) for token in doc]) 10 | print(len(doc)) 11 | 12 | with doc.retokenize() as retokenizer: 13 | retokenizer.merge(doc[3:5], attrs={"LEMMA":"new hampshire"}) 14 | 15 | print(doc.ents) 16 | print([(token.text, token.i) for token in doc]) 17 | print(len(doc)) 18 | print([(token.lemma_) for token in doc]) 19 | 20 | -------------------------------------------------------------------------------- /Chapter03/merging-splitting-split-3-4-2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | 7 | doc = nlp("She lived in NewHampshire.") 8 | print(len(doc)) 9 | print([(token.text, token.lemma_, token.i) for token in doc]) 10 | for token in doc: 11 | print(token.text, token.pos_, token.tag_, token.dep_) 12 | 13 | with doc.retokenize() as retokenizer: 14 | heads = [(doc[3], 1), doc[2]] 15 | attrs = {"TAG":["NNP", "NNP"], "DEP":["compound", "pobj"]} 16 | retokenizer.split(doc[3], ["New", "Hampshire"], heads=heads, attrs=attrs) 17 | 18 | 19 | print(len(doc)) 20 | print([(token.text, token.lemma_, token.i) for token in doc]) 21 | for token in doc: 22 | print(token.text, token.pos_, token.tag_, token.dep_) 23 |
-------------------------------------------------------------------------------- /Chapter03/ner-3.3.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | doc = nlp("The president Donald Trump visited France.") 7 | print(doc.ents) 8 | print(type(doc.ents[1])) 9 | 10 | print(spacy.explain("ORG")) 11 | 12 | doc2 = nlp("He worked for NASA") 13 | token = doc2[3] 14 | print(token.ent_type_, spacy.explain(token.ent_type_)) 15 | 16 | 17 | doc3 = nlp("Albert Einstein was born in Ulm in 1879. He studied electrical engineering at ETH Zurich.") 18 | print(doc3.ents) 19 | 20 | for token in doc3: 21 | print(token.text, token.ent_type_, spacy.explain(token.ent_type_)) 22 | 23 | -------------------------------------------------------------------------------- /Chapter03/pos-tagging-basics-3.1.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | doc = nlp("Alicia and me went to the school by bus") 7 | for token in doc: 8 | print(token.text, token.pos_, token.tag_, spacy.explain(token.pos_), spacy.explain(token.tag_)) 9 | 10 | -------------------------------------------------------------------------------- /Chapter03/pos-tagging-fish-3.1.3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | doc = nlp("My cat will fish for a fish tomorrow in a fishy way.") 7 | for token in doc: 8 | print(token.text, token.pos_, token.tag_, spacy.explain(token.pos_), spacy.explain(token.tag_)) 9 | -------------------------------------------------------------------------------- /Chapter03/pos-tagging-num-sym-3.1.4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | doc = nlp("He earned $5.5 million in 2020 and paid %35 tax.") 7 | for token in doc: 8 | print(token.text, token.pos_, token.tag_, spacy.explain(token.pos_), spacy.explain(token.tag_)) 9 | 10 | 11 | -------------------------------------------------------------------------------- /Chapter03/pos-tagging-verb-3.1.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import spacy 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | doc = nlp("My friend will fly to New York fast and she is staying there for 3 days.") 7 | for token in doc: 8 | print(token.text, token.pos_, token.tag_, spacy.explain(token.pos_), spacy.explain(token.tag_)) 9 | 10 | -------------------------------------------------------------------------------- /Chapter04/entity_ruler_basic.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.pipeline import EntityRuler 3 | 4 | nlp = spacy.load("en_core_web_md") 5 | doc = nlp("I have an account with chime since 2017") 6 | print(doc.ents) 7 | 8 | patterns = [{"label": "ORG", "pattern": [{"LOWER": "chime"}]}] 9 | ruler = nlp.add_pipe("entity_ruler") 10 | ruler.add_patterns(patterns) 11 | doc2 = nlp("I have an account with chime since 2017") 12 | 13 | print(doc2.ents) 14 | print(doc2[5].ent_type_) 15 | 16 | -------------------------------------------------------------------------------- /Chapter04/extract_IBAN-nums.py:
-------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.matcher import Matcher 3 | 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | doc1 = nlp("You can call my office on +1 (221) 102-2423 or email me directly.") 7 | doc2 = nlp("You can call me on (221) 102 2423 or text me.") 8 | 9 | pattern = [{"TEXT": "+1", "OP": "?"}, {"TEXT": "("}, {"SHAPE": "ddd"}, {"TEXT": ")"}, {"SHAPE": "ddd"}, {"TEXT": "-", "OP": "?"}, {"SHAPE": "dddd"}] 10 | 11 | matcher = Matcher(nlp.vocab) 12 | matcher.add("usPhonNum", [pattern]) 13 | 14 | for mid, start, end in matcher(doc1): 15 | print(start, end, doc1[start:end]) 16 | -------------------------------------------------------------------------------- /Chapter04/match_two_greetings.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.matcher import Matcher 3 | 4 | nlp = spacy.load("en_core_web_md") 5 | doc = nlp("Good morning, I want to reserve a ticket. I will then say good evening!") 6 | matcher = Matcher(nlp.vocab) 7 | pattern = [{"LOWER": "good"}, {"LOWER": "morning"}, {"IS_PUNCT": True}] 8 | matcher.add("morningGreeting", [pattern]) 9 | 10 | pattern2 = [{"LOWER": "good"}, {"LOWER": "evening"}, {"IS_PUNCT": True}] 11 | matcher.add("eveningGreeting", [pattern2]) 12 | matches = matcher(doc) 13 | for match_id, start, end in matches: 14 | pattern_name = nlp.vocab.strings[match_id] 15 | m_span = doc[start:end] 16 | print(start, end, m_span.text) 17 | -------------------------------------------------------------------------------- /Chapter04/matcher_basic.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.matcher import Matcher 3 | 4 | nlp = spacy.load("en_core_web_md") 5 | doc = nlp("Good morning, I want to reserve a ticket.") 6 | matcher = Matcher(nlp.vocab) 7 | pattern = [{"LOWER": "good"}, {"LOWER": "morning"}, {"IS_PUNCT": True}] 8 | matcher.add("morningGreeting", [pattern]) 9 | matches = matcher(doc) 10 | for match_id, start, end in matches: 11 | m_span = doc[start:end] 12 | print(start, end, m_span.text) 13 | -------------------------------------------------------------------------------- /Chapter04/matcher_capitals.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.matcher import Matcher 3 | 4 | nlp = spacy.load("en_core_web_md") 5 | doc = nlp("Take me out of your SPAM list. We never asked you to contact me. 
If you write again we’ll SUE!!!!") 6 | matcher = Matcher(nlp.vocab) 7 | pattern = [{"IS_UPPER": True}] 8 | matcher.add("onlyShort", [pattern]) 9 | matches = matcher(doc) 10 | for match_id, start, end in matches: 11 | print(start, end, doc[start:end]) 12 | -------------------------------------------------------------------------------- /Chapter04/matcher_ops.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.matcher import Matcher 3 | 4 | nlp = spacy.load("en_core_web_md") 5 | doc1 = nlp("Hello hello hello, how are you?") 6 | doc2 = nlp("Hello, how are you?") 7 | doc3 = nlp("How are you?") 8 | matcher = Matcher(nlp.vocab) 9 | pattern = [{"LOWER": {"IN": ["hello", "hi", "hallo"]}, "OP": "+"}, {"IS_PUNCT": True}] 10 | matcher.add("greetings", [pattern]) 11 | 12 | for match_id, start, end in matcher(doc1): 13 | print(start, end, doc1[start:end]) 14 | 15 | for match_id, start, end in matcher(doc2): 16 | print(start, end, doc2[start:end]) 17 | 18 | for match_id, start, end in matcher(doc3): 19 | print(start, end, doc3[start:end]) 20 | -------------------------------------------------------------------------------- /Chapter04/matcher_regex_basic.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.matcher import Matcher 3 | 4 | nlp = spacy.load("en_core_web_md") 5 | matcher = Matcher(nlp.vocab) 6 | 7 | doc1 = nlp("I travelled by bus.") 8 | doc2 = nlp("She traveled by bike.") 9 | 10 | pattern = [{"POS": "PRON"}, {"TEXT": {"REGEX": "[Tt]ravell?ed"}}] 11 | 12 | matcher.add("travelRegex", [pattern]) 13 | 14 | for mid, start, end in matcher(doc1): 15 | print(start, end, doc1[start:end]) 16 | 17 | for mid, start, end in matcher(doc2): 18 | print(start, end, doc2[start:end]) 19 | -------------------------------------------------------------------------------- /Chapter04/matcher_short_strings.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.matcher import Matcher 3 | 4 | nlp = spacy.load("en_core_web_md") 5 | doc = nlp("I bought a pineapple.") 6 | matcher = Matcher(nlp.vocab) 7 | pattern = [{"LENGTH": 1}] 8 | matcher.add("onlyShort", [pattern]) 9 | matches = matcher(doc) 10 | for match_id, start, end in matches: 11 | m_span = doc[start:end] 12 | print(start, end, m_span.text) 13 | -------------------------------------------------------------------------------- /Chapter04/phrase_matcher_basic.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.matcher import PhraseMatcher 3 | 4 | nlp = spacy.load("en_core_web_md") 5 | matcher = PhraseMatcher(nlp.vocab) 6 | 7 | terms = ["Angela Merkel", "Donald Trump", "Alexis Tsipras"] 8 | patterns = [nlp.make_doc(term) for term in terms] 9 | matcher.add("politiciansList", patterns) 10 | 11 | doc = nlp("3 EU leaders met in Berlin. German chancellor Angela Merkel first welcomed the US president Donald Trump.
The following day Alexis Tsipras joined them in Brandenburg.") 12 | 13 | matches = matcher(doc) 14 | 15 | for mid, start, end in matches: 16 | print(start, end, doc[start:end]) 17 | -------------------------------------------------------------------------------- /Chapter05/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Mastering-spaCy/eea4fab36b519c6d5d0cf86f5c330fdb6d791dd4/Chapter05/.gitkeep -------------------------------------------------------------------------------- /Chapter05/has_vec_basic.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | nlp = spacy.load("en_core_web_md") 3 | doc = nlp("You went there afskfsd.") 4 | 5 | for token in doc: 6 | print(token.is_oov, token.has_vector) 7 | -------------------------------------------------------------------------------- /Chapter05/noun-chunsk.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | nlp = spacy.load("en_core_web_md") 3 | 4 | doc = nlp("My beautiful and cute dog jumped over the fence") 5 | print(list(doc.noun_chunks)) 6 | 7 | sentences = nlp("I purchased a science fiction book last week. I loved everything related to this fragrance: light, floral and feminine... I purchased a bottle of wine.") 8 | 9 | key = nlp("perfume") 10 | 11 | for sent in sentences.sents: 12 | nchunks = [nchunk.text for nchunk in sent.noun_chunks] 13 | nchunk_doc = nlp(" ".join(nchunks)) 14 | print(nchunk_doc.similarity(key)) 15 | -------------------------------------------------------------------------------- /Chapter05/similarity_basic.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | nlp = spacy.load("en_core_web_md") 3 | doc1 = nlp("I visited England") 4 | doc2 = nlp("I went to London") 5 | 6 | print(doc1[1:3].similarity(doc2[1:4])) 7 | -------------------------------------------------------------------------------- /Chapter05/wv_basic.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | nlp = spacy.load("en_core_web_md") 3 | doc = nlp("I ate a banana.") 4 | print(type(doc[3].vector)) 5 | print(doc[3].vector.shape) 6 | print(doc[3].vector) 7 | -------------------------------------------------------------------------------- /Chapter05/wv_visualize.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn.decomposition import PCA 3 | import numpy as np 4 | import spacy 5 | 6 | nlp = spacy.load("en_core_web_md") 7 | 8 | vocab = nlp("cat dog tiger elephant bird monkey lion cheetah burger pizza food cheese wine salad noodles macaroni fruit vegetable") 9 | 10 | words = [word.text for word in vocab] 11 | vecs = np.vstack([word.vector for word in vocab if word.has_vector]) 12 | pca = PCA(n_components=2) 13 | vecs_transformed = pca.fit_transform(vecs) 14 | plt.figure(figsize=(20,15)) 15 | plt.scatter(vecs_transformed[:,0], vecs_transformed[:,1]) 16 | for word, coord in zip(words, vecs_transformed): 17 | x,y = coord 18 | plt.text(x,y,word, size=15) 19 | 20 | plt.show() 21 | -------------------------------------------------------------------------------- /Chapter06/ATIS_dataset_exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 20, 6 | "metadata": {}, 7 | 
"outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "%matplotlib inline" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "dataset = pd.read_csv(\"atis_intents.csv\", header=None)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/html": [ 30 | "
\n", 31 | "\n", 44 | "\n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | "
0 1
0 atis_flight i want to fly from boston at 838 am and arriv...
1 atis_flight what flights are available from pittsburgh to...
2 atis_flight_time what is the arrival time in san francisco for...
3 atis_airfare cheapest airfare from tacoma to orlando
4 atis_airfare round trip fares from pittsburgh to philadelp...
\n", 80 | "
" 81 | ], 82 | "text/plain": [ 83 | " 0 1\n", 84 | "0 atis_flight i want to fly from boston at 838 am and arriv...\n", 85 | "1 atis_flight what flights are available from pittsburgh to...\n", 86 | "2 atis_flight_time what is the arrival time in san francisco for...\n", 87 | "3 atis_airfare cheapest airfare from tacoma to orlando\n", 88 | "4 atis_airfare round trip fares from pittsburgh to philadelp..." 89 | ] 90 | }, 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "dataset.head()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 6, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "0 i want to fly from boston at 838 am and arriv...\n", 109 | "1 what flights are available from pittsburgh to...\n", 110 | "2 what is the arrival time in san francisco for...\n", 111 | "3 cheapest airfare from tacoma to orlando\n", 112 | "4 round trip fares from pittsburgh to philadelp...\n", 113 | " ... \n", 114 | "4973 what is the airfare for flights from denver t...\n", 115 | "4974 do you have any flights from denver to baltim...\n", 116 | "4975 which airlines fly into and out of denver\n", 117 | "4976 does continental fly from boston to san franc...\n", 118 | "4977 is there a delta flight from denver to san fr...\n", 119 | "Name: 1, Length: 4978, dtype: object" 120 | ] 121 | }, 122 | "execution_count": 6, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "dataset[1]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 12, 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | " i want to fly from boston at 838 am and arrive in denver at 1110 in the morning\n", 141 | " what flights are available from pittsburgh to baltimore on thursday morning\n", 142 | " what is the arrival time in san francisco for the 755 am flight leaving washington\n", 143 | " cheapest airfare from tacoma to orlando\n", 144 | " round trip fares from pittsburgh to philadelphia under 1000 dollars\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "for text in dataset[1].head():\n", 150 | " print(text)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 18, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "0\n", 163 | "atis_abbreviation 147\n", 164 | "atis_aircraft 81\n", 165 | "atis_aircraft#atis_flight#atis_flight_no 1\n", 166 | "atis_airfare 423\n", 167 | "atis_airfare#atis_flight_time 1\n", 168 | "atis_airline 157\n", 169 | "atis_airline#atis_flight_no 2\n", 170 | "atis_airport 20\n", 171 | "atis_capacity 16\n", 172 | "atis_cheapest 1\n", 173 | "atis_city 19\n", 174 | "atis_distance 20\n", 175 | "atis_flight 3666\n", 176 | "atis_flight#atis_airfare 21\n", 177 | "atis_flight_no 12\n", 178 | "atis_flight_time 54\n", 179 | "atis_ground_fare 18\n", 180 | "atis_ground_service 255\n", 181 | "atis_ground_service#atis_ground_fare 1\n", 182 | "atis_meal 6\n", 183 | "atis_quantity 51\n", 184 | "atis_restriction 6\n", 185 | "dtype: int64\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "grouped = dataset.groupby(0).size()\n", 191 | "print(grouped)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 
null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.6.9" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 2 230 | } 231 | -------------------------------------------------------------------------------- /Chapter06/calculate_ent_frequencies.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import spacy 3 | nlp = spacy.load("en_core_web_md") 4 | 5 | corpus = open("data/atis_utterances.txt", "r").read().split("\n") 6 | 7 | 8 | all_ent_labels = [] 9 | for sentence in corpus: 10 | doc = nlp(sentence.strip()) 11 | ents = doc.ents 12 | all_ent_labels += [ent.label_ for ent in ents] 13 | 14 | c = Counter(all_ent_labels) 15 | print(c) 16 | -------------------------------------------------------------------------------- /Chapter06/extract_abbrevs.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.matcher import Matcher 3 | 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | pattern1 = [{"TEXT": {"REGEX": "\w{1,2}\d{1,2}"}}] 7 | pattern2 = [{"SHAPE": { "IN": ["x", "xx"]}}, {"SHAPE": { "IN": ["d", "dd"]}}] 8 | pattern3 = [{"TEXT": {"IN": ["class", "code", "abbrev", "abbreviation"]}}, {"SHAPE": { "IN": ["x", "xx"]}}] 9 | pattern4 = [{"POS": "NOUN", "SHAPE": { "IN": ["x", "xx"]}}] 10 | 11 | matcher = Matcher(nlp.vocab) 12 | matcher.add("abbrevEnts", [pattern1, pattern2, pattern3, pattern4]) 13 | 14 | sentences = [ 15 | 'what does restriction ap 57 mean', 16 | 'what does the abbreviation co mean', 17 | 'what does fare code qo mean', 18 | 'what is the abbreviation d10', 19 | 'what does code y mean', 20 | 'what does the fare code f and fn mean', 21 | 'what is booking class c' 22 | ] 23 | 24 | 25 | for sent in sentences: 26 | doc = nlp(sent) 27 | matches = matcher(doc) 28 | for mid, start, end in matches: 29 | print(doc[start:end]) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /Chapter06/extract_loc_names.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.matcher import Matcher 3 | 4 | nlp = spacy.load("en_core_web_md") 5 | 6 | matcher = Matcher(nlp.vocab) 7 | pattern = [{"POS": "ADP"}, {"ENT_TYPE": "GPE"}] 8 | matcher.add("prepositionLocation", [pattern]) 9 | 10 | 11 | doc = nlp("Show me flights from Denver to Boston on tuesday") 12 | matches = matcher(doc) 13 | print(doc.ents) 14 | for mid, start, end in matches: 15 | print(doc[start:end]) 16 | 17 | -------------------------------------------------------------------------------- /Chapter07/data/corona.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "Diarrhea, also spelled diarrhoea, is the condition of having at least three loose, liquid, or watery bowel movements each day.[2] It often lasts for a few days and can result in dehydration due to fluid loss.[2] Signs of dehydration often begin with loss of the normal stretchiness of the skin and irritable behaviour.[2] This can progress to 
decreased urination, loss of skin color, a fast heart rate, and a decrease in responsiveness as it becomes more severe.[2] Loose but non-watery stools in babies who are exclusively breastfed, however, are normal.[2]", 4 | { 5 | "entities": [ 6 | [ 7 | 364, 8 | 382, 9 | "MedicalCondition" 10 | ], 11 | [ 12 | 0, 13 | 8, 14 | "MedicalCondition" 15 | ], 16 | [ 17 | 94, 18 | 116, 19 | "MedicalCondition" 20 | ], 21 | [ 22 | 178, 23 | 189, 24 | "MedicalCondition" 25 | ], 26 | [ 27 | 221, 28 | 232, 29 | "MedicalCondition" 30 | ], 31 | [ 32 | 23, 33 | 32, 34 | "MedicalCondition" 35 | ], 36 | [ 37 | 409, 38 | 435, 39 | "MedicalCondition" 40 | ], 41 | [ 42 | 386, 43 | 401, 44 | "MedicalCondition" 45 | ] 46 | ] 47 | } 48 | ], 49 | [ 50 | "Antiretroviral therapy (ART) is recommended for all HIV-infected individuals to reduce the risk of disease progression.\nART also is recommended for HIV-infected individuals for the prevention of transmission of HIV.\nPatients starting ART should be willing and able to commit to treatment and understand the benefits and risks of therapy and the importance of adherence. Patients may choose to postpone therapy, and providers, on a case-by-case basis, may elect to defer therapy on the basis of clinical and/or psychosocial factors.", 51 | { 52 | "entities": [ 53 | [ 54 | 0, 55 | 22, 56 | "Medicine" 57 | ], 58 | [ 59 | 24, 60 | 27, 61 | "Medicine" 62 | ], 63 | [ 64 | 120, 65 | 123, 66 | "Medicine" 67 | ], 68 | [ 69 | 211, 70 | 214, 71 | "Pathogen" 72 | ], 73 | [ 74 | 52, 75 | 55, 76 | "Pathogen" 77 | ], 78 | [ 79 | 234, 80 | 237, 81 | "Medicine" 82 | ], 83 | [ 84 | 148, 85 | 151, 86 | "Pathogen" 87 | ] 88 | ] 89 | } 90 | ], 91 | [ 92 | "The goals of treatment are to reduce pain, decrease inflammation, and improve a person's overall functioning.[5] This may be helped by balancing rest and exercise, the use of splints and braces, or the use of assistive devices.[1][6][7] Pain medications, steroids, and NSAIDs are frequently used to help with symptoms.[1] Disease-modifying antirheumatic drugs (DMARDs), such as hydroxychloroquine and methotrexate, may be used to try to slow the progression of disease.[1] Biological DMARDs may be used when disease does not respond to other treatments.[8] However, they may have a greater rate of adverse effects.[9] Surgery to repair, replace, or fuse joints may help in certain situations.[1] Most alternative medicine treatments are not supported by evidence.[10][11]", 93 | { 94 | "entities": [ 95 | [ 96 | 401, 97 | 413, 98 | "Medicine" 99 | ], 100 | [ 101 | 378, 102 | 396, 103 | "Medicine" 104 | ], 105 | [ 106 | 473, 107 | 490, 108 | "Medicine" 109 | ], 110 | [ 111 | 255, 112 | 263, 113 | "Medicine" 114 | ] 115 | ] 116 | } 117 | ], 118 | [ 119 | "Hantaviruses, usually found in rodents and shrews, were discovered in two species of bats. The Mouyassué virus (MOUV) was isolated from banana pipistrelle bats captured near Mouyassué village in Cote d'Ivoire, West Africa. The Magboi virus was isolated from hairy slit-faced bats found near the Magboi River in Sierra Leone in 2011. 
They are single-stranded, negative sense, RNA viruses in the Bunyaviridae family.[29][30][31][32]", 120 | { 121 | "entities": [ 122 | [ 123 | 0, 124 | 12, 125 | "Pathogen" 126 | ], 127 | [ 128 | 394, 129 | 406, 130 | "Pathogen" 131 | ], 132 | [ 133 | 227, 134 | 239, 135 | "Pathogen" 136 | ], 137 | [ 138 | 95, 139 | 110, 140 | "Pathogen" 141 | ] 142 | ] 143 | } 144 | ], 145 | [ 146 | "Bats are the most common source of rabies in humans in North and South America, Western Europe, and Australia. In the United States, there were 19 cases of human rabies from 1997–2006, 17 of which were attributed to bats.[27] In North America, about half of human rabies instances are cryptic, meaning that the patient has no known bite history.[24] While it has been speculated that rabies virus could be transmitted through aerosols, studies of the rabies virus have concluded that this is only feasible in limited conditions. These conditions include a very large colony of bats in a hot and humid cave with poor ventilation. While two human deaths in 1956 and 1959 had been tentatively attributed to aerosolization of the rabies virus after entering a cave with bats, \"investigations of the 2 reported human cases revealed that both infections could be explained by means other than aerosol transmission\".[28] It is instead generally thought that most instances of cryptic rabies are the result of an unknown bat bite.[24] Bites from a bat can be so small that they are not visible without magnification equipment, for example. Outside of bites, rabies virus exposure can also occur if infected fluids come in contact with a mucous membrane or a break in the skin. Rabies virus has also been transmitted when an infected human unknowingly dies of rabies, and their organs are transplanted to others.[28]", 147 | { 148 | "entities": [ 149 | [ 150 | 35, 151 | 41, 152 | "MedicalCondition" 153 | ], 154 | [ 155 | 162, 156 | 168, 157 | "MedicalCondition" 158 | ], 159 | [ 160 | 384, 161 | 396, 162 | "Pathogen" 163 | ], 164 | [ 165 | 1269, 166 | 1281, 167 | "Pathogen" 168 | ], 169 | [ 170 | 1343, 171 | 1347, 172 | "MedicalCondition" 173 | ], 174 | [ 175 | 977, 176 | 983, 177 | "MedicalCondition" 178 | ], 179 | [ 180 | 1027, 181 | 1032, 182 | "MedicalCondition" 183 | ] 184 | ] 185 | } 186 | ], 187 | [ 188 | "Other groups of intracellular bacterial pathogens include Salmonella, Neisseria, Brucella, Mycobacterium, Nocardia, Listeria, Francisella, Legionella, and Yersinia pestis. These can exist intracellularly, but can exist outside of host cells.", 189 | { 190 | "entities": [ 191 | [ 192 | 116, 193 | 124, 194 | "Pathogen" 195 | ], 196 | [ 197 | 155, 198 | 170, 199 | "Pathogen" 200 | ], 201 | [ 202 | 126, 203 | 137, 204 | "Pathogen" 205 | ], 206 | [ 207 | 70, 208 | 79, 209 | "Pathogen" 210 | ], 211 | [ 212 | 139, 213 | 149, 214 | "Pathogen" 215 | ], 216 | [ 217 | 106, 218 | 114, 219 | "Pathogen" 220 | ], 221 | [ 222 | 91, 223 | 104, 224 | "Pathogen" 225 | ], 226 | [ 227 | 81, 228 | 89, 229 | "Pathogen" 230 | ], 231 | [ 232 | 58, 233 | 68, 234 | "Pathogen" 235 | ] 236 | ] 237 | } 238 | ], 239 | [ 240 | "One of the bacterial diseases with the highest disease burden is tuberculosis, caused by Mycobacterium tuberculosis bacteria, which kills about 2 million people a year, mostly in sub-Saharan Africa. 
Pathogenic bacteria contribute to other globally important diseases, such as pneumonia, which can be caused by bacteria such as Streptococcus and Pseudomonas, and foodborne illnesses, which can be caused by bacteria such as Shigella, Campylobacter, and Salmonella. Pathogenic bacteria also cause infections such as tetanus, typhoid fever, diphtheria, syphilis, and leprosy. Pathogenic bacteria are also the cause of high infant mortality rates in developing countries.[3]", 241 | { 242 | "entities": [ 243 | [ 244 | 327, 245 | 340, 246 | "Pathogen" 247 | ], 248 | [ 249 | 514, 250 | 521, 251 | "MedicalCondition" 252 | ], 253 | [ 254 | 452, 255 | 462, 256 | "Pathogen" 257 | ], 258 | [ 259 | 276, 260 | 285, 261 | "MedicalCondition" 262 | ], 263 | [ 264 | 523, 265 | 536, 266 | "MedicalCondition" 267 | ], 268 | [ 269 | 564, 270 | 571, 271 | "MedicalCondition" 272 | ], 273 | [ 274 | 433, 275 | 446, 276 | "Pathogen" 277 | ], 278 | [ 279 | 538, 280 | 548, 281 | "MedicalCondition" 282 | ], 283 | [ 284 | 345, 285 | 356, 286 | "Pathogen" 287 | ], 288 | [ 289 | 65, 290 | 77, 291 | "MedicalCondition" 292 | ], 293 | [ 294 | 550, 295 | 558, 296 | "MedicalCondition" 297 | ], 298 | [ 299 | 89, 300 | 115, 301 | "Pathogen" 302 | ], 303 | [ 304 | 423, 305 | 431, 306 | "Pathogen" 307 | ] 308 | ] 309 | } 310 | ], 311 | [ 312 | "Although the vast majority of bacteria are harmless or beneficial to one's body, a few pathogenic bacteria can cause infectious diseases. The most common bacterial disease is tuberculosis, caused by the bacterium Mycobacterium tuberculosis, which affects about 2 million people mostly in sub-Saharan Africa. Pathogenic bacteria contribute to other globally important diseases, such as pneumonia, which can be caused by bacteria such as Streptococcus and Pseudomonas, and foodborne illnesses, which can be caused by bacteria such as Shigella, Campylobacter, and Salmonella. Pathogenic bacteria also cause infections such as tetanus, typhoid fever, diphtheria, syphilis, and Hansen's disease. They typically range between 1 and 5 micrometers in length.", 313 | { 314 | "entities": [ 315 | [ 316 | 659, 317 | 667, 318 | "MedicalCondition" 319 | ], 320 | [ 321 | 436, 322 | 449, 323 | "Pathogen" 324 | ], 325 | [ 326 | 673, 327 | 689, 328 | "MedicalCondition" 329 | ], 330 | [ 331 | 30, 332 | 38, 333 | "Pathogen" 334 | ], 335 | [ 336 | 454, 337 | 465, 338 | "Pathogen" 339 | ], 340 | [ 341 | 647, 342 | 657, 343 | "MedicalCondition" 344 | ], 345 | [ 346 | 87, 347 | 106, 348 | "Pathogen" 349 | ], 350 | [ 351 | 532, 352 | 540, 353 | "Pathogen" 354 | ], 355 | [ 356 | 561, 357 | 571, 358 | "Pathogen" 359 | ], 360 | [ 361 | 623, 362 | 630, 363 | "MedicalCondition" 364 | ], 365 | [ 366 | 471, 367 | 490, 368 | "MedicalCondition" 369 | ], 370 | [ 371 | 632, 372 | 645, 373 | "MedicalCondition" 374 | ], 375 | [ 376 | 542, 377 | 555, 378 | "Pathogen" 379 | ] 380 | ] 381 | } 382 | ], 383 | [ 384 | "Much like viral pathogens, infection by certain bacterial pathogens can be prevented via vaccines.[30] Vaccines against bacterial pathogens include the anthrax vaccine and the pneumococcal vaccine. Many other bacterial pathogens lack vaccines as a preventive measure, but infection by these bacteria can often be treated or prevented with antibiotics. Common antibiotics include amoxicillin, ciprofloxacin, and doxycycline. Each antibiotic has different bacteria that it is effective against and has different mechanisms to kill that bacteria. 
For example, doxycycline inhibits the synthesis of new proteins in both gram-negative and gram-positive bacteria which leads to the death of the affected bacteria.[35]", 385 | { 386 | "entities": [ 387 | [ 388 | 379, 389 | 390, 390 | "Medicine" 391 | ], 392 | [ 393 | 152, 394 | 167, 395 | "Medicine" 396 | ], 397 | [ 398 | 411, 399 | 422, 400 | "Medicine" 401 | ], 402 | [ 403 | 392, 404 | 405, 405 | "Medicine" 406 | ], 407 | [ 408 | 176, 409 | 196, 410 | "Medicine" 411 | ] 412 | ] 413 | } 414 | ], 415 | [ 416 | "The term pathogen came into use in the 1880s.[1][2] Typically, the term is used to describe an infectious microorganism or agent, such as a virus, bacterium, protozoan, prion, viroid, or fungus.[", 417 | { 418 | "entities": [ 419 | [ 420 | 158, 421 | 167, 422 | "Pathogen" 423 | ], 424 | [ 425 | 95, 426 | 119, 427 | "Pathogen" 428 | ], 429 | [ 430 | 187, 431 | 193, 432 | "Pathogen" 433 | ], 434 | [ 435 | 147, 436 | 156, 437 | "Pathogen" 438 | ], 439 | [ 440 | 140, 441 | 145, 442 | "Pathogen" 443 | ] 444 | ] 445 | } 446 | ], 447 | [ 448 | "Some antidepressants are used as a treatment for social anxiety disorder, but their efficacy is not entirely convincing, as only a small proportion of antidepressants showed some efficacy for this condition. Paroxetine was the first drug to be FDA-approved for this disorder. Its efficacy is considered beneficial, although not everyone responds favorably to the drug. Sertraline and fluvoxamine extended release were later approved for it as well, while escitalopram is used off-label with acceptable efficacy. However, there isn't enough evidence to support citalopram for treating social phobia, and fluoxetine was no better than placebo in clinical trials. SSRIs are used as a first-line treatment for social anxiety, but they don't work for everyone. One alternative would be venlafaxine, which is a SNRI. It showed benefits for social phobia in five clinical trials against placebo, while the other SNRIs are not considered particularly useful for this disorder as many of them didn't undergo testing for it. As of now, it is unclear if duloxetine and desvenlafaxine can provide benefits for social anxiety sufferers. However, another class of antidepressants called MAOIs are considered effective for social anxiety, but they come with many unwanted side effects and are rarely used. Phenelzine was shown to be a good treatment option, but its use is limited by dietary restrictions. Moclobemide is a RIMA and showed mixed results but still got approval in some European countries for social anxiety disorder. TCA antidepressants, such as clomipramine and imipramine, are not considered effective for this anxiety disorder in particular. 
This leaves out SSRIs such as paroxetine, sertraline and fluvoxamine CR as acceptable and tolerated treatment options for this disorder.[19][20]", 449 | { 450 | "entities": [ 451 | [ 452 | 384, 453 | 395, 454 | "Medicine" 455 | ], 456 | [ 457 | 1098, 458 | 1112, 459 | "MedicalCondition" 460 | ], 461 | [ 462 | 1687, 463 | 1697, 464 | "Medicine" 465 | ], 466 | [ 467 | 49, 468 | 72, 469 | "MedicalCondition" 470 | ], 471 | [ 472 | 1173, 473 | 1178, 474 | "Medicine" 475 | ], 476 | [ 477 | 1702, 478 | 1713, 479 | "Medicine" 480 | ], 481 | [ 482 | 781, 483 | 792, 484 | "Medicine" 485 | ], 486 | [ 487 | 1563, 488 | 1573, 489 | "Medicine" 490 | ], 491 | [ 492 | 603, 493 | 613, 494 | "Medicine" 495 | ], 496 | [ 497 | 1675, 498 | 1685, 499 | "MedicalCondition" 500 | ], 501 | [ 502 | 1613, 503 | 1629, 504 | "MedicalCondition" 505 | ], 506 | [ 507 | 369, 508 | 379, 509 | "Medicine" 510 | ], 511 | [ 512 | 1291, 513 | 1301, 514 | "Medicine" 515 | ], 516 | [ 517 | 1546, 518 | 1558, 519 | "Medicine" 520 | ], 521 | [ 522 | 455, 523 | 467, 524 | "Medicine" 525 | ], 526 | [ 527 | 1391, 528 | 1402, 529 | "Medicine" 530 | ], 531 | [ 532 | 584, 533 | 597, 534 | "MedicalCondition" 535 | ] 536 | ] 537 | } 538 | ], 539 | [ 540 | "However, existing data suggest that patients taking bedaquiline in addition to standard TB therapy are five times more likely to die than those without the new drug,[184] which has resulted in medical journal articles raising health policy questions about why the FDA approved the drug and whether financial ties to the company making bedaquiline influenced physicians' support for its use.[183][185]", 541 | { 542 | "entities": [ 543 | [ 544 | 88, 545 | 98, 546 | "Medicine" 547 | ], 548 | [ 549 | 335, 550 | 346, 551 | "Medicine" 552 | ], 553 | [ 554 | 52, 555 | 63, 556 | "Medicine" 557 | ] 558 | ] 559 | } 560 | ], 561 | [ 562 | "Tuberculosis may infect any part of the body, but most commonly occurs in the lungs (known as pulmonary tuberculosis).[9] Extrapulmonary TB occurs when tuberculosis develops outside of the lungs, although extrapulmonary TB may coexist with pulmonary TB.[9]\n\nGeneral signs and symptoms include fever, chills, night sweats, loss of appetite, weight loss, and fatigue.[9] Significant nail clubbing may also occur.[16]", 563 | { 564 | "entities": [] 565 | } 566 | ], 567 | [ 568 | "A number of factors make people more susceptible to TB infections. The most important risk factor globally is HIV; 13% of all people with TB are infected by the virus.[39] This is a particular problem in sub-Saharan Africa, where rates of HIV are high.[40][41] Of people without HIV who are infected with tuberculosis, about 5–10% develop active disease during their lifetimes;[16] in contrast, 30% of those coinfected with HIV develop the active disease.[16]", 569 | { 570 | "entities": [ 571 | [ 572 | 279, 573 | 282, 574 | "Pathogen" 575 | ] 576 | ] 577 | } 578 | ], 579 | [ 580 | "Examples of common human diseases caused by viruses include the common cold, influenza, chickenpox, and cold sores. Many serious diseases such as rabies, Ebola virus disease, AIDS (HIV), avian influenza, and SARS are caused by viruses. The relative ability of viruses to cause disease is described in terms of virulence. 
Other diseases are under investigation to discover if they have a virus as the causative agent, such as the possible connection between human herpesvirus 6 (HHV6) and neurological diseases such as multiple sclerosis and chronic fatigue syndrome.[151] There is controversy over whether the bornavirus, previously thought to cause neurological diseases in horses, could be responsible for psychiatric illnesses in humans.[152]", 581 | { 582 | "entities": [ 583 | [ 584 | 518, 585 | 536, 586 | "MedicalCondition" 587 | ], 588 | [ 589 | 154, 590 | 165, 591 | "Pathogen" 592 | ], 593 | [ 594 | 708, 595 | 729, 596 | "MedicalCondition" 597 | ], 598 | [ 599 | 463, 600 | 476, 601 | "Pathogen" 602 | ], 603 | [ 604 | 77, 605 | 86, 606 | "MedicalCondition" 607 | ], 608 | [ 609 | 88, 610 | 98, 611 | "MedicalCondition" 612 | ], 613 | [ 614 | 187, 615 | 202, 616 | "MedicalCondition" 617 | ], 618 | [ 619 | 610, 620 | 620, 621 | "Pathogen" 622 | ] 623 | ] 624 | } 625 | ], 626 | [ 627 | "Buprenorphine has been shown experimentally (1982–1995) to be effective against severe, refractory depression", 628 | { 629 | "entities": [ 630 | [ 631 | 0, 632 | 13, 633 | "Medicine" 634 | ], 635 | [ 636 | 88, 637 | 109, 638 | "MedicalCondition" 639 | ] 640 | ] 641 | } 642 | ], 643 | [ 644 | "Bupropion (Wellbutrin), an anti-depressant, is also used as a smoking cessation aid; this indication was later approved, and the name of the smoking cessation product is Zyban. In Ontario, Canada, smoking cessation drugs are not covered by provincial drug plans; elsewhere, Zyban is priced higher than Wellbutrin, despite being the same drug. Therefore, some physicians prescribe Wellbutrin for both indications.[", 645 | { 646 | "entities": [ 647 | [ 648 | 274, 649 | 279, 650 | "Medicine" 651 | ], 652 | [ 653 | 11, 654 | 21, 655 | "Medicine" 656 | ], 657 | [ 658 | 302, 659 | 312, 660 | "Medicine" 661 | ], 662 | [ 663 | 380, 664 | 390, 665 | "Medicine" 666 | ], 667 | [ 668 | 170, 669 | 175, 670 | "Medicine" 671 | ], 672 | [ 673 | 0, 674 | 9, 675 | "Medicine" 676 | ] 677 | ] 678 | } 679 | ], 680 | [ 681 | "Carbamazepine is an approved treatment for bipolar disorder and epileptic seizures, but it has side effects useful in treating attention-deficit hyperactivity disorder (ADHD), schizophrenia, phantom limb syndrome, paroxysmal extreme pain disorder, neuromyotonia, and post-traumatic stress disorder.[8]", 682 | { 683 | "entities": [ 684 | [ 685 | 267, 686 | 288, 687 | "MedicalCondition" 688 | ], 689 | [ 690 | 248, 691 | 261, 692 | "MedicalCondition" 693 | ], 694 | [ 695 | 0, 696 | 13, 697 | "Medicine" 698 | ], 699 | [ 700 | 43, 701 | 59, 702 | "MedicalCondition" 703 | ], 704 | [ 705 | 145, 706 | 167, 707 | "MedicalCondition" 708 | ], 709 | [ 710 | 176, 711 | 189, 712 | "MedicalCondition" 713 | ], 714 | [ 715 | 64, 716 | 82, 717 | "MedicalCondition" 718 | ], 719 | [ 720 | 191, 721 | 212, 722 | "MedicalCondition" 723 | ] 724 | ] 725 | } 726 | ], 727 | [ 728 | "The antiviral drugs amantadine and rimantadine inhibit a viral ion channel (M2 protein), thus inhibiting replication of the influenza A virus.[86] These drugs are sometimes effective against influenza A if given early in the infection but are ineffective against influenza B viruses, which lack the M2 drug target.[160] Measured resistance to amantadine and rimantadine in American isolates of H3N2 has increased to 91% in 2005.[161] This high level of resistance may be due to the easy availability of amantadines as part of over-the-counter cold remedies in countries such as China and Russia,[162] 
and their use to prevent outbreaks of influenza in farmed poultry.[163][164] The CDC recommended against using M2 inhibitors during the 2005–06 influenza season due to high levels of drug resistance.[165]", 729 | { 730 | "entities": [ 731 | [ 732 | 639, 733 | 648, 734 | "MedicalCondition" 735 | ], 736 | [ 737 | 35, 738 | 46, 739 | "Medicine" 740 | ], 741 | [ 742 | 712, 743 | 725, 744 | "Medicine" 745 | ], 746 | [ 747 | 20, 748 | 30, 749 | "Medicine" 750 | ] 751 | ] 752 | } 753 | ], 754 | [ 755 | "The two classes of antiviral drugs used against influenza are neuraminidase inhibitors (oseltamivir, zanamivir, laninamivir and peramivir) and M2 protein inhibitors (adamantane derivatives)", 756 | { 757 | "entities": [ 758 | [ 759 | 128, 760 | 137, 761 | "Medicine" 762 | ], 763 | [ 764 | 101, 765 | 110, 766 | "Medicine" 767 | ], 768 | [ 769 | 112, 770 | 123, 771 | "Medicine" 772 | ], 773 | [ 774 | 48, 775 | 57, 776 | "MedicalCondition" 777 | ], 778 | [ 779 | 88, 780 | 99, 781 | "Medicine" 782 | ] 783 | ] 784 | } 785 | ] 786 | ] 787 | -------------------------------------------------------------------------------- /Chapter07/train_on_cord19.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | with open("data/corona.json") as f: 4 | data = json.loads(f.read()) 5 | 6 | 7 | TRAIN_DATA = [] 8 | 9 | for (text, annot) in data: 10 | new_anno = [] 11 | for anno in annot["entities"]: 12 | st, end, label = anno 13 | new_anno.append((st, end, label)) 14 | TRAIN_DATA.append((text, {"entities": new_anno})) 15 | 16 | 17 | labels = ['Pathogen', 'MedicalCondition', 'Medicine'] 18 | 19 | import random 20 | import spacy 21 | 22 | from spacy.training import Example 23 | 24 | 25 | 26 | 27 | 28 | nlp = spacy.blank("en") 29 | ner = nlp.add_pipe("ner") 30 | 31 | 32 | print(ner) 33 | print(nlp.meta) 34 | 35 | 36 | for ent in labels: 37 | ner.add_label(ent) 38 | 39 | print(ner.labels) 40 | 41 | 42 | epochs = 25 43 | 44 | other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] 45 | 46 | with nlp.disable_pipes(*other_pipes): 47 | 48 | optimizer = nlp.begin_training() 49 | 50 | for i in range(100): 51 | random.shuffle(TRAIN_DATA) 52 | for text, annotation in TRAIN_DATA: 53 | doc = nlp.make_doc(text) 54 | example = Example.from_dict(doc, annotation) 55 | nlp.update([example], sgd=optimizer) 56 | 57 | 58 | 59 | doc = nlp("One of the bacterial diseases with the highest disease burden is tuberculosis, caused by Mycobacterium tuberculosis bacteria, which kills about 2 million people a year.") 60 | doc2 = nlp("Pathogenic bacteria contribute to other globally important diseases, such as pneumonia, which can be caused by bacteria such as Streptococcus and Pseudomonas, and foodborne illnesses, which can be caused by bacteria such as Shigella, Campylobacter, and Salmonella. Pathogenic bacteria also cause infections such as tetanus, typhoid fever, diphtheria, syphilis, and leprosy. 
Pathogenic bacteria are also the cause of high infant mortality rates in developing countries.") 61 | 62 | 63 | print(doc.ents) 64 | print(doc) 65 | 66 | from spacy import displacy 67 | displacy.serve(doc2, style="ent") 68 | 69 | 70 | -------------------------------------------------------------------------------- /Chapter08/Keras_train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import matplotlib.pyplot as plt\n", 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "(3999, 10)" 23 | ] 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "reviews_df=pd.read_csv('data/Reviews.csv')\n", 32 | "reviews_df.shape" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/html": [ 43 | "
\n", 44 | "\n", 57 | "\n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | "
IdProductIdUserIdProfileNameHelpfulnessNumeratorHelpfulnessDenominatorScoreTimeSummaryText
050057B000ER5DFQA1ESDLEDR9Y0JXA. Spencer1211310256000the garbanzo beans in it give horrible gasTo be fair only one of my twins got gas from t...
1366917B001AIQP8MA324KM3YY1DWQGdanitrice0051251072000Yummy Lil' Treasures!!Just recieved our first order of these (they d...
2214380B001E5E1XWA3QCWO53N69HW3M. A. Vaughan \"-_-GOBNOGO-_-\"2251276473600Great ChaiThis is a fantastic Chai Masala. I am very pic...
3178476B000TIZP5IAYZ5NG9705AG1Consumer0051341360000Celtic Salt worth extra priceFlavorful and has added nutrition! You use le...
4542504B000E18CVEA2LMWCJUF5HZ4ZMiki Lam \"mikilam\"81131222732800mixed feelingsI thought this soup tasted good. I liked the t...
\n", 141 | "
" 142 | ], 143 | "text/plain": [ 144 | " Id ProductId UserId ProfileName \\\n", 145 | "0 50057 B000ER5DFQ A1ESDLEDR9Y0JX A. Spencer \n", 146 | "1 366917 B001AIQP8M A324KM3YY1DWQG danitrice \n", 147 | "2 214380 B001E5E1XW A3QCWO53N69HW3 M. A. Vaughan \"-_-GOBNOGO-_-\" \n", 148 | "3 178476 B000TIZP5I AYZ5NG9705AG1 Consumer \n", 149 | "4 542504 B000E18CVE A2LMWCJUF5HZ4Z Miki Lam \"mikilam\" \n", 150 | "\n", 151 | " HelpfulnessNumerator HelpfulnessDenominator Score Time \\\n", 152 | "0 1 2 1 1310256000 \n", 153 | "1 0 0 5 1251072000 \n", 154 | "2 2 2 5 1276473600 \n", 155 | "3 0 0 5 1341360000 \n", 156 | "4 8 11 3 1222732800 \n", 157 | "\n", 158 | " Summary \\\n", 159 | "0 the garbanzo beans in it give horrible gas \n", 160 | "1 Yummy Lil' Treasures!! \n", 161 | "2 Great Chai \n", 162 | "3 Celtic Salt worth extra price \n", 163 | "4 mixed feelings \n", 164 | "\n", 165 | " Text \n", 166 | "0 To be fair only one of my twins got gas from t... \n", 167 | "1 Just recieved our first order of these (they d... \n", 168 | "2 This is a fantastic Chai Masala. I am very pic... \n", 169 | "3 Flavorful and has added nutrition! You use le... \n", 170 | "4 I thought this soup tasted good. I liked the t... " 171 | ] 172 | }, 173 | "execution_count": 3, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "reviews_df.head()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 4, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "reviews_df = reviews_df[['Text','Score']].dropna()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 5, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD4CAYAAADlwTGnAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAADolJREFUeJzt3X+o3fV9x/Hny/iDdS014l1wSdpIyTZStkV7FwXLsJRqdGOxsInZqEG63v4RmWX9J+0Y9geCf6wVBCekM61Cq7i1YlZCbSauxQ1tbqyo0Yl3VjEhxtsp2s5iiX3vj/PNchpvcs/9kXNu+DwfcDjf8/5+vt/z/n6V+8r3172pKiRJ7Tlt1A1IkkbDAJCkRhkAktQoA0CSGmUASFKjDABJapQBIEmNMgAkqVEGgCQ16vRRN3Ai5557bq1Zs2bUbUjSKWXv3r0/raqx2cYt6QBYs2YNk5OTo25Dkk4pSV4cZJyngCSpUQaAJDXKAJCkRhkAktQoA0CSGjVrACRZneShJE8n2Zfkhq7+hSQHkjzeva7sW+ZzSaaSPJvk8r76xq42lWTbydkkSdIgBrkN9DDw2ap6LMl7gL1Jdnfzbqmqf+gfnGQdcA3wQeC3gX9L8jvd7NuAjwH7gT1JdlbV04uxIZKkuZk1AKrqIHCwm/5ZkmeAlSdYZBNwT1W9BfwkyRSwoZs3VVXPAyS5pxtrAEjSCMzpQbAka4ALgEeBS4Drk1wLTNI7SniNXjg80rfYfo4GxkvH1C+a4TsmgAmA973vfXNpb0Z3/3j/gtexGDZfsGrULUjSrxn4InCSdwPfBj5TVW8AtwMfANbTO0L4ymI0VFXbq2q8qsbHxmZ9klmSNE8DHQEkOYPeD/9vVtV3AKrqUN/8rwHf7T4eAFb3Lb6qq3GCuiRpyAa5CyjAHcAzVfXVvvp5fcM+DjzVTe8ErklyVpLzgbXAj4A9wNok5yc5k96F4p2LsxmSpLka5AjgEuATwJNJHu9qnwc2J1kPFPAC8GmAqtqX5F56F3cPA1ur6m2AJNcDDwDLgB1VtW8Rt0WSNAeD3AX0MJAZZu06wTI3ATfNUN91ouUkScPjk8CS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJapQBIEmNMgAkqVEGgCQ1ygCQpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRs0aAElWJ3koydNJ9iW5oaufk2R3kue69+VdPUluTTKV5IkkF/ata0s3/rkkW07eZkmSZjPIEcBh4LNVtQ64GNiaZB2wDXiwqtYCD3afAa4A1navCeB26AUGcCNwEbABuPFIaEiShm/WAKiqg1X1WDf9M+AZYCWwCbizG3YncFU3vQm4q3oeAc5Och5wObC7ql6tqteA3cDGRd0aSdLA5nQNIMka4ALgUWBFVR3sZr0MrOimVwIv9S22v6sdry5JGoGBAyDJu4FvA5+pqjf651VVAbUYDSWZSDKZZHJ6enoxVilJmsFAAZDkDHo//L9ZVd/pyoe6Uzt076909QPA6r7FV3W149V/TVVtr6rxqhofGxuby7ZIkuZgkLuAAtwBPFNVX+2btRM4cifPFuD+vvq13d1AFwOvd6eKHgAuS7K8u/h7WVeTJI3A6QOMuQT4BPBkkse72ueBm4F7k3wSeBG4upu3C7gSmALeBK4DqKpXk3wZ2NON+1JVvbooW
yFJmrNZA6CqHgZynNkfnWF8AVuPs64dwI65NChJOjl8EliSGmUASFKjDABJapQBIEmNMgAkqVEGgCQ1ygCQpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJapQBIEmNMgAkqVEGgCQ1ygCQpEYZAJLUKANAkho1awAk2ZHklSRP9dW+kORAkse715V98z6XZCrJs0ku76tv7GpTSbYt/qZIkuZikCOAbwAbZ6jfUlXru9cugCTrgGuAD3bL/GOSZUmWAbcBVwDrgM3dWEnSiJw+24Cq+mGSNQOubxNwT1W9BfwkyRSwoZs3VVXPAyS5pxv79Jw7liQtioVcA7g+yRPdKaLlXW0l8FLfmP1d7Xj1d0gykWQyyeT09PQC2pMknch8A+B24APAeuAg8JXFaqiqtlfVeFWNj42NLdZqJUnHmPUU0Eyq6tCR6SRfA77bfTwArO4buqqrcYK6JGkE5nUEkOS8vo8fB47cIbQTuCbJWUnOB9YCPwL2AGuTnJ/kTHoXinfOv21J0kLNegSQ5G7gUuDcJPuBG4FLk6wHCngB+DRAVe1Lci+9i7uHga1V9Xa3nuuBB4BlwI6q2rfoWyNJGtggdwFtnqF8xwnG3wTcNEN9F7BrTt1Jkk4anwSWpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJapQBIEmNMgAkqVEGgCQ1ygCQpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNWrWAEiyI8krSZ7qq52TZHeS57r35V09SW5NMpXkiSQX9i2zpRv/XJItJ2dzJEmDGuQI4BvAxmNq24AHq2ot8GD3GeAKYG33mgBuh15gADcCFwEbgBuPhIYkaTRmDYCq+iHw6jHlTcCd3fSdwFV99buq5xHg7CTnAZcDu6vq1ap6DdjNO0NFkjRE870GsKKqDnbTLwMruumVwEt94/Z3tePVJUkjsuCLwFVVQC1CLwAkmUgymWRyenp6sVYrSTrGfAPgUHdqh+79la5+AFjdN25VVzte/R2qantVjVfV+NjY2DzbkyTNZr4BsBM4cifPFuD+vvq13d1AFwOvd6eKHgAuS7K8u/h7WVeTJI3I6bMNSHI3cClwbpL99O7muRm4N8kngReBq7vhu4ArgSngTeA6gKp6NcmXgT3duC9V1bEXliVJQzRrAFTV5uPM+ugMYwvYepz17AB2zKk7SdJJ45PAktQoA0CSGmUASFKjDABJapQBIEmNMgAkqVEGgCQ1ygCQpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJatTpo25Aw3P3j/ePugUANl+watQtSMIjAElqlgEgSY0yACSpUQaAJDVqQQGQ5IUkTyZ5PMlkVzsnye4kz3Xvy7t6ktyaZCrJE0kuXIwNkCTNz2IcAXykqtZX1Xj3eRvwYFWtBR7sPgNcAaztXhPA7Yvw3ZKkeToZp4A2AXd203cCV/XV76qeR4Czk5x3Er5fkjSAhQZAAd9PsjfJRFdbUVUHu+mXgRXd9Ergpb5l93e1X5NkIslkksnp6ekFtidJOp6FPgj24ao6kOS3gN1J/qt/ZlVVkprLCqtqO7AdYHx8fE7LSpIGt6AjgKo60L2/AtwHbAAOHTm1072/0g0/AKzuW3xVV5MkjcC8AyDJbyZ5z5Fp4DLgKWAnsKUbtgW4v5veCVzb3Q10MfB636kiSdKQLeQU0ArgviRH1vOtqvpekj3AvUk+CbwIXN2N3wVcCUwBbwLXLeC7JUkLNO8AqKrngT+cof4/wEdnqBewdb7fJ0laXD4JLEmNMgAkqVEGgCQ1ygCQpEb5F8HUJP86muQRgCQ1ywCQpEYZAJLUKK8BSI3zeki7PAKQpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1Ch/FYQkdVr7tRgeAUhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUUMPgCQbkzybZCrJtmF/vySpZ6gBkGQZcBtwBbAO2Jxk3TB7kCT1DPsIYAMwVVXPV9UvgXuATUPuQZIEpKqG92XJnwMbq+qvu8+fAC6qquv7xkwAE93H3wWeHVqDx3cu8NNRN7FEuC+Ocl8c5b44ainsi/dX1dhsg5bcH4Spqu3A9lH30S/JZFWNj7qPpcB9cZT74ij3xVGn0r4Y9imgA8Dqvs+rupokaciGHQB7gLVJzk9yJnANsHPIPUiSGPIpoKo6nOR64AFgGbCjqvYNs4d5WlKnpEbMfXGU++Io98VRp8y+GOpFYEnS0uGTwJLUKANAkhplAEhSowwADSzJXaPuQUtDkg1J/qibXpfkb5NcOeq+NDdL7kGwpSbJh+n9Counqur7o+5nWJIce3tugI8kORugqv5s+F1pKUhyI73f53V6kt3ARcBDwLYkF1TVTSNtcMiS/B6wEni0qn7eV99YVd8bXWez8y6gYyT5UVVt6KY/BWwF7gMuA/61qm4eZX/DkuQx4Gngn4CiFwB303t2g6r6wei6W1qSXFdVXx91H8OS5ElgPXAW8DKwqqreSPIb9H4I/sFIGxyiJH9D72fEM/T2yQ1VdX8377GqunCU/c3GU0DvdEbf9ATwsar6Ir0A+KvRtDQS48Be4O+A16vq34FfVNUP/OH/Dl8cdQNDdriq3q6qN4H/rqo3AKrqF8CvRtva0H0K+FBVXQVcCvx9khu6eRlZVwPyFNA7nZZkOb1wTFVNA1TV/yY5PNrWhqeqfgXckuSfu/dDNPz/S5InjjcLWDHMXpaAXyZ5VxcAHzpSTPJe2guA046c9qmqF5JcCvxLkvdjAJyS3kvvX74BKsl5VXUwybs5Bf6DLraq2g/8RZI/Ad4YdT8jtAK4HHjtmHqA/xx+OyP1x1X1Fvz/PxSOOAPYMpqWRuZQkvVV9ThAVf08yZ8CO4DfH21rs/MawICSvAtYUVU/GXUvGr4kdwBfr6qHZ5j3rar6yxG0pRFLsoreKbGXZ5h3SVX9xwjaGpgBIEmN8iKwJDXKAJCkRhkAktQoA0CSGvV/ge68SbzLI/kAAAAASUVORK5CYII=\n", 199 | "text/plain": [ 200 | "
" 201 | ] 202 | }, 203 | "metadata": { 204 | "needs_background": "light" 205 | }, 206 | "output_type": "display_data" 207 | } 208 | ], 209 | "source": [ 210 | "ax=reviews_df.Score.value_counts().plot(kind='bar', colormap='Paired')\n", 211 | "plt.show()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 6, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "name": "stderr", 221 | "output_type": "stream", 222 | "text": [ 223 | "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 224 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 225 | "\n", 226 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 227 | " \"\"\"Entry point for launching an IPython kernel.\n", 228 | "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n", 229 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 230 | "\n", 231 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 232 | " \n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "reviews_df.Score[reviews_df.Score<=3]=0\n", 238 | "reviews_df.Score[reviews_df.Score>=4]=1" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 7, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD4CAYAAADlwTGnAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAADpxJREFUeJzt3X+o3Xd9x/Hny8S6MWWN612ISbYEvUPSP0zlEjvcH85im/afVNgkGWgohfhHCgr+seo/9ccKDqYFQQuRZsbhmoWpNJSwLmYOkVGbW81i067rXbUkITZXU6si65b63h/3EzyLN7nnJjfntPk8H3A43+/78/l+z/sLl77y/XFOU1VIkvrzmnE3IEkaDwNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1Knl427gYq677rpat27duNuQpFeVxx9//MdVNbHQvFd0AKxbt47p6elxtyFJrypJnhtmnpeAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ16RX8R7NXiwe+dGHcLV5VtN6wZdwtSFzwDkKROGQCS1KkFAyDJbyV5LMm/JzmW5BOtvj7Jd5LMJPmHJNe0+uva+kwbXzewr4+2+tNJbrlSByVJWtgwZwAvAe+uqrcBG4HNSW4E/hq4r6reArwA3Nnm3wm80Or3tXkk2QBsBa4HNgNfSLJsKQ9GkjS8BQOg5vyirb62vQp4N/CPrb4HuL0tb2nrtPGbkqTV91bVS1X1A2AG2LQkRyFJWrSh7gEkWZbkCHAaOAj8F/DTqjrbppwAVrfl1cBxgDb+IvB7g/V5thn8rB1JppNMz87OLv6IJElDGSoAqurlqtoIrGHuX+1vvVINVdWuqpqqqqmJiQX/fwaSpEu0qKeAquqnwDeBPwauTXLuewRrgJNt+SSwFqCN/y7wk8H6PNtIkkZsmKeAJpJc25Z/G3gP8BRzQfBnbdp24KG2vL+t08b/paqq1be2p4TWA5PAY0t1IJKkxRnmm8CrgD3tiZ3XAPuq6uEkTwJ7k/wV8D3ggTb/AeDvkswAZ5h78oeqOpZkH/AkcBbYWVUvL+3hSJKGtWAAVNVR4IZ56s8yz1M8VfXfwJ9fYF/3Avcuvk1J0lLzm8CS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1KkFAyDJ2iTfTPJkkmNJPtTqH09yMsmR9rptYJuPJplJ8nSSWwbqm1ttJsndV+aQJEnDWD7EnLPAR6rqu0neADye5GAbu6+q/mZwcpINwFbgeuBNwDeS/FEb/jzwHuAEcDjJ/qp6cikORJK0OAsGQFWdAk615Z8neQpYfZFNtgB7q+ol4AdJZoBNbWymqp4FSLK3zTUAJGkMFnUPIMk64AbgO610V5KjSXYnWdFqq4HjA5udaLUL1c//jB1JppNMz87OLqY9SdIiDB0ASV4PfBX4cFX9DLgfeDOwkbkzhM8sRUNVtauqpqpqamJiYil2KUmaxzD3AEjyWub+4/+VqvoaQFU9PzD+ReDhtnoSWDuw+ZpW4yJ1SdKIDfMUUIAHgKeq6rMD9VUD094LPNGW9wNbk7wuyXpgEngMOAxMJlmf5BrmbhTvX5rDkCQt1jBnAO8E3g98P8mRVvsYsC3JRqCAHwIfBKiqY0n2MXdz9yyws6peBkhyF/AIsAzYXVXHlvBYJEmLMMxTQN8GMs/QgYtscy9w7zz1AxfbTpI0On4TWJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1KkFAyDJ2iTfTPJkkmNJPtTqb0xyM
Mkz7X1FqyfJ55LMJDma5O0D+9re5j+TZPuVOyxJ0kKGOQM4C3ykqjYANwI7k2wA7gYOVdUkcKitA9wKTLbXDuB+mAsM4B7gHcAm4J5zoSFJGr0FA6CqTlXVd9vyz4GngNXAFmBPm7YHuL0tbwG+XHMeBa5Nsgq4BThYVWeq6gXgILB5SY9GkjS0Rd0DSLIOuAH4DrCyqk61oR8BK9vyauD4wGYnWu1C9fM/Y0eS6STTs7Ozi2lPkrQIQwdAktcDXwU+XFU/GxyrqgJqKRqqql1VNVVVUxMTE0uxS0nSPIYKgCSvZe4//l+pqq+18vPt0g7t/XSrnwTWDmy+ptUuVJckjcEwTwEFeAB4qqo+OzC0Hzj3JM924KGB+gfa00A3Ai+2S0WPADcnWdFu/t7capKkMVg+xJx3Au8Hvp/kSKt9DPg0sC/JncBzwPva2AHgNmAG+CVwB0BVnUnyKeBwm/fJqjqzJEchSVq0BQOgqr4N5ALDN80zv4CdF9jXbmD3YhqUJF0ZfhNYkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjq1YAAk2Z3kdJInBmofT3IyyZH2um1g7KNJZpI8neSWgfrmVptJcvfSH4okaTGGOQP4ErB5nvp9VbWxvQ4AJNkAbAWub9t8IcmyJMuAzwO3AhuAbW2uJGlMli80oaq+lWTdkPvbAuytqpeAHySZATa1sZmqehYgyd4298lFdyxJWhKXcw/griRH2yWiFa22Gjg+MOdEq12o/huS7EgynWR6dnb2MtqTJF3MpQbA/cCbgY3AKeAzS9VQVe2qqqmqmpqYmFiq3UqSzrPgJaD5VNXz55aTfBF4uK2eBNYOTF3TalykLkkag0s6A0iyamD1vcC5J4T2A1uTvC7JemASeAw4DEwmWZ/kGuZuFO+/9LYlSZdrwTOAJA8C7wKuS3ICuAd4V5KNQAE/BD4IUFXHkuxj7ubuWWBnVb3c9nMX8AiwDNhdVceW/GgkSUMb5imgbfOUH7jI/HuBe+epHwAOLKo7SdIV4zeBJalTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnVowAJLsTnI6yRMDtTcmOZjkmfa+otWT5HNJZpIcTfL2gW22t/nPJNl+ZQ5HkjSsYc4AvgRsPq92N3CoqiaBQ20d4FZgsr12APfDXGAA9wDvADYB95wLDUnSeCwYAFX1LeDMeeUtwJ62vAe4faD+5ZrzKHBtklXALcDBqjpTVS8AB/nNUJEkjdCl3gNYWVWn2vKPgJVteTVwfGDeiVa7UP03JNmRZDrJ9Ozs7CW2J0layGXfBK6qAmoJejm3v11VNVVVUxMTE0u1W0nSeS41AJ5vl3Zo76db/SSwdmDemla7UF2SNCaXGgD7gXNP8mwHHhqof6A9DXQj8GK7VPQIcHOSFe3m782tJkkak+ULTUjyIPAu4LokJ5h7mufTwL4kdwLPAe9r0w8AtwEzwC+BOwCq6kySTwGH27xPVtX5N5YlSSO0YABU1bYLDN00z9wCdl5gP7uB3YvqTpJ0xSwYAJJe3R783olxt3DV2HbDmnG3sKT8KQhJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnLisAkvwwyfeTHEky3WpvTHIwyTPtfUWrJ8nnkswkOZrk7UtxAJKkS7MUZwB/WlUbq2qqrd8NHKqqSeBQWwe4FZhsrx3A/Uvw2ZKkS3QlLgFtAfa05T3A7QP1L9ecR4Frk6y6Ap8vSRrC5QZAAf+c5PEkO1ptZVWdass/Ala25dXA8YFtT7Ta/5NkR5LpJNOzs7OX2Z4k6UKWX+b2f1JVJ5P8PnAwyX8MDlZVJanF7LCqdgG7AKampha1rSRpeJd1BlBVJ9v7aeDrwCbg+XOXdtr76Tb9JLB2YPM1rSZJGoNLDoAkv5PkDeeWgZuBJ4D9wPY2bTvwUFveD3ygPQ10I/DiwKUiSdKIXc4loJXA15Oc28/fV9U/JTkM7EtyJ/Ac8L42/wBwGzAD/BK44zI+W5J0mS45AKrqWeBt89R/Atw0T72AnZf6eZKkpeU3gSWpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0aeQAk2Zzk6SQzSe4e9edLkuaMNACSLAM+D9wKbAC2Jdkwyh4kSXNGfQawCZipqmer6n+AvcCWEfcgSQKWj/jzVgPHB9ZPAO8YnJBkB7Cjrf4iydMj6q0H1wE/HncTC/mLcTegcXnF/32+iv42/3CYSaMOgAVV1S5g17j7uBolma6qqXH3Ic3Hv8/RG/UloJPA2oH1Na0mSRqxUQfAYWAyyfok1wBbgf0j7kGSxIgvAVXV2SR3AY8Ay4DdVXVslD10zktreiXz73PEUlXj7kGSNAZ+E1iSOmUASFKnDABJ6tQr7nsAkq5+Sd7K3K8ArG6lk8D+qnpqfF31xzMASSOV5C+Z+xmYAI+1V4AH/YHI0fIpoA4luaOq/nbcfahPSf4TuL6q/ve8+jXAsaqaHE9n/fEMoE+fGHcD6tqvgDfNU1/VxjQi3gO4SiU5eqEhYOUoe5HO82HgUJJn+PWPQ/4B8BbgrrF11SEvAV2lkjwP3AK8cP4Q8G9VNd+/wKSRSPIa5n4efvAm8OGqenl8XfXHM4Cr18PA66vqyPkDSf519O1Iv1ZVvwIeHXcfvfMMQJI65U1gSeqUASBJnTIAJKlTBoAkder/APcMnNmwIY/4AAAAAElFTkSuQmCC\n", 249 | "text/plain": [ 250 | "
" 251 | ] 252 | }, 253 | "metadata": { 254 | "needs_background": "light" 255 | }, 256 | "output_type": "display_data" 257 | } 258 | ], 259 | "source": [ 260 | "ax=reviews_df.Score.value_counts().plot(kind='bar', colormap='Paired')\n", 261 | "plt.show()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 8, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/html": [ 272 | "
\n", 273 | "\n", 286 | "\n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | "
TextScore
0To be fair only one of my twins got gas from t...0
1Just recieved our first order of these (they d...1
2This is a fantastic Chai Masala. I am very pic...1
3Flavorful and has added nutrition! You use le...1
4I thought this soup tasted good. I liked the t...0
\n", 322 | "
" 323 | ], 324 | "text/plain": [ 325 | " Text Score\n", 326 | "0 To be fair only one of my twins got gas from t... 0\n", 327 | "1 Just recieved our first order of these (they d... 1\n", 328 | "2 This is a fantastic Chai Masala. I am very pic... 1\n", 329 | "3 Flavorful and has added nutrition! You use le... 1\n", 330 | "4 I thought this soup tasted good. I liked the t... 0" 331 | ] 332 | }, 333 | "execution_count": 8, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "reviews_df.head()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 9, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "import spacy\n", 349 | "nlp = spacy.load(\"en_core_web_md\") " 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 25, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 28, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "train_examples = []\n", 366 | "labels = []\n", 367 | "\n", 368 | "for index, row in reviews_df.iterrows():\n", 369 | " text = row[\"Text\"]\n", 370 | " rating = row[\"Score\"]\n", 371 | " labels.append(rating)\n", 372 | " tokens = [token.text for token in nlp(text)]\n", 373 | " train_examples.append(tokens) " 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 29, 379 | "metadata": {}, 380 | "outputs": [ 381 | { 382 | "data": { 383 | "text/plain": [ 384 | "['To',\n", 385 | " 'be',\n", 386 | " 'fair',\n", 387 | " 'only',\n", 388 | " 'one',\n", 389 | " 'of',\n", 390 | " 'my',\n", 391 | " 'twins',\n", 392 | " 'got',\n", 393 | " 'gas',\n", 394 | " 'from',\n", 395 | " 'this',\n", 396 | " 'but',\n", 397 | " 'it',\n", 398 | " 'was',\n", 399 | " 'horrible',\n", 400 | " '.',\n", 401 | " 'Up',\n", 402 | " 'all',\n", 403 | " 'night',\n", 404 | " 'screaming',\n", 405 | " 'from',\n", 406 | " 'gas',\n", 407 | " 'pains',\n", 408 | " '.',\n", 409 | " 'Garbanzo',\n", 410 | " 'beans',\n", 411 | " 'are',\n", 412 | " 'not',\n", 413 | " 'an',\n", 414 | " 'ideal',\n", 415 | " 'food',\n", 416 | " 'for',\n", 417 | " 'young',\n", 418 | " 'babies',\n", 419 | " '.']" 420 | ] 421 | }, 422 | "execution_count": 29, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "train_examples[0]" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 32, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "from tensorflow.keras.preprocessing.text import Tokenizer\n", 438 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 439 | "import numpy as np" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 30, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "tokenizer = Tokenizer(lower=True)\n", 449 | "tokenizer.fit_on_texts(train_examples)\n", 450 | "\n", 451 | "sequences = tokenizer.texts_to_sequences(train_examples)" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 27, 457 | "metadata": {}, 458 | "outputs": [ 459 | { 460 | "data": { 461 | "text/plain": [ 462 | "[To,\n", 463 | " be,\n", 464 | " fair,\n", 465 | " only,\n", 466 | " one,\n", 467 | " of,\n", 468 | " my,\n", 469 | " twins,\n", 470 | " got,\n", 471 | " gas,\n", 472 | " from,\n", 473 | " this,\n", 474 | " but,\n", 475 | " it,\n", 476 | " was,\n", 477 | " horrible,\n", 478 | " .,\n", 479 | " Up,\n", 480 | " all,\n", 481 | " night,\n", 482 | " screaming,\n", 
483 | " from,\n", 484 | " gas,\n", 485 | " pains,\n", 486 | " .,\n", 487 | " Garbanzo,\n", 488 | " beans,\n", 489 | " are,\n", 490 | " not,\n", 491 | " an,\n", 492 | " ideal,\n", 493 | " food,\n", 494 | " for,\n", 495 | " young,\n", 496 | " babies,\n", 497 | " .]" 498 | ] 499 | }, 500 | "execution_count": 27, 501 | "metadata": {}, 502 | "output_type": "execute_result" 503 | } 504 | ], 505 | "source": [ 506 | "train_examples[0]" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 82, 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "data": { 516 | "text/plain": [ 517 | "[7,\n", 518 | " 40,\n", 519 | " 1489,\n", 520 | " 71,\n", 521 | " 48,\n", 522 | " 10,\n", 523 | " 15,\n", 524 | " 6075,\n", 525 | " 159,\n", 526 | " 1158,\n", 527 | " 54,\n", 528 | " 12,\n", 529 | " 19,\n", 530 | " 8,\n", 531 | " 24,\n", 532 | " 1187,\n", 533 | " 1,\n", 534 | " 69,\n", 535 | " 44,\n", 536 | " 661,\n", 537 | " 6076,\n", 538 | " 54,\n", 539 | " 1158,\n", 540 | " 3346,\n", 541 | " 1,\n", 542 | " 4895,\n", 543 | " 387,\n", 544 | " 22,\n", 545 | " 23,\n", 546 | " 76,\n", 547 | " 1927,\n", 548 | " 65,\n", 549 | " 13,\n", 550 | " 1667,\n", 551 | " 1928,\n", 552 | " 1]" 553 | ] 554 | }, 555 | "execution_count": 82, 556 | "metadata": {}, 557 | "output_type": "execute_result" 558 | } 559 | ], 560 | "source": [ 561 | "sequences[0]" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 49, 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [ 570 | "MAX_LEN = 50\n", 571 | "\n", 572 | "X = pad_sequences(sequences, MAX_LEN, padding=\"post\")\n", 573 | "\n", 574 | "X = np.array(X)\n", 575 | "y = np.array(labels)" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 68, 581 | "metadata": {}, 582 | "outputs": [], 583 | "source": [ 584 | "y = y.reshape(y.shape[0] , 1)" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 69, 590 | "metadata": {}, 591 | "outputs": [ 592 | { 593 | "data": { 594 | "text/plain": [ 595 | "(3999, 1)" 596 | ] 597 | }, 598 | "execution_count": 69, 599 | "metadata": {}, 600 | "output_type": "execute_result" 601 | } 602 | ], 603 | "source": [ 604 | "y.shape" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 40, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "from tensorflow.keras.models import Model\n", 614 | "from tensorflow.keras.layers import Input, LSTM, Dense, Embedding\n", 615 | "from tensorflow.keras import optimizers" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 88, 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "sentence_input = Input(shape=(None,))\n", 625 | "embedding = Embedding(input_dim = len(tokenizer.word_index)+1, output_dim = 100)(sentence_input)\n", 626 | "LSTM_layer = LSTM(units=256)(embedding)\n", 627 | "output_dense = Dense(1, activation='sigmoid')(LSTM_layer)" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 89, 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "model = Model(inputs=[sentence_input],outputs=[output_dense])" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 90, 642 | "metadata": {}, 643 | "outputs": [ 644 | { 645 | "name": "stdout", 646 | "output_type": "stream", 647 | "text": [ 648 | "Model: \"model_8\"\n", 649 | "_________________________________________________________________\n", 650 | "Layer (type) Output Shape Param # \n", 651 | 
"=================================================================\n", 652 | "input_8 (InputLayer) [(None, None)] 0 \n", 653 | "_________________________________________________________________\n", 654 | "embedding_7 (Embedding) (None, None, 100) 1623900 \n", 655 | "_________________________________________________________________\n", 656 | "lstm_7 (LSTM) (None, 256) 365568 \n", 657 | "_________________________________________________________________\n", 658 | "dense_7 (Dense) (None, 1) 257 \n", 659 | "=================================================================\n", 660 | "Total params: 1,989,725\n", 661 | "Trainable params: 1,989,725\n", 662 | "Non-trainable params: 0\n", 663 | "_________________________________________________________________\n" 664 | ] 665 | } 666 | ], 667 | "source": [ 668 | "model.summary()" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 91, 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [ 677 | "model.compile(optimizer=\"adam\", loss=\"binary_crossentropy\", metrics=[\"accuracy\"])\n" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": 92, 683 | "metadata": {}, 684 | "outputs": [ 685 | { 686 | "name": "stdout", 687 | "output_type": "stream", 688 | "text": [ 689 | "Epoch 1/5\n", 690 | "50/50 [==============================] - 8s 164ms/step - loss: 0.5400 - accuracy: 0.7706 - val_loss: 0.5304 - val_accuracy: 0.7563\n", 691 | "Epoch 2/5\n", 692 | "50/50 [==============================] - 8s 154ms/step - loss: 0.4032 - accuracy: 0.8224 - val_loss: 0.4743 - val_accuracy: 0.7875\n", 693 | "Epoch 3/5\n", 694 | "50/50 [==============================] - 8s 153ms/step - loss: 0.2242 - accuracy: 0.9159 - val_loss: 0.4577 - val_accuracy: 0.7975\n", 695 | "Epoch 4/5\n", 696 | "50/50 [==============================] - 8s 158ms/step - loss: 0.1400 - accuracy: 0.9578 - val_loss: 0.6093 - val_accuracy: 0.7912\n", 697 | "Epoch 5/5\n", 698 | "50/50 [==============================] - 8s 154ms/step - loss: 0.0603 - accuracy: 0.9812 - val_loss: 0.6885 - val_accuracy: 0.7950\n" 699 | ] 700 | }, 701 | { 702 | "data": { 703 | "text/plain": [ 704 | "" 705 | ] 706 | }, 707 | "execution_count": 92, 708 | "metadata": {}, 709 | "output_type": "execute_result" 710 | } 711 | ], 712 | "source": [ 713 | "model.fit(x=X,\n", 714 | " y=y,\n", 715 | " batch_size=64,\n", 716 | " epochs=5,\n", 717 | " validation_split=0.2)" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "metadata": {}, 724 | "outputs": [], 725 | "source": [] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": null, 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": null, 737 | "metadata": {}, 738 | "outputs": [], 739 | "source": [] 740 | } 741 | ], 742 | "metadata": { 743 | "kernelspec": { 744 | "display_name": "Python 3", 745 | "language": "python", 746 | "name": "python3" 747 | }, 748 | "language_info": { 749 | "codemirror_mode": { 750 | "name": "ipython", 751 | "version": 3 752 | }, 753 | "file_extension": ".py", 754 | "mimetype": "text/x-python", 755 | "name": "python", 756 | "nbconvert_exporter": "python", 757 | "pygments_lexer": "ipython3", 758 | "version": "3.6.9" 759 | } 760 | }, 761 | "nbformat": 4, 762 | "nbformat_minor": 2 763 | } 764 | -------------------------------------------------------------------------------- /Chapter08/Train_Food_reviews.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import matplotlib.pyplot as plt\n", 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 9, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "(3999, 10)" 23 | ] 24 | }, 25 | "execution_count": 9, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "reviews_df=pd.read_csv('data/Reviews.csv')\n", 32 | "reviews_df.shape" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 8, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/html": [ 43 | "
\n", 44 | "\n", 57 | "\n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | "
IdProductIdUserIdProfileNameHelpfulnessNumeratorHelpfulnessDenominatorScoreTimeSummaryText
050057B000ER5DFQA1ESDLEDR9Y0JXA. Spencer1211310256000the garbanzo beans in it give horrible gasTo be fair only one of my twins got gas from t...
1366917B001AIQP8MA324KM3YY1DWQGdanitrice0051251072000Yummy Lil' Treasures!!Just recieved our first order of these (they d...
2214380B001E5E1XWA3QCWO53N69HW3M. A. Vaughan \"-_-GOBNOGO-_-\"2251276473600Great ChaiThis is a fantastic Chai Masala. I am very pic...
3178476B000TIZP5IAYZ5NG9705AG1Consumer0051341360000Celtic Salt worth extra priceFlavorful and has added nutrition! You use le...
4542504B000E18CVEA2LMWCJUF5HZ4ZMiki Lam \"mikilam\"81131222732800mixed feelingsI thought this soup tasted good. I liked the t...
\n", 141 | "
" 142 | ], 143 | "text/plain": [ 144 | " Id ProductId UserId ProfileName \\\n", 145 | "0 50057 B000ER5DFQ A1ESDLEDR9Y0JX A. Spencer \n", 146 | "1 366917 B001AIQP8M A324KM3YY1DWQG danitrice \n", 147 | "2 214380 B001E5E1XW A3QCWO53N69HW3 M. A. Vaughan \"-_-GOBNOGO-_-\" \n", 148 | "3 178476 B000TIZP5I AYZ5NG9705AG1 Consumer \n", 149 | "4 542504 B000E18CVE A2LMWCJUF5HZ4Z Miki Lam \"mikilam\" \n", 150 | "\n", 151 | " HelpfulnessNumerator HelpfulnessDenominator Score Time \\\n", 152 | "0 1 2 1 1310256000 \n", 153 | "1 0 0 5 1251072000 \n", 154 | "2 2 2 5 1276473600 \n", 155 | "3 0 0 5 1341360000 \n", 156 | "4 8 11 3 1222732800 \n", 157 | "\n", 158 | " Summary \\\n", 159 | "0 the garbanzo beans in it give horrible gas \n", 160 | "1 Yummy Lil' Treasures!! \n", 161 | "2 Great Chai \n", 162 | "3 Celtic Salt worth extra price \n", 163 | "4 mixed feelings \n", 164 | "\n", 165 | " Text \n", 166 | "0 To be fair only one of my twins got gas from t... \n", 167 | "1 Just recieved our first order of these (they d... \n", 168 | "2 This is a fantastic Chai Masala. I am very pic... \n", 169 | "3 Flavorful and has added nutrition! You use le... \n", 170 | "4 I thought this soup tasted good. I liked the t... " 171 | ] 172 | }, 173 | "execution_count": 8, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "reviews_df.head()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 10, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "reviews_df = reviews_df[['Text','Score']].dropna()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 11, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD4CAYAAADlwTGnAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAADolJREFUeJzt3X+o3fV9x/Hny/iDdS014l1wSdpIyTZStkV7FwXLsJRqdGOxsInZqEG63v4RmWX9J+0Y9geCf6wVBCekM61Cq7i1YlZCbSauxQ1tbqyo0Yl3VjEhxtsp2s5iiX3vj/PNchpvcs/9kXNu+DwfcDjf8/5+vt/z/n6V+8r3172pKiRJ7Tlt1A1IkkbDAJCkRhkAktQoA0CSGmUASFKjDABJapQBIEmNMgAkqVEGgCQ16vRRN3Ai5557bq1Zs2bUbUjSKWXv3r0/raqx2cYt6QBYs2YNk5OTo25Dkk4pSV4cZJyngCSpUQaAJDXKAJCkRhkAktQoA0CSGjVrACRZneShJE8n2Zfkhq7+hSQHkjzeva7sW+ZzSaaSPJvk8r76xq42lWTbydkkSdIgBrkN9DDw2ap6LMl7gL1Jdnfzbqmqf+gfnGQdcA3wQeC3gX9L8jvd7NuAjwH7gT1JdlbV04uxIZKkuZk1AKrqIHCwm/5ZkmeAlSdYZBNwT1W9BfwkyRSwoZs3VVXPAyS5pxtrAEjSCMzpQbAka4ALgEeBS4Drk1wLTNI7SniNXjg80rfYfo4GxkvH1C+a4TsmgAmA973vfXNpb0Z3/3j/gtexGDZfsGrULUjSrxn4InCSdwPfBj5TVW8AtwMfANbTO0L4ymI0VFXbq2q8qsbHxmZ9klmSNE8DHQEkOYPeD/9vVtV3AKrqUN/8rwHf7T4eAFb3Lb6qq3GCuiRpyAa5CyjAHcAzVfXVvvp5fcM+DjzVTe8ErklyVpLzgbXAj4A9wNok5yc5k96F4p2LsxmSpLka5AjgEuATwJNJHu9qnwc2J1kPFPAC8GmAqtqX5F56F3cPA1ur6m2AJNcDDwDLgB1VtW8Rt0WSNAeD3AX0MJAZZu06wTI3ATfNUN91ouUkScPjk8CS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJapQBIEmNMgAkqVEGgCQ1ygCQpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRs0aAElWJ3koydNJ9iW5oaufk2R3kue69+VdPUluTTKV5IkkF/ata0s3/rkkW07eZkmSZjPIEcBh4LNVtQ64GNiaZB2wDXiwqtYCD3afAa4A1navCeB26AUGcCNwEbABuPFIaEiShm/WAKiqg1X1WDf9M+AZYCWwCbizG3YncFU3vQm4q3oeAc5Och5wObC7ql6tqteA3cDGRd0aSdLA5nQNIMka4ALgUWBFVR3sZr0MrOimVwIv9S22v6sdry5JGoGBAyDJu4FvA5+pqjf651VVAbUYDSWZSDKZZHJ6enoxVilJmsFAAZDkDHo//L9ZVd/pyoe6Uzt076909QPA6r7FV3W149V/TVVtr6rxqhofGxuby7ZIkuZgkLuAAtwBPFNVX+2btRM4cifPFuD+vvq13d1AFwOvd6eKHgAuS7K8u/h7WVeTJI3A6QOMuQT4BPBkkse72ueBm4F7k3wSeBG4upu3C7gSmALeBK4DqKpXk3wZ2NON+1JVvbo
oWyFJmrNZA6CqHgZynNkfnWF8AVuPs64dwI65NChJOjl8EliSGmUASFKjDABJapQBIEmNMgAkqVEGgCQ1ygCQpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJapQBIEmNMgAkqVEGgCQ1ygCQpEYZAJLUKANAkho1awAk2ZHklSRP9dW+kORAkse715V98z6XZCrJs0ku76tv7GpTSbYt/qZIkuZikCOAbwAbZ6jfUlXru9cugCTrgGuAD3bL/GOSZUmWAbcBVwDrgM3dWEnSiJw+24Cq+mGSNQOubxNwT1W9BfwkyRSwoZs3VVXPAyS5pxv79Jw7liQtioVcA7g+yRPdKaLlXW0l8FLfmP1d7Xj1d0gykWQyyeT09PQC2pMknch8A+B24APAeuAg8JXFaqiqtlfVeFWNj42NLdZqJUnHmPUU0Eyq6tCR6SRfA77bfTwArO4buqqrcYK6JGkE5nUEkOS8vo8fB47cIbQTuCbJWUnOB9YCPwL2AGuTnJ/kTHoXinfOv21J0kLNegSQ5G7gUuDcJPuBG4FLk6wHCngB+DRAVe1Lci+9i7uHga1V9Xa3nuuBB4BlwI6q2rfoWyNJGtggdwFtnqF8xwnG3wTcNEN9F7BrTt1Jkk4anwSWpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJapQBIEmNMgAkqVEGgCQ1ygCQpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNWrWAEiyI8krSZ7qq52TZHeS57r35V09SW5NMpXkiSQX9i2zpRv/XJItJ2dzJEmDGuQI4BvAxmNq24AHq2ot8GD3GeAKYG33mgBuh15gADcCFwEbgBuPhIYkaTRmDYCq+iHw6jHlTcCd3fSdwFV99buq5xHg7CTnAZcDu6vq1ap6DdjNO0NFkjRE870GsKKqDnbTLwMruumVwEt94/Z3tePVJUkjsuCLwFVVQC1CLwAkmUgymWRyenp6sVYrSTrGfAPgUHdqh+79la5+AFjdN25VVzte/R2qantVjVfV+NjY2DzbkyTNZr4BsBM4cifPFuD+vvq13d1AFwOvd6eKHgAuS7K8u/h7WVeTJI3I6bMNSHI3cClwbpL99O7muRm4N8kngReBq7vhu4ArgSngTeA6gKp6NcmXgT3duC9V1bEXliVJQzRrAFTV5uPM+ugMYwvYepz17AB2zKk7SdJJ45PAktQoA0CSGmUASFKjDABJapQBIEmNMgAkqVEGgCQ1ygCQpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJatTpo25Aw3P3j/ePugUANl+watQtSMIjAElqlgEgSY0yACSpUQaAJDVqQQGQ5IUkTyZ5PMlkVzsnye4kz3Xvy7t6ktyaZCrJE0kuXIwNkCTNz2IcAXykqtZX1Xj3eRvwYFWtBR7sPgNcAaztXhPA7Yvw3ZKkeToZp4A2AXd203cCV/XV76qeR4Czk5x3Er5fkjSAhQZAAd9PsjfJRFdbUVUHu+mXgRXd9Ergpb5l93e1X5NkIslkksnp6ekFtidJOp6FPgj24ao6kOS3gN1J/qt/ZlVVkprLCqtqO7AdYHx8fE7LSpIGt6AjgKo60L2/AtwHbAAOHTm1072/0g0/AKzuW3xVV5MkjcC8AyDJbyZ5z5Fp4DLgKWAnsKUbtgW4v5veCVzb3Q10MfB636kiSdKQLeQU0ArgviRH1vOtqvpekj3AvUk+CbwIXN2N3wVcCUwBbwLXLeC7JUkLNO8AqKrngT+cof4/wEdnqBewdb7fJ0laXD4JLEmNMgAkqVEGgCQ1ygCQpEb5F8HUJP86muQRgCQ1ywCQpEYZAJLUKK8BSI3zeki7PAKQpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1Ch/FYQkdVr7tRgeAUhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUUMPgCQbkzybZCrJtmF/vySpZ6gBkGQZcBtwBbAO2Jxk3TB7kCT1DPsIYAMwVVXPV9UvgXuATUPuQZIEpKqG92XJnwMbq+qvu8+fAC6qquv7xkwAE93H3wWeHVqDx3cu8NNRN7FEuC+Ocl8c5b44ainsi/dX1dhsg5bcH4Spqu3A9lH30S/JZFWNj7qPpcB9cZT74ij3xVGn0r4Y9imgA8Dqvs+rupokaciGHQB7gLVJzk9yJnANsHPIPUiSGPIpoKo6nOR64AFgGbCjqvYNs4d5WlKnpEbMfXGU++Io98VRp8y+GOpFYEnS0uGTwJLUKANAkhplAEhSowwADSzJXaPuQUtDkg1J/qibXpfkb5NcOeq+NDdL7kGwpSbJh+n9Counqur7o+5nWJIce3tugI8kORugqv5s+F1pKUhyI73f53V6kt3ARcBDwLYkF1TVTSNtcMiS/B6wEni0qn7eV99YVd8bXWez8y6gYyT5UVVt6KY/BWwF7gMuA/61qm4eZX/DkuQx4Gngn4CiFwB303t2g6r6wei6W1qSXFdVXx91H8OS5ElgPXAW8DKwqqreSPIb9H4I/sFIGxyiJH9D72fEM/T2yQ1VdX8377GqunCU/c3GU0DvdEbf9ATwsar6Ir0A+KvRtDQS48Be4O+A16vq34FfVNUP/OH/Dl8cdQNDdriq3q6qN4H/rqo3AKrqF8CvRtva0H0K+FBVXQVcCvx9khu6eRlZVwPyFNA7nZZkOb1wTFVNA1TV/yY5PNrWhqeqfgXckuSfu/dDNPz/S5InjjcLWDHMXpaAXyZ5VxcAHzpSTPJe2guA046c9qmqF5JcCvxLkvdjAJyS3kvvX74BKsl5VXUwybs5Bf6DLraq2g/8RZI/Ad4YdT8jtAK4HHjtmHqA/xx+OyP1x1X1Fvz/PxSOOAPYMpqWRuZQkvVV9ThAVf08yZ8CO4DfH21rs/MawICSvAtYUVU/GXUvGr4kdwBfr6qHZ5j3rar6yxG0pRFLsoreKbGXZ5h3SVX9xwjaGpgBIEmN8iKwJDXKAJCkRhkAktQoA0CSGvV/ge68SbzLI/kAAAAASUVORK5CYII=\n", 199 | "text/plain": [ 200 | "
" 201 | ] 202 | }, 203 | "metadata": { 204 | "needs_background": "light" 205 | }, 206 | "output_type": "display_data" 207 | } 208 | ], 209 | "source": [ 210 | "ax=reviews_df.Score.value_counts().plot(kind='bar', colormap='Paired')\n", 211 | "plt.show()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 12, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "name": "stderr", 221 | "output_type": "stream", 222 | "text": [ 223 | "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 224 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 225 | "\n", 226 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 227 | " \"\"\"Entry point for launching an IPython kernel.\n", 228 | "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n", 229 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 230 | "\n", 231 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 232 | " \n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "reviews_df.Score[reviews_df.Score<=3]=0\n", 238 | "reviews_df.Score[reviews_df.Score>=4]=1" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 15, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD4CAYAAADlwTGnAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAADpxJREFUeJzt3X+o3Xd9x/Hny8S6MWWN612ISbYEvUPSP0zlEjvcH85im/afVNgkGWgohfhHCgr+seo/9ccKDqYFQQuRZsbhmoWpNJSwLmYOkVGbW81i067rXbUkITZXU6si65b63h/3EzyLN7nnJjfntPk8H3A43+/78/l+z/sLl77y/XFOU1VIkvrzmnE3IEkaDwNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1Knl427gYq677rpat27duNuQpFeVxx9//MdVNbHQvFd0AKxbt47p6elxtyFJrypJnhtmnpeAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ16RX8R7NXiwe+dGHcLV5VtN6wZdwtSFzwDkKROGQCS1KkFAyDJbyV5LMm/JzmW5BOtvj7Jd5LMJPmHJNe0+uva+kwbXzewr4+2+tNJbrlSByVJWtgwZwAvAe+uqrcBG4HNSW4E/hq4r6reArwA3Nnm3wm80Or3tXkk2QBsBa4HNgNfSLJsKQ9GkjS8BQOg5vyirb62vQp4N/CPrb4HuL0tb2nrtPGbkqTV91bVS1X1A2AG2LQkRyFJWrSh7gEkWZbkCHAaOAj8F/DTqjrbppwAVrfl1cBxgDb+IvB7g/V5thn8rB1JppNMz87OLv6IJElDGSoAqurlqtoIrGHuX+1vvVINVdWuqpqqqqmJiQX/fwaSpEu0qKeAquqnwDeBPwauTXLuewRrgJNt+SSwFqCN/y7wk8H6PNtIkkZsmKeAJpJc25Z/G3gP8BRzQfBnbdp24KG2vL+t08b/paqq1be2p4TWA5PAY0t1IJKkxRnmm8CrgD3tiZ3XAPuq6uEkTwJ7k/wV8D3ggTb/AeDvkswAZ5h78oeqOpZkH/AkcBbYWVUvL+3hSJKGtWAAVNVR4IZ56s8yz1M8VfXfwJ9fYF/3Avcuvk1J0lLzm8CS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1KkFAyDJ2iTfTPJkkmNJPtTqH09yMsmR9rptYJuPJplJ8nSSWwbqm1ttJsndV+aQJEnDWD7EnLPAR6rqu0neADye5GAbu6+q/mZwcpINwFbgeuBNwDeS/FEb/jzwHuAEcDjJ/qp6cikORJK0OAsGQFWdAk615Z8neQpYfZFNtgB7q+ol4AdJZoBNbWymqp4FSLK3zTUAJGkMFnUPIMk64AbgO610V5KjSXYnWdFqq4HjA5udaLUL1c//jB1JppNMz87OLqY9SdIiDB0ASV4PfBX4cFX9DLgfeDOwkbkzhM8sRUNVtauqpqpqamJiYil2KUmaxzD3AEjyWub+4/+VqvoaQFU9PzD+ReDhtnoSWDuw+ZpW4yJ1SdKIDfMUUIAHgKeq6rMD9VUD094LPNGW9wNbk7wuyXpgEngMOAxMJlmf5BrmbhTvX5rDkCQt1jBnAO8E3g98P8mRVvsYsC3JRqCAHwIfBKiqY0n2MXdz9yyws6peBkhyF/AIsAzYXVXHlvBYJEmLMMxTQN8GMs/QgYtscy9w7zz1AxfbTpI0On4TWJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1KkFAyDJ2iTfTPJkkmNJPtTqb0x
yMMkz7X1FqyfJ55LMJDma5O0D+9re5j+TZPuVOyxJ0kKGOQM4C3ykqjYANwI7k2wA7gYOVdUkcKitA9wKTLbXDuB+mAsM4B7gHcAm4J5zoSFJGr0FA6CqTlXVd9vyz4GngNXAFmBPm7YHuL0tbwG+XHMeBa5Nsgq4BThYVWeq6gXgILB5SY9GkjS0Rd0DSLIOuAH4DrCyqk61oR8BK9vyauD4wGYnWu1C9fM/Y0eS6STTs7Ozi2lPkrQIQwdAktcDXwU+XFU/GxyrqgJqKRqqql1VNVVVUxMTE0uxS0nSPIYKgCSvZe4//l+pqq+18vPt0g7t/XSrnwTWDmy+ptUuVJckjcEwTwEFeAB4qqo+OzC0Hzj3JM924KGB+gfa00A3Ai+2S0WPADcnWdFu/t7capKkMVg+xJx3Au8Hvp/kSKt9DPg0sC/JncBzwPva2AHgNmAG+CVwB0BVnUnyKeBwm/fJqjqzJEchSVq0BQOgqr4N5ALDN80zv4CdF9jXbmD3YhqUJF0ZfhNYkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjq1YAAk2Z3kdJInBmofT3IyyZH2um1g7KNJZpI8neSWgfrmVptJcvfSH4okaTGGOQP4ErB5nvp9VbWxvQ4AJNkAbAWub9t8IcmyJMuAzwO3AhuAbW2uJGlMli80oaq+lWTdkPvbAuytqpeAHySZATa1sZmqehYgyd4298lFdyxJWhKXcw/griRH2yWiFa22Gjg+MOdEq12o/huS7EgynWR6dnb2MtqTJF3MpQbA/cCbgY3AKeAzS9VQVe2qqqmqmpqYmFiq3UqSzrPgJaD5VNXz55aTfBF4uK2eBNYOTF3TalykLkkag0s6A0iyamD1vcC5J4T2A1uTvC7JemASeAw4DEwmWZ/kGuZuFO+/9LYlSZdrwTOAJA8C7wKuS3ICuAd4V5KNQAE/BD4IUFXHkuxj7ubuWWBnVb3c9nMX8AiwDNhdVceW/GgkSUMb5imgbfOUH7jI/HuBe+epHwAOLKo7SdIV4zeBJalTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnVowAJLsTnI6yRMDtTcmOZjkmfa+otWT5HNJZpIcTfL2gW22t/nPJNl+ZQ5HkjSsYc4AvgRsPq92N3CoqiaBQ20d4FZgsr12APfDXGAA9wDvADYB95wLDUnSeCwYAFX1LeDMeeUtwJ62vAe4faD+5ZrzKHBtklXALcDBqjpTVS8AB/nNUJEkjdCl3gNYWVWn2vKPgJVteTVwfGDeiVa7UP03JNmRZDrJ9Ozs7CW2J0layGXfBK6qAmoJejm3v11VNVVVUxMTE0u1W0nSeS41AJ5vl3Zo76db/SSwdmDemla7UF2SNCaXGgD7gXNP8mwHHhqof6A9DXQj8GK7VPQIcHOSFe3m782tJkkak+ULTUjyIPAu4LokJ5h7mufTwL4kdwLPAe9r0w8AtwEzwC+BOwCq6kySTwGH27xPVtX5N5YlSSO0YABU1bYLDN00z9wCdl5gP7uB3YvqTpJ0xSwYAJJe3R783olxt3DV2HbDmnG3sKT8KQhJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnLisAkvwwyfeTHEky3WpvTHIwyTPtfUWrJ8nnkswkOZrk7UtxAJKkS7MUZwB/WlUbq2qqrd8NHKqqSeBQWwe4FZhsrx3A/Uvw2ZKkS3QlLgFtAfa05T3A7QP1L9ecR4Frk6y6Ap8vSRrC5QZAAf+c5PEkO1ptZVWdass/Ala25dXA8YFtT7Ta/5NkR5LpJNOzs7OX2Z4k6UKWX+b2f1JVJ5P8PnAwyX8MDlZVJanF7LCqdgG7AKampha1rSRpeJd1BlBVJ9v7aeDrwCbg+XOXdtr76Tb9JLB2YPM1rSZJGoNLDoAkv5PkDeeWgZuBJ4D9wPY2bTvwUFveD3ygPQ10I/DiwKUiSdKIXc4loJXA15Oc28/fV9U/JTkM7EtyJ/Ac8L42/wBwGzAD/BK44zI+W5J0mS45AKrqWeBt89R/Atw0T72AnZf6eZKkpeU3gSWpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0aeQAk2Zzk6SQzSe4e9edLkuaMNACSLAM+D9wKbAC2Jdkwyh4kSXNGfQawCZipqmer6n+AvcCWEfcgSQKWj/jzVgPHB9ZPAO8YnJBkB7Cjrf4iydMj6q0H1wE/HncTC/mLcTegcXnF/32+iv42/3CYSaMOgAVV1S5g17j7uBolma6qqXH3Ic3Hv8/RG/UloJPA2oH1Na0mSRqxUQfAYWAyyfok1wBbgf0j7kGSxIgvAVXV2SR3AY8Ay4DdVXVslD10zktreiXz73PEUlXj7kGSNAZ+E1iSOmUASFKnDABJ6tQr7nsAkq5+Sd7K3K8ArG6lk8D+qnpqfF31xzMASSOV5C+Z+xmYAI+1V4AH/YHI0fIpoA4luaOq/nbcfahPSf4TuL6q/ve8+jXAsaqaHE9n/fEMoE+fGHcD6tqvgDfNU1/VxjQi3gO4SiU5eqEhYOUoe5HO82HgUJJn+PWPQ/4B8BbgrrF11SEvAV2lkjwP3AK8cP4Q8G9VNd+/wKSRSPIa5n4efvAm8OGqenl8XfXHM4Cr18PA66vqyPkDSf519O1Iv1ZVvwIeHXcfvfMMQJI65U1gSeqUASBJnTIAJKlTBoAkder/APcMnNmwIY/4AAAAAElFTkSuQmCC\n", 249 | "text/plain": [ 250 | "
" 251 | ] 252 | }, 253 | "metadata": { 254 | "needs_background": "light" 255 | }, 256 | "output_type": "display_data" 257 | } 258 | ], 259 | "source": [ 260 | "ax=reviews_df.Score.value_counts().plot(kind='bar', colormap='Paired')\n", 261 | "plt.show()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 13, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/html": [ 272 | "
\n", 273 | "\n", 286 | "\n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | "
TextScore
0To be fair only one of my twins got gas from t...0
1Just recieved our first order of these (they d...1
2This is a fantastic Chai Masala. I am very pic...1
3Flavorful and has added nutrition! You use le...1
4I thought this soup tasted good. I liked the t...0
\n", 322 | "
" 323 | ], 324 | "text/plain": [ 325 | " Text Score\n", 326 | "0 To be fair only one of my twins got gas from t... 0\n", 327 | "1 Just recieved our first order of these (they d... 1\n", 328 | "2 This is a fantastic Chai Masala. I am very pic... 1\n", 329 | "3 Flavorful and has added nutrition! You use le... 1\n", 330 | "4 I thought this soup tasted good. I liked the t... 0" 331 | ] 332 | }, 333 | "execution_count": 13, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "reviews_df.head()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 23, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "import spacy\n", 349 | "import random\n", 350 | "from spacy.training import Example \n", 351 | "from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL " 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 25, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "nlp = spacy.load(\"en_core_web_md\") \n", 361 | "\n", 362 | "config = { \n", 363 | "\n", 364 | " \"threshold\": 0.5, \n", 365 | "\n", 366 | " \"model\": DEFAULT_MULTI_TEXTCAT_MODEL \n", 367 | "\n", 368 | "} \n", 369 | "\n", 370 | "textcat = nlp.add_pipe(\"textcat_multilabel\", config=config) " 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 26, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "train_examples = []\n", 380 | "\n", 381 | "for index, row in reviews_df.iterrows():\n", 382 | " text = row[\"Text\"]\n", 383 | " rating = row[\"Score\"]\n", 384 | " label = {\"POS\": True, \"NEG\": False} if rating == 1 else {\"NEG\": True, \"POS\": False}\n", 385 | " train_examples.append(Example.from_dict(nlp.make_doc(text), {\"cats\": label}))\n", 386 | " " 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 28, 392 | "metadata": {}, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/plain": [ 397 | "{'doc_annotation': {'cats': {'NEG': True, 'POS': False}, 'entities': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'links': {}}, 'token_annotation': {'ORTH': ['To', 'be', 'fair', 'only', 'one', 'of', 'my', 'twins', 'got', 'gas', 'from', 'this', 'but', 'it', 'was', 'horrible', '.', 'Up', 'all', 'night', 'screaming', 'from', 'gas', 'pains', '.', 'Garbanzo', 'beans', 'are', 'not', 'an', 'ideal', 'food', 'for', 'young', 'babies', '.'], 'SPACY': [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, False, False], 'TAG': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'LEMMA': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'POS': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'MORPH': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'HEAD': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 
35], 'DEP': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'SENT_START': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}}" 398 | ] 399 | }, 400 | "execution_count": 28, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "train_examples[0]" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 29, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "textcat.add_label(\"POS\")\n", 416 | "textcat.add_label(\"NEG\")\n", 417 | "textcat.initialize(lambda: train_examples, nlp=nlp) " 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 30, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "epochs = 2\n", 427 | "\n", 428 | "with nlp.select_pipes(enable=\"textcat_multilabel\"): \n", 429 | "\n", 430 | " optimizer = nlp.resume_training() \n", 431 | "\n", 432 | " for i in range(epochs): \n", 433 | "\n", 434 | " random.shuffle(train_examples) \n", 435 | "\n", 436 | " for example in train_examples: \n", 437 | "\n", 438 | " nlp.update([example], sgd=optimizer) " 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 31, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "doc2 = nlp(\"This is the best food I ever ate\")" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 32, 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "data": { 457 | "text/plain": [ 458 | "{'POS': 0.9553419947624207, 'NEG': 0.061326123774051666}" 459 | ] 460 | }, 461 | "execution_count": 32, 462 | "metadata": {}, 463 | "output_type": "execute_result" 464 | } 465 | ], 466 | "source": [ 467 | "doc2.cats" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 33, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "doc3 = nlp(\"This food is so bad\")" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 34, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "data": { 486 | "text/plain": [ 487 | "{'POS': 0.21204468607902527, 'NEG': 0.8010350465774536}" 488 | ] 489 | }, 490 | "execution_count": 34, 491 | "metadata": {}, 492 | "output_type": "execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "doc3.cats" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 35, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": null, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [] 519 | } 520 | ], 521 | "metadata": { 522 | "kernelspec": { 523 | "display_name": "Python 3", 524 | "language": "python", 525 | "name": "python3" 526 | }, 527 | "language_info": { 528 | "codemirror_mode": { 529 | "name": "ipython", 530 | "version": 3 531 | }, 532 | "file_extension": ".py", 533 | "mimetype": "text/x-python", 534 | "name": "python", 535 | "nbconvert_exporter": "python", 536 | "pygments_lexer": "ipython3", 537 | "version": "3.6.9" 538 | } 539 | }, 540 | "nbformat": 4, 541 | "nbformat_minor": 2 542 | } 543 | -------------------------------------------------------------------------------- /Chapter08/data/Reviews.zip: -------------------------------------------------------------------------------- 
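The entry below is a raw URL pointing at the zipped review data used by both Chapter08 notebooks (they read data/Reviews.csv). A minimal sketch for fetching and unpacking it, assuming the notebooks are run from the Chapter08 directory and that Reviews.csv sits at the archive root:

# Download the archive from the URL listed below and extract it into data/
# so that data/Reviews.csv resolves for the notebooks (assumed archive layout).
import io, zipfile, urllib.request

URL = ("https://raw.githubusercontent.com/PacktPublishing/Mastering-spaCy/"
       "eea4fab36b519c6d5d0cf86f5c330fdb6d791dd4/Chapter08/data/Reviews.zip")
with urllib.request.urlopen(URL) as resp:
    zipfile.ZipFile(io.BytesIO(resp.read())).extractall("data")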
https://raw.githubusercontent.com/PacktPublishing/Mastering-spaCy/eea4fab36b519c6d5d0cf86f5c330fdb6d791dd4/Chapter08/data/Reviews.zip -------------------------------------------------------------------------------- /Chapter09/bert_vectors.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, TFBertModel 2 | 3 | btokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 4 | bmodel = TFBertModel.from_pretrained("bert-base-uncased") 5 | 6 | sentence = "He was idle." 7 | 8 | encoded = btokenizer.encode_plus( 9 | text=sentence, 10 | add_special_tokens=True, 11 | max_length=10, 12 | pad_to_max_length=True, 13 | return_attention_mask=True, 14 | return_tensors="tf" 15 | ) 16 | 17 | inputs = encoded["input_ids"] 18 | 19 | outputs = bmodel(inputs) 20 | 21 | 22 | 23 | 24 | print(outputs[0].shape) 25 | print(outputs[1].shape) 26 | -------------------------------------------------------------------------------- /Chapter09/data/spam.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Mastering-spaCy/eea4fab36b519c6d5d0cf86f5c330fdb6d791dd4/Chapter09/data/spam.csv -------------------------------------------------------------------------------- /Chapter09/tokenizer_basic.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer 2 | 3 | btokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 4 | sentence = "He lived characteristically idle and romantic." 5 | sentence = "[CLS] " + sentence + " [SEP]" 6 | tokens = btokenizer.tokenize(sentence) 7 | 8 | print(tokens) 9 | 10 | ids = btokenizer.convert_tokens_to_ids(tokens) 11 | print(ids) 12 | 13 | -------------------------------------------------------------------------------- /Chapter09/tokenizer_encode.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer 2 | 3 | btokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 4 | sentence = "He lived characteristically idle and romantic." 5 | ids = btokenizer.encode(sentence) 6 | print(ids) 7 | 8 | -------------------------------------------------------------------------------- /Chapter09/tokenizer_encode_plus.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer 2 | 3 | btokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 4 | sentence = "He lived characteristically idle and romantic." 5 | 6 | encoded = btokenizer.encode_plus( 7 | text=sentence, 8 | add_special_tokens=True, 9 | max_length=12, 10 | pad_to_max_length=True, 11 | return_tensors="tf" 12 | ) 13 | 14 | token_ids = encoded["input_ids"] 15 | 16 | print(token_ids) 17 | -------------------------------------------------------------------------------- /Chapter09/transformer_pipe.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | 3 | nlp = pipeline("sentiment-analysis") 4 | 5 | 6 | sent1 = "I hate you so much right now." 7 | sent2 = "I love fresh air and exercising." 
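# Each nlp(...) call below returns a list with one dict per input, of the form
# [{"label": "POSITIVE" or "NEGATIVE", "score": <float>}]; the exact scores
# depend on the default sentiment model the pipeline downloads at first use.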
8 | 9 | result1 = nlp(sent1) 10 | result2 = nlp(sent2) 11 | 12 | 13 | print(result1) 14 | print(result2) 15 | -------------------------------------------------------------------------------- /Chapter09/transformer_pipe_que.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | 3 | nlp = pipeline("question-answering") 4 | 5 | res = nlp({ 6 | 'question': 'What is the name of this book ?', 7 | 'context': "I'll publish my new book Mastering spaCy soon." 8 | }) 9 | 10 | print(res) 11 | -------------------------------------------------------------------------------- /Chapter10/Intent-classifier-char-LSTM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from tensorflow.keras.preprocessing.text import Tokenizer\n", 10 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 11 | "from tensorflow.keras.models import Model\n", 12 | "from tensorflow.keras.layers import LSTM,Dense, Conv1D, MaxPooling1D, Bidirectional, Dropout, Input, Embedding\n", 13 | "from tensorflow.keras import optimizers\n", 14 | "import numpy as np\n", 15 | "from tensorflow.keras.callbacks import TensorBoard\n", 16 | "from sklearn.utils import shuffle\n", 17 | "import json" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 8, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "utterances = []\n", 27 | "labels = []\n", 28 | "\n", 29 | "with open(\"data/restaurants.json\", \"r\") as jfile:\n", 30 | " data = json.load(jfile)\n", 31 | " \n", 32 | " for dialogue in data:\n", 33 | " turns = dialogue[\"turns\"]\n", 34 | " for turn in turns:\n", 35 | " speaker = turn[\"speaker\"]\n", 36 | " if speaker == \"USER\":\n", 37 | " utterance, intent = turn[\"utterance\"], turn[\"intent\"]\n", 38 | " label = 1 if intent == \"FindRestaurants\" else 0\n", 39 | " utterances.append(utterance)\n", 40 | " labels.append(label)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 9, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/plain": [ 51 | "['I am feeling hungry so I would like to find a place to eat.',\n", 52 | " 'I would like for it to be in San Jose.',\n", 53 | " 'I usually like eating the American type of food.',\n", 54 | " 'Can you give me the address of this restaurant.',\n", 55 | " 'Can you give me the phone number that I can contact them with?',\n", 56 | " 'Is there some other restaurant which you can suggest?',\n", 57 | " 'Do you have another restaurant matching my needs? For example a restaurant which is economical and is located in Palo Alto.',\n", 58 | " 'Alright, that seems good. I would like to make a booking at this restaurant.',\n", 59 | " 'I will be eating there at 11:30 am so make it for then.',\n", 60 | " 'That suits me well. 
Can you tell me if they feature live music?']" 61 | ] 62 | }, 63 | "execution_count": 9, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "utterances[:10]" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 10, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]" 81 | ] 82 | }, 83 | "execution_count": 10, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "labels[:10]" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 11, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "(1233, 1233)" 101 | ] 102 | }, 103 | "execution_count": 11, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "len(utterances), len(labels)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 15, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 16, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "utterances, labels = shuffle(utterances, labels, random_state=0)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 20, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "tokenizer = Tokenizer(char_level=True,filters=\".,;'\\\"-\", lower=True)\n", 135 | "tokenizer.fit_on_texts(utterances)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 21, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "{' ': 1,\n", 147 | " 'e': 2,\n", 148 | " 'a': 3,\n", 149 | " 't': 4,\n", 150 | " 'o': 5,\n", 151 | " 'n': 6,\n", 152 | " 'i': 7,\n", 153 | " 'r': 8,\n", 154 | " 's': 9,\n", 155 | " 'h': 10,\n", 156 | " 'l': 11,\n", 157 | " 'd': 12,\n", 158 | " 'u': 13,\n", 159 | " '.': 14,\n", 160 | " 'm': 15,\n", 161 | " 'c': 16,\n", 162 | " 'y': 17,\n", 163 | " 'f': 18,\n", 164 | " 'p': 19,\n", 165 | " 'k': 20,\n", 166 | " 'g': 21,\n", 167 | " 'w': 22,\n", 168 | " 'v': 23,\n", 169 | " '?': 24,\n", 170 | " ',': 25,\n", 171 | " 'b': 26,\n", 172 | " \"'\": 27,\n", 173 | " '1': 28,\n", 174 | " ':': 29,\n", 175 | " '0': 30,\n", 176 | " '3': 31,\n", 177 | " '5': 32,\n", 178 | " 'x': 33,\n", 179 | " '4': 34,\n", 180 | " 'q': 35,\n", 181 | " '2': 36,\n", 182 | " '!': 37,\n", 183 | " 'z': 38,\n", 184 | " '7': 39,\n", 185 | " '6': 40,\n", 186 | " 'j': 41,\n", 187 | " '-': 42,\n", 188 | " '8': 43,\n", 189 | " '9': 44,\n", 190 | " '\"': 45,\n", 191 | " '`': 46}" 192 | ] 193 | }, 194 | "execution_count": 21, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "tokenizer.word_index" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 22, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "utterances = tokenizer.texts_to_sequences(utterances)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 23, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "156\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "mutt_len = max([len(ans) for ans in utterances])\n", 227 | "\n", 228 | "print(mutt_len)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 24, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "MAX_LEN = 150" 238 | ] 239 
| }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 25, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "data": { 247 | "text/plain": [ 248 | "[17,\n", 249 | " 2,\n", 250 | " 9,\n", 251 | " 25,\n", 252 | " 1,\n", 253 | " 7,\n", 254 | " 1,\n", 255 | " 22,\n", 256 | " 3,\n", 257 | " 6,\n", 258 | " 4,\n", 259 | " 1,\n", 260 | " 7,\n", 261 | " 4,\n", 262 | " 1,\n", 263 | " 5,\n", 264 | " 6,\n", 265 | " 1,\n", 266 | " 4,\n", 267 | " 10,\n", 268 | " 2,\n", 269 | " 1,\n", 270 | " 28,\n", 271 | " 28,\n", 272 | " 4,\n", 273 | " 10]" 274 | ] 275 | }, 276 | "execution_count": 25, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "utterances[0]" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 26, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "utterances = pad_sequences(utterances, MAX_LEN, padding=\"post\")" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 27, 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "data": { 301 | "text/plain": [ 302 | "array([17, 2, 9, 25, 1, 7, 1, 22, 3, 6, 4, 1, 7, 4, 1, 5, 6,\n", 303 | " 1, 4, 10, 2, 1, 28, 28, 4, 10, 0, 0, 0, 0, 0, 0, 0, 0,\n", 304 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 305 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 306 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 307 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 308 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 309 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 310 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", 311 | " dtype=int32)" 312 | ] 313 | }, 314 | "execution_count": 27, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "utterances[0]" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 28, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "utterances, labels = np.array(utterances), np.array(labels)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 29, 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "data": { 339 | "text/plain": [ 340 | "((1233, 150), (1233,))" 341 | ] 342 | }, 343 | "execution_count": 29, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "utterances.shape, labels.shape" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 30, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "utt_in = Input(shape=(MAX_LEN,))\n", 359 | "\n", 360 | "embedding_layer = Embedding(input_dim = len(tokenizer.word_index)+1, output_dim = 100, input_length=MAX_LEN)\n", 361 | "lstm = Bidirectional(LSTM(units=100, return_sequences=False))\n", 362 | "\n", 363 | "utt_embedding = embedding_layer(utt_in)\n", 364 | "utt_encoded = lstm(utt_embedding)\n", 365 | "\n", 366 | "output = Dense(1, activation='sigmoid')(utt_encoded)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 31, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "model = Model(utt_in, output)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 32, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "model.compile(loss = 'binary_crossentropy', optimizer = \"adam\", metrics=[\"accuracy\"])" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 33, 390 | "metadata": {}, 391 | "outputs": 
[ 392 | { 393 | "name": "stdout", 394 | "output_type": "stream", 395 | "text": [ 396 | "Model: \"model\"\n", 397 | "_________________________________________________________________\n", 398 | "Layer (type) Output Shape Param # \n", 399 | "=================================================================\n", 400 | "input_1 (InputLayer) [(None, 150)] 0 \n", 401 | "_________________________________________________________________\n", 402 | "embedding (Embedding) (None, 150, 100) 4700 \n", 403 | "_________________________________________________________________\n", 404 | "bidirectional (Bidirectional (None, 200) 160800 \n", 405 | "_________________________________________________________________\n", 406 | "classification_layer (Dense) (None, 1) 201 \n", 407 | "=================================================================\n", 408 | "Total params: 165,701\n", 409 | "Trainable params: 165,701\n", 410 | "Non-trainable params: 0\n", 411 | "_________________________________________________________________\n" 412 | ] 413 | } 414 | ], 415 | "source": [ 416 | "model.summary()" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 35, 422 | "metadata": {}, 423 | "outputs": [ 424 | { 425 | "name": "stdout", 426 | "output_type": "stream", 427 | "text": [ 428 | "Epoch 1/10\n", 429 | "18/18 [==============================] - 4s 228ms/step - loss: 0.6782 - accuracy: 0.5654 - val_loss: 0.6419 - val_accuracy: 0.7016\n", 430 | "Epoch 2/10\n", 431 | "18/18 [==============================] - 3s 193ms/step - loss: 0.5948 - accuracy: 0.7160 - val_loss: 0.5413 - val_accuracy: 0.6935\n", 432 | "Epoch 3/10\n", 433 | "18/18 [==============================] - 3s 193ms/step - loss: 0.5163 - accuracy: 0.7520 - val_loss: 0.5143 - val_accuracy: 0.7661\n", 434 | "Epoch 4/10\n", 435 | "18/18 [==============================] - 4s 201ms/step - loss: 0.4497 - accuracy: 0.7962 - val_loss: 0.4036 - val_accuracy: 0.8226\n", 436 | "Epoch 5/10\n", 437 | "18/18 [==============================] - 3s 194ms/step - loss: 0.3802 - accuracy: 0.8512 - val_loss: 0.3389 - val_accuracy: 0.8710\n", 438 | "Epoch 6/10\n", 439 | "18/18 [==============================] - 3s 194ms/step - loss: 0.3342 - accuracy: 0.8747 - val_loss: 0.3117 - val_accuracy: 0.8871\n", 440 | "Epoch 7/10\n", 441 | "18/18 [==============================] - 4s 202ms/step - loss: 0.3090 - accuracy: 0.8765 - val_loss: 0.3222 - val_accuracy: 0.8790\n", 442 | "Epoch 8/10\n", 443 | "18/18 [==============================] - 3s 194ms/step - loss: 0.3511 - accuracy: 0.8638 - val_loss: 0.4440 - val_accuracy: 0.8145\n", 444 | "Epoch 9/10\n", 445 | "18/18 [==============================] - 4s 195ms/step - loss: 0.4497 - accuracy: 0.8088 - val_loss: 0.4392 - val_accuracy: 0.8226\n", 446 | "Epoch 10/10\n", 447 | "18/18 [==============================] - 4s 210ms/step - loss: 0.3869 - accuracy: 0.8431 - val_loss: 0.4058 - val_accuracy: 0.8226\n" 448 | ] 449 | }, 450 | { 451 | "data": { 452 | "text/plain": [ 453 | "" 454 | ] 455 | }, 456 | "execution_count": 35, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | } 460 | ], 461 | "source": [ 462 | "model.fit(utterances, labels, validation_split=0.1, epochs = 10, batch_size = 64)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [] 471 | } 472 | ], 473 | "metadata": { 474 | "kernelspec": { 475 | "display_name": "Python 3", 476 | "language": "python", 477 | "name": "python3" 478 | }, 479 | "language_info": { 
480 | "codemirror_mode": { 481 | "name": "ipython", 482 | "version": 3 483 | }, 484 | "file_extension": ".py", 485 | "mimetype": "text/x-python", 486 | "name": "python", 487 | "nbconvert_exporter": "python", 488 | "pygments_lexer": "ipython3", 489 | "version": "3.6.9" 490 | } 491 | }, 492 | "nbformat": 4, 493 | "nbformat_minor": 2 494 | } 495 | -------------------------------------------------------------------------------- /Chapter10/extract_city_ents.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | 3 | nlp = spacy.load("en_core_web_md") 4 | 5 | 6 | with open("data/utterances.txt", "r") as utterances: 7 | for utterance in utterances: 8 | utterance = utterance.strip() 9 | doc = nlp(utterance) 10 | ents = doc.ents 11 | if ents: 12 | for ent in ents: 13 | if ent.label_ == "GPE": 14 | print(ent.text, "\t", utterance) 15 | -------------------------------------------------------------------------------- /Chapter10/extract_date_times.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | 3 | nlp = spacy.load("en_core_web_md") 4 | 5 | 6 | with open("data/utterances.txt", "r") as utterances: 7 | for utterance in utterances: 8 | utterance = utterance.strip() 9 | doc = nlp(utterance) 10 | ents = doc.ents 11 | if ents: 12 | for ent in ents: 13 | if ent.label_ == "TIME" or ent.label_ == "DATE": 14 | print(ent.text, "\t", utterance) 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Mastering spaCy 5 | 6 | Mastering spaCy 7 | 8 | This is the code repository for [Mastering spaCy](https://www.packtpub.com/product/mastering-spacy/9781800563353), published by Packt. 9 | 10 | **An end-to-end practical guide to implementing NLP applications using the Python ecosystem** 11 | 12 | ## What is this book about? 13 | spaCy is an industrial-grade, efficient NLP Python library. It offers various pre-trained models and ready-to-use features. Mastering spaCy provides you with end-to-end coverage of spaCy's features and real-world applications. 
14 | 15 | This book covers the following exciting features: 16 | * Install spaCy, get started easily, and write your first Python script 17 | * Understand core linguistic operations of spaCy 18 | * Discover how to combine rule-based components with spaCy statistical models 19 | * Become well-versed with named entity and keyword extraction 20 | * Build your own ML pipelines using spaCy 21 | 22 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1800563353) today! 23 | 24 | https://www.packtpub.com/ 26 | 27 | ## Errata 28 | 29 | page 10 30 | 31 | How it looks: word.index(e) 32 | How it should be: word.index("e") 33 | 34 | How it looks: vecs = np.vstack([word.vector for word in vocab if word.has_vector]) 35 | How it should be: vecs = np.vstack([word.vector for word in vocab if word.has_vector]) 36 | 37 | 38 | 39 | ## Instructions and Navigations 40 | All of the code is organized into folders. For example, Chapter02. 41 | 42 | The code will look like the following: 43 | ``` 44 | import spacy 45 | nlp = spacy.load("en_subwords_wiki_lg") 46 | ``` 47 | 48 | **The following is what you need for this book:** 49 | This book is for data scientists and machine learning practitioners who want to excel in NLP, as well as NLP developers who want to master spaCy and build applications with it. Language and speech professionals who want to get hands-on with Python and spaCy, as well as software developers who want to quickly prototype applications with spaCy, will also find this book helpful. Beginner-level knowledge of the Python programming language is required to get the most out of this book. A beginner-level understanding of linguistics such as parsing, POS tags, and semantic similarity will also be useful. 50 | 51 | With the following software and hardware list, you can run all code files present in the book (Chapters 1-10). 52 | 53 | ### Software and Hardware List 54 | 55 | | Chapter | Software required | OS required | 56 | | -------- | ------------------------------------| -----------------------------------| 57 | | 1 | Python>=3.6 | Windows, Mac OS X, and Linux (Any) | 58 | | 2 | spaCy v3.0 | Windows, Mac OS X, and Linux (Any) | 59 | | 3 | TensorFlow 2.0 | Windows, Mac OS X, and Linux (Any) | 60 | | 4 | Transformers | Windows, Mac OS X, and Linux (Any) | 61 | | 5 | scikit-learn | Windows, Mac OS X, and Linux (Any) | 62 | | 6 | pandas | Windows, Mac OS X, and Linux (Any) | 63 | | 7 | NumPy | Windows, Mac OS X, and Linux (Any) | 64 | | 8 | matplotlib | Windows, Mac OS X, and Linux (Any) | 65 | | 9 | Jupyter | Windows, Mac OS X, and Linux (Any) | 66 | 67 | 68 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it]( https://static.packt-cdn.com/downloads/9781800563353_ColorImages.pdf). 69 | 70 | 71 | ### Related products 72 | * Python Natural Language Processing Cookbook [[Packt]](https://www.packtpub.com/product/python-natural-language-processing-cookbook/9781838987312) [[Amazon]](https://www.amazon.com/dp/1838987312) 73 | 74 | * Getting Started with Google BERT [[Packt]](https://www.packtpub.com/product/getting-started-with-google-bert/9781838821593) [[Amazon]](https://www.amazon.com/dp/1838821597) 75 | 76 | ## Get to Know the Author 77 | **Duygu Altınok** 78 | is a senior Natural Language Processing (NLP) engineer with 12 years of experience in almost all areas of NLP, including search engine technology, speech recognition, text analytics, and conversational AI. 
She has published several papers at NLP conferences such as LREC and CLNLP. She also enjoys working on open source projects and is a contributor to the spaCy library. Duygu earned her undergraduate degree in computer engineering from METU, Ankara, in 2010 and her master's degree in mathematics from Bilkent University, Ankara, in 2012. She is currently a senior engineer at German Autolabs with a focus on conversational AI for voice assistants. Originally from Istanbul, Duygu now resides in Berlin, Germany, with her cute dog Adele. 79 | 80 | 81 | ### Download a free PDF 82 | 83 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
84 |

https://packt.link/free-ebook/9781800563353

--------------------------------------------------------------------------------