├── BehindSpacy.jpg ├── Intent Classification With Rasa - Spacy ├── config_spacy.yaml ├── nlu_nlp_explain.png ├── projects │ └── default │ │ └── model_20180602-072117 │ │ ├── entity_synonyms.json │ │ ├── crf_model.pkl │ │ ├── intent_classifier_sklearn.pkl │ │ ├── regex_featurizer.json │ │ ├── metadata.json │ │ └── training_data.json ├── demo-rasa.json ├── rasa_dataset.json └── Intent Classification With Rasa NLU and SpaCy.ipynb ├── SpaCy_logo.png ├── quotesfile.txt ├── textacylogo1.png ├── NLP_in_French ├── SpaCy_logo.png └── BehindSpacy.jpg ├── NLP_with_SpaCy ├── BehindSpacy.jpg ├── SpaCy_logo.png ├── quotesfile.txt ├── imageredacted.jpg ├── samplefile.txt ├── quotesfiles.txt ├── spacy_summarizer.py ├── spacy_pipeline.svg ├── How to detect languages with SpaCy.ipynb ├── NLP with SpaCy- Adding Extensions Attributes in SpaCy(How to use sentiment analysis in SpaCy).ipynb ├── Automatic Redaction & Sanitization of Document Using Spacy NER.ipynb ├── How to Find the Most Common Words Using Spacy.ipynb └── Training the Named Entity Recognizer in SpaCy.ipynb ├── NLP_with_Textacy ├── textacylogo1.png ├── README.md ├── example.txt └── example1.txt ├── samplefile.txt ├── NLP_with_Flair └── text_classification_with_flair_workflow_jcharistech.png ├── NLP with JavaScript ├── index.js ├── NLP-with-JavaScript.md └── index.html ├── quotesfiles.txt ├── NLP-with-JavaScript.md ├── README.md ├── index.html ├── example.txt ├── spacy_pipeline.svg ├── example1.txt ├── NLP with SpaCy- Adding Extensions Attributes in SpaCy(How to use sentiment analysis in SpaCy).ipynb ├── Text Summarization with Sumy Python .ipynb ├── How to Find the Most Common Words Using Spacy.ipynb ├── NLP_with_Polyglot └── NLP with Polyglot .ipynb └── Training the Named Entity Recognizer in SpaCy.ipynb /BehindSpacy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/BehindSpacy.jpg -------------------------------------------------------------------------------- /Intent Classification With Rasa - Spacy/config_spacy.yaml: -------------------------------------------------------------------------------- 1 | language: "en" 2 | 3 | pipeline: "spacy_sklearn" 4 | -------------------------------------------------------------------------------- /SpaCy_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/SpaCy_logo.png -------------------------------------------------------------------------------- /quotesfile.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/quotesfile.txt -------------------------------------------------------------------------------- /textacylogo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/textacylogo1.png -------------------------------------------------------------------------------- /NLP_in_French/SpaCy_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_in_French/SpaCy_logo.png -------------------------------------------------------------------------------- /NLP_in_French/BehindSpacy.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_in_French/BehindSpacy.jpg -------------------------------------------------------------------------------- /NLP_with_SpaCy/BehindSpacy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_with_SpaCy/BehindSpacy.jpg -------------------------------------------------------------------------------- /NLP_with_SpaCy/SpaCy_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_with_SpaCy/SpaCy_logo.png -------------------------------------------------------------------------------- /NLP_with_SpaCy/quotesfile.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_with_SpaCy/quotesfile.txt -------------------------------------------------------------------------------- /NLP_with_SpaCy/imageredacted.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_with_SpaCy/imageredacted.jpg -------------------------------------------------------------------------------- /NLP_with_Textacy/textacylogo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_with_Textacy/textacylogo1.png -------------------------------------------------------------------------------- /Intent Classification With Rasa - Spacy/nlu_nlp_explain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/Intent Classification With Rasa - Spacy/nlu_nlp_explain.png -------------------------------------------------------------------------------- /samplefile.txt: -------------------------------------------------------------------------------- 1 | The best error message is the one that never shows up. 2 | You Learn More From Failure Than From Success. 3 | The purpose of software engineering is to control complexity, not to create it -------------------------------------------------------------------------------- /Intent Classification With Rasa - Spacy/projects/default/model_20180602-072117/entity_synonyms.json: -------------------------------------------------------------------------------- 1 | { 2 | "chinese": "chinese", 3 | "chines": "chinese", 4 | "veggie": "vegetarian", 5 | "vegg": "vegetarian" 6 | } -------------------------------------------------------------------------------- /NLP_with_Flair/text_classification_with_flair_workflow_jcharistech.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_with_Flair/text_classification_with_flair_workflow_jcharistech.png -------------------------------------------------------------------------------- /NLP_with_SpaCy/samplefile.txt: -------------------------------------------------------------------------------- 1 | The best error message is the one that never shows up. 
2 | You Learn More From Failure Than From Success. 3 | The purpose of software engineering is to control complexity, not to create it -------------------------------------------------------------------------------- /Intent Classification With Rasa - Spacy/projects/default/model_20180602-072117/crf_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/Intent Classification With Rasa - Spacy/projects/default/model_20180602-072117/crf_model.pkl -------------------------------------------------------------------------------- /Intent Classification With Rasa - Spacy/projects/default/model_20180602-072117/intent_classifier_sklearn.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/Intent Classification With Rasa - Spacy/projects/default/model_20180602-072117/intent_classifier_sklearn.pkl -------------------------------------------------------------------------------- /Intent Classification With Rasa - Spacy/projects/default/model_20180602-072117/regex_featurizer.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "greet", 4 | "pattern": "hey[^\\s]*" 5 | }, 6 | { 7 | "name": "zipcode", 8 | "pattern": "[0-9]{5}" 9 | } 10 | ] -------------------------------------------------------------------------------- /NLP_with_Textacy/README.md: -------------------------------------------------------------------------------- 1 | ### Natural Language Processing with Textacy & SpaCy 2 | - Textacy is a library built on top of the high performant NLP library spaCy. 3 | - Useful for text-preprocessing 4 | - Topic modelling 5 | - Information Extraction 6 | - Keyterm 7 | - Emotional Valency Analysis 8 | - Many more 9 | -------------------------------------------------------------------------------- /NLP with JavaScript/index.js: -------------------------------------------------------------------------------- 1 | var Sentiment = require('sentiment'); 2 | var sentiment = new Sentiment(); 3 | 4 | var docx = sentiment.analyze("I like apples"); 5 | console.log(docx); 6 | 7 | // Applying to An Array 8 | var mydocx = ["I love apples","I don't eat pepper","the movie was very nice","this book is the best"] 9 | 10 | mydocx.forEach(function(s){ 11 | console.log(sentiment.analyze(s)); 12 | }) 13 | 14 | -------------------------------------------------------------------------------- /quotesfiles.txt: -------------------------------------------------------------------------------- 1 | First, solve the problem. Then, write the code. 2 | Fix the cause, not the symptom. 3 | Simplicity is the soul of efficiency. 4 | Good design adds value faster than it adds cost. 5 | In theory, theory and practice are the same. In practice, they’re not. 6 | There are two ways of constructing a software design. 7 | One way is to make it so simple that there are obviously no deficiencies. 8 | And the other way is to make it so complicated that there are no obvious deficiencies. -------------------------------------------------------------------------------- /NLP_with_SpaCy/quotesfiles.txt: -------------------------------------------------------------------------------- 1 | First, solve the problem. Then, write the code. 2 | Fix the cause, not the symptom. 3 | Simplicity is the soul of efficiency. 4 | Good design adds value faster than it adds cost. 
5 | In theory, theory and practice are the same. In practice, they’re not. 6 | There are two ways of constructing a software design. 7 | One way is to make it so simple that there are obviously no deficiencies. 8 | And the other way is to make it so complicated that there are no obvious deficiencies. -------------------------------------------------------------------------------- /NLP-with-JavaScript.md: -------------------------------------------------------------------------------- 1 | ## Natural Language Processing with JavaScript 2 | + understanding everyday language 3 | 4 | #### Common Libraries & Packages 5 | + compromise.js 6 | + natural 7 | + sentiment 8 | + franc 9 | + talisman 10 | + etc 11 | 12 | #### NLP with Compromise.js 13 | + Tokenization 14 | + Part of Speech Tagging 15 | + Word transformation 16 | + Entity Recognition 17 | + Match Finding 18 | + etc 19 | 20 | #### NLP with Sentiment.js 21 | + For Sentiment Analysis 22 | 23 | #### NLP with Franc 24 | + Language Detection 25 | 26 | 27 | 28 | ###### . 29 | + J-Secur1ty 30 | + Jesus Saves @ JCharisTech 31 | 32 | -------------------------------------------------------------------------------- /NLP with JavaScript/NLP-with-JavaScript.md: -------------------------------------------------------------------------------- 1 | ## Natural Language Processing with JavaScript 2 | + understanding everyday language 3 | 4 | #### Common Libraries & Packages 5 | + compromise.js 6 | + natural 7 | + sentiment 8 | + franc 9 | + talisman 10 | + etc 11 | 12 | #### NLP with Compromise.js 13 | + Tokenization 14 | + Part of Speech Tagging 15 | + Word transformation 16 | + Entity Recognition 17 | + Match Finding 18 | + etc 19 | 20 | #### NLP with Sentiment.js 21 | + For Sentiment Analysis 22 | 23 | #### NLP with Franc 24 | + Language Detection 25 | 26 | 27 | 28 | ###### . 29 | + J-Secur1ty 30 | + Jesus Saves @ JCharisTech 31 | 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural-Language-Process-Tutorials 2 | Natural Language Processing Tutorials(NLP) with Python,Julia and JavaScripts 3 | 4 | 5 | #### Contents 6 | + NLP with Python 7 | - Natural Language Processing with SpaCy 8 | - Natural Language Processing with TextBlob 9 | - Natural Language Processing with PolyGlot 10 | - Natural Language Processing with TextaCy 11 | 12 | + NLP with JavaScript 13 | - Natural Language Processing with Compromise.js 14 | - Natural Language Processing with Natural.js 15 | - Natural Language Processing with Sentiment.js 16 | 17 | + NLP with Julia 18 | - Natural Language Processing with TextAnalysis.jl 19 | - TextSummarization.jl 20 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | NLP with JavaScript 5 | 6 | 7 | 56 | 57 | 58 | 59 | 60 |
61 | NLP with JavaScript 62 |
63 | 64 | 65 | -------------------------------------------------------------------------------- /NLP with JavaScript/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | NLP with JavaScript 5 | 6 | 7 | 56 | 57 | 58 | 59 | 60 |
61 | NLP with JavaScript 62 |
63 | 64 | 65 | -------------------------------------------------------------------------------- /NLP_with_SpaCy/spacy_summarizer.py: -------------------------------------------------------------------------------- 1 | # NLP Pkgs 2 | import spacy 3 | nlp = spacy.load('en') 4 | # Pkgs for Normalizing Text 5 | from spacy.lang.en.stop_words import STOP_WORDS 6 | from string import punctuation 7 | # Finding the Top N Sentences 8 | from heapq import nlargest 9 | 10 | 11 | 12 | def text_summarizer(raw_docx): 13 | """ usage: text_summarizer(yourtext) """ 14 | raw_text = raw_docx 15 | docx = nlp(raw_text) 16 | stopwords = list(STOP_WORDS) 17 | # Build Word Frequency # word.text is tokenization in spacy 18 | word_frequencies = {} 19 | for word in docx: 20 | if word.text not in stopwords: 21 | if word.text not in word_frequencies.keys(): 22 | word_frequencies[word.text] = 1 23 | else: 24 | word_frequencies[word.text] += 1 25 | 26 | 27 | maximum_frequncy = max(word_frequencies.values()) 28 | 29 | for word in word_frequencies.keys(): 30 | word_frequencies[word] = (word_frequencies[word]/maximum_frequncy) 31 | # Sentence Tokens 32 | sentence_list = [ sentence for sentence in docx.sents ] 33 | 34 | #Calculate Sentence Scores 35 | sentence_scores = {} 36 | for sent in sentence_list: 37 | for word in sent: 38 | if word.text.lower() in word_frequencies.keys(): 39 | if len(sent.text.split(' ')) < 30: 40 | if sent not in sentence_scores.keys(): 41 | sentence_scores[sent] = word_frequencies[word.text.lower()] 42 | else: 43 | sentence_scores[sent] += word_frequencies[word.text.lower()] 44 | 45 | # Find N Largest and Join Sentences 46 | summarized_sentences = nlargest(7, sentence_scores, key=sentence_scores.get) 47 | final_sentences = [ w.text for w in summarized_sentences ] 48 | summary = ' '.join(final_sentences) 49 | print("Original Document\n") 50 | print(raw_docx) 51 | print("Total Length:",len(raw_docx)) 52 | print('\n\nSummarized Document\n') 53 | print(summary) 54 | print("Total Length:",len(summary)) 55 | 56 | 57 | 58 | 59 | 60 | # Jesse JCharis 61 | # Jesus Saves@JCharisTech -------------------------------------------------------------------------------- /example.txt: -------------------------------------------------------------------------------- 1 | The nativity of Jesus or birth of Jesus is described in the gospels of Luke and Matthew. The two accounts agree that Jesus was born in Bethlehem in the time of Herod the Great, that his mother Mary was married to Joseph, who was of Davidic descent and was not his biological father, and that his birth was effected by divine intervention, but the two gospels agree on little else.[1] Matthew does not mention the census, annunciation to the shepherds or presentation in the Temple, and does not give the name of the angel that appeared to Joseph to foretell the birth. In Luke there is no mention of Magi, no flight into Egypt, or Massacre of the Innocents, and the angel who announces the coming birth to Mary is named (as Gabriel).[1] 2 | 3 | The consensus of scholars is that both gospels were written about AD 75-85,[2] and while it is possible that one account might be based on the other, or that the two share common source material, the majority conclusion is that, in respect of the nativity story, the two are independent of each other.[1] 4 | 5 | In Christian theology the nativity marks the birth of Jesus in fulfillment of the divine will of God, to save the world from sin. 
The artistic depiction of the nativity has been an important subject for Christian artists since the 4th century. Since the 13th century, the nativity scene has emphasized the humility of Jesus and promoted a more tender image of him, as a major turning point from the early "Lord and Master" image, mirroring changes in the common approaches taken by Christian pastoral ministry.[3][4][5] 6 | 7 | The nativity plays a major role in the Christian liturgical year. Christian congregations of the Western tradition (including the Catholic Church, the Western Rite Orthodox, the Anglican Communion, and many Protestants) begin observing the season of Advent four Sundays before Christmas, the traditional feast-day of his birth, which falls on December 25. 8 | 9 | Christians of the Eastern Orthodox Church and Oriental Orthodox Church observe a similar season, sometimes called Advent but also called the "Nativity Fast", which begins forty days before Christmas. Some Eastern Orthodox Christians (e.g. Greeks and Syrians) celebrate Christmas on December 25. Other Orthodox (e.g. Copts, Ethiopians, Georgians, and Russians) celebrate Christmas on (the Gregorian) January 7 (Koiak 29 on coptic calendar)[6] as a result of their churches continuing to follow the Julian calendar, rather than the modern day Gregorian calendar. 10 | 11 | -------------------------------------------------------------------------------- /NLP_with_Textacy/example.txt: -------------------------------------------------------------------------------- 1 | The nativity of Jesus or birth of Jesus is described in the gospels of Luke and Matthew. The two accounts agree that Jesus was born in Bethlehem in the time of Herod the Great, that his mother Mary was married to Joseph, who was of Davidic descent and was not his biological father, and that his birth was effected by divine intervention, but the two gospels agree on little else.[1] Matthew does not mention the census, annunciation to the shepherds or presentation in the Temple, and does not give the name of the angel that appeared to Joseph to foretell the birth. In Luke there is no mention of Magi, no flight into Egypt, or Massacre of the Innocents, and the angel who announces the coming birth to Mary is named (as Gabriel).[1] 2 | 3 | The consensus of scholars is that both gospels were written about AD 75-85,[2] and while it is possible that one account might be based on the other, or that the two share common source material, the majority conclusion is that, in respect of the nativity story, the two are independent of each other.[1] 4 | 5 | In Christian theology the nativity marks the birth of Jesus in fulfillment of the divine will of God, to save the world from sin. The artistic depiction of the nativity has been an important subject for Christian artists since the 4th century. Since the 13th century, the nativity scene has emphasized the humility of Jesus and promoted a more tender image of him, as a major turning point from the early "Lord and Master" image, mirroring changes in the common approaches taken by Christian pastoral ministry.[3][4][5] 6 | 7 | The nativity plays a major role in the Christian liturgical year. Christian congregations of the Western tradition (including the Catholic Church, the Western Rite Orthodox, the Anglican Communion, and many Protestants) begin observing the season of Advent four Sundays before Christmas, the traditional feast-day of his birth, which falls on December 25. 
8 | 9 | Christians of the Eastern Orthodox Church and Oriental Orthodox Church observe a similar season, sometimes called Advent but also called the "Nativity Fast", which begins forty days before Christmas. Some Eastern Orthodox Christians (e.g. Greeks and Syrians) celebrate Christmas on December 25. Other Orthodox (e.g. Copts, Ethiopians, Georgians, and Russians) celebrate Christmas on (the Gregorian) January 7 (Koiak 29 on coptic calendar)[6] as a result of their churches continuing to follow the Julian calendar, rather than the modern day Gregorian calendar. 10 | 11 | -------------------------------------------------------------------------------- /Intent Classification With Rasa - Spacy/projects/default/model_20180602-072117/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": "en", 3 | "pipeline": [ 4 | { 5 | "model": "en", 6 | "case_sensitive": false, 7 | "name": "nlp_spacy", 8 | "class": "rasa_nlu.utils.spacy_utils.SpacyNLP" 9 | }, 10 | { 11 | "name": "tokenizer_spacy", 12 | "class": "rasa_nlu.tokenizers.spacy_tokenizer.SpacyTokenizer" 13 | }, 14 | { 15 | "name": "intent_featurizer_spacy", 16 | "class": "rasa_nlu.featurizers.spacy_featurizer.SpacyFeaturizer" 17 | }, 18 | { 19 | "name": "intent_entity_featurizer_regex", 20 | "regex_file": "regex_featurizer.json", 21 | "class": "rasa_nlu.featurizers.regex_featurizer.RegexFeaturizer" 22 | }, 23 | { 24 | "BILOU_flag": true, 25 | "features": [ 26 | [ 27 | "low", 28 | "title", 29 | "upper", 30 | "pos", 31 | "pos2" 32 | ], 33 | [ 34 | "bias", 35 | "low", 36 | "word3", 37 | "word2", 38 | "upper", 39 | "title", 40 | "digit", 41 | "pos", 42 | "pos2", 43 | "pattern" 44 | ], 45 | [ 46 | "low", 47 | "title", 48 | "upper", 49 | "pos", 50 | "pos2" 51 | ] 52 | ], 53 | "max_iterations": 50, 54 | "L1_c": 1, 55 | "L2_c": 0.001, 56 | "name": "ner_crf", 57 | "classifier_file": "crf_model.pkl", 58 | "class": "rasa_nlu.extractors.crf_entity_extractor.CRFEntityExtractor" 59 | }, 60 | { 61 | "name": "ner_synonyms", 62 | "synonyms_file": "entity_synonyms.json", 63 | "class": "rasa_nlu.extractors.entity_synonyms.EntitySynonymMapper" 64 | }, 65 | { 66 | "C": [ 67 | 1, 68 | 2, 69 | 5, 70 | 10, 71 | 20, 72 | 100 73 | ], 74 | "kernels": [ 75 | "linear" 76 | ], 77 | "max_cross_validation_folds": 5, 78 | "name": "intent_classifier_sklearn", 79 | "classifier_file": "intent_classifier_sklearn.pkl", 80 | "class": "rasa_nlu.classifiers.sklearn_intent_classifier.SklearnIntentClassifier" 81 | } 82 | ], 83 | "training_data": "training_data.json", 84 | "trained_at": "20180602-072117", 85 | "rasa_nlu_version": "0.12.3" 86 | } -------------------------------------------------------------------------------- /spacy_pipeline.svg: -------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | 9 | Doc 10 | 11 | 12 | 13 | Text 14 | 15 | 16 | 17 | nlp 18 | 19 | tokenizer 20 | 21 | tagger 22 | 23 | 24 | 25 | parser 26 | 27 | ner 28 | 29 | ... 30 | 31 | -------------------------------------------------------------------------------- /NLP_with_SpaCy/spacy_pipeline.svg: -------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | 9 | Doc 10 | 11 | 12 | 13 | Text 14 | 15 | 16 | 17 | nlp 18 | 19 | tokenizer 20 | 21 | tagger 22 | 23 | 24 | 25 | parser 26 | 27 | ner 28 | 29 | ... 
30 | 31 | -------------------------------------------------------------------------------- /example1.txt: -------------------------------------------------------------------------------- 1 | Jerusalem is a city in the Middle East, located on a plateau in the Judaean Mountains between the Mediterranean and the Dead Sea. It is one of the oldest cities in the world, and is considered holy to the three major Abrahamic religions—Judaism, Christianity, and Islam. Both Israel and the Palestinian Authority claim Jerusalem as their capital, as Israel maintains its primary governmental institutions there and the State of Palestine ultimately foresees it as its seat of power; however, neither claim is widely recognized internationally.[note 3][9] 2 | 3 | During its long history, Jerusalem has been destroyed at least twice, besieged 23 times, attacked 52 times, and captured and recaptured 44 times.[10] The part of Jerusalem called the City of David was settled in the 4th millennium BCE.[11] Jerusalem was named as "Urusalim" on ancient Egyptian tablets, probably meaning "City of Shalem" after a Canaanite deity, during the Canaanite period (14th century BCE). During the Israelite period, significant construction activity in Jerusalem began in the 9th century BCE (Iron Age II), and in the 8th century the city developed into the religious and administrative center of the Kingdom of Judah.[12] In 1538, the city walls were rebuilt for a last time around Jerusalem under Suleiman the Magnificent. Today those walls define the Old City, which has been traditionally divided into four quarters—known since the early 19th century as the Armenian, Christian, Jewish, and Muslim Quarters.[13] The Old City became a World Heritage Site in 1981, and is on the List of World Heritage in Danger.[14] Since 1860 Jerusalem has grown far beyond the Old City's boundaries. In 2015, Jerusalem had a population of some 850,000 residents, comprising approximately 200,000 secular Jewish Israelis, 350,000 Haredi Jews and 300,000 Palestinians.[15][note 4] In 2011, the population numbered 801,000, of which Jews comprised 497,000 (62%), Muslims 281,000 (35%), Christians 14,000 (around 2%) and 9,000 (1%) were not classified by religion.[17] 4 | 5 | According to the Bible, King David conquered the city from the Jebusites and established it as the capital of the united kingdom of Israel, and his son, King Solomon, commissioned the building of the First Temple.[note 5] These foundational events, straddling the dawn of the 1st millennium BCE, assumed central symbolic importance for the Jewish people.[19][20] The sobriquet of holy city (עיר הקודש, transliterated ‘ir haqodesh) was probably attached to Jerusalem in post-exilic times.[21][22][23] The holiness of Jerusalem in Christianity, conserved in the Septuagint[24] which Christians adopted as their own authority,[25] was reinforced by the New Testament account of Jesus's crucifixion there. In Sunni Islam, Jerusalem is the third-holiest city, after Mecca and Medina.[26][27] In Islamic tradition, in 610 CE it became the first qibla, the focal point for Muslim prayer (salat),[28] and Muhammad made his Night Journey there ten years later, ascending to heaven where he speaks to God, according to the Quran.[29][30] As a result, despite having an area of only 0.9 square kilometres (0.35 sq mi),[31] the Old City is home to many sites of seminal religious importance, among them the Temple Mount with its Western Wall, Dome of the Rock and al-Aqsa Mosque, and the Church of the Holy Sepulchre. 
Outside the Old City stands the Garden Tomb. -------------------------------------------------------------------------------- /NLP_with_Textacy/example1.txt: -------------------------------------------------------------------------------- 1 | Jerusalem is a city in the Middle East, located on a plateau in the Judaean Mountains between the Mediterranean and the Dead Sea. It is one of the oldest cities in the world, and is considered holy to the three major Abrahamic religions—Judaism, Christianity, and Islam. Both Israel and the Palestinian Authority claim Jerusalem as their capital, as Israel maintains its primary governmental institutions there and the State of Palestine ultimately foresees it as its seat of power; however, neither claim is widely recognized internationally.[note 3][9] 2 | 3 | During its long history, Jerusalem has been destroyed at least twice, besieged 23 times, attacked 52 times, and captured and recaptured 44 times.[10] The part of Jerusalem called the City of David was settled in the 4th millennium BCE.[11] Jerusalem was named as "Urusalim" on ancient Egyptian tablets, probably meaning "City of Shalem" after a Canaanite deity, during the Canaanite period (14th century BCE). During the Israelite period, significant construction activity in Jerusalem began in the 9th century BCE (Iron Age II), and in the 8th century the city developed into the religious and administrative center of the Kingdom of Judah.[12] In 1538, the city walls were rebuilt for a last time around Jerusalem under Suleiman the Magnificent. Today those walls define the Old City, which has been traditionally divided into four quarters—known since the early 19th century as the Armenian, Christian, Jewish, and Muslim Quarters.[13] The Old City became a World Heritage Site in 1981, and is on the List of World Heritage in Danger.[14] Since 1860 Jerusalem has grown far beyond the Old City's boundaries. In 2015, Jerusalem had a population of some 850,000 residents, comprising approximately 200,000 secular Jewish Israelis, 350,000 Haredi Jews and 300,000 Palestinians.[15][note 4] In 2011, the population numbered 801,000, of which Jews comprised 497,000 (62%), Muslims 281,000 (35%), Christians 14,000 (around 2%) and 9,000 (1%) were not classified by religion.[17] 4 | 5 | According to the Bible, King David conquered the city from the Jebusites and established it as the capital of the united kingdom of Israel, and his son, King Solomon, commissioned the building of the First Temple.[note 5] These foundational events, straddling the dawn of the 1st millennium BCE, assumed central symbolic importance for the Jewish people.[19][20] The sobriquet of holy city (עיר הקודש, transliterated ‘ir haqodesh) was probably attached to Jerusalem in post-exilic times.[21][22][23] The holiness of Jerusalem in Christianity, conserved in the Septuagint[24] which Christians adopted as their own authority,[25] was reinforced by the New Testament account of Jesus's crucifixion there. 
In Sunni Islam, Jerusalem is the third-holiest city, after Mecca and Medina.[26][27] In Islamic tradition, in 610 CE it became the first qibla, the focal point for Muslim prayer (salat),[28] and Muhammad made his Night Journey there ten years later, ascending to heaven where he speaks to God, according to the Quran.[29][30] As a result, despite having an area of only 0.9 square kilometres (0.35 sq mi),[31] the Old City is home to many sites of seminal religious importance, among them the Temple Mount with its Western Wall, Dome of the Rock and al-Aqsa Mosque, and the Church of the Holy Sepulchre. Outside the Old City stands the Garden Tomb. -------------------------------------------------------------------------------- /NLP_with_SpaCy/How to detect languages with SpaCy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Natural Language Processing with SpaCy\n", 8 | "## How to detect languages with SpaCy\n", 9 | "![](SpaCy_logo.png)\n", 10 | "\n", 11 | "+ Pkgs(2)\n", 12 | "- pip install spacy_cld\n", 13 | "- pip install spacy-langdetect" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import spacy" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Load our languages\n", 32 | "from spacy_cld import LanguageDetector" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "nlp = spacy.load('en')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "language_detector = LanguageDetector()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "nlp.add_pipe(language_detector)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 6, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "docx = nlp(\"He works in London\")" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 7, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "['en']" 80 | ] 81 | }, 82 | "execution_count": 7, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "docx._.languages" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 8, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "0.95" 100 | ] 101 | }, 102 | "execution_count": 8, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "docx._.language_scores['en']" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 9, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "docx2 = nlp(\"он работает в москве\")" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 10, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "['ru']" 129 | ] 130 | }, 131 | "execution_count": 10, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "docx2._.languages" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 11, 143 | "metadata": {}, 
144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "0.97" 149 | ] 150 | }, 151 | "execution_count": 11, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "docx2._.language_scores['ru']" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "### Using Spacy-langdetect" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "from spacy_langdetect import LanguageDetector\n", 174 | "nlp = spacy.load(\"en\")\n", 175 | "nlp.add_pipe(LanguageDetector(), name=\"language_detector\", last=True)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "doc = nlp(\"He works in London\")" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "print(doc._.language)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 12, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "#### Thanks\n", 210 | "### J-Secur1ty\n", 211 | "### Jesus Saves @JCharisTech" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.6.5" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 2 243 | } 244 | -------------------------------------------------------------------------------- /NLP with SpaCy- Adding Extensions Attributes in SpaCy(How to use sentiment analysis in SpaCy).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### NLP With SpaCy -Extending SpaCy\n", 8 | "+ Doc Document \n", 9 | "+ Tokens\n", 10 | "+ Span\n", 11 | "\n", 12 | "#### Usefulness\n", 13 | "+ Allows you to add extra functionality to SpaCy\n", 14 | " + eg sentiment analysis\n", 15 | "+ extend the API to become more accessible \n", 16 | "\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Doc\n", 24 | "![](spacy_pipeline.svg)\n", 25 | "\n", 26 | "#### Creating an Extension\n", 27 | "+ set_extension\n", 28 | "+ 3 Types \n", 29 | " - Attribute Extension\n", 30 | " - Property Extension (getter,setter)\n", 31 | " - Method Extension(method)\n", 32 | " \n", 33 | "#### Calling the extension\n", 34 | " ._\n", 35 | " (Doc._.myextension)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 1, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import spacy" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "nlp1 = spacy.load('en')" 54 | ] 55 | }, 56 | { 57 | 
"cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "[('tagger', ),\n", 65 | " ('parser', ),\n", 66 | " ('ner', )]" 67 | ] 68 | }, 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "nlp1.pipeline" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "## Adding Extension\n", 85 | "from spacy.tokens import Doc\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "pip install nltk\n", 95 | "nltk.download()\n", 96 | "pip install twython" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "from nltk.sentiment.vader import SentimentIntensityAnalyzer" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 6, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "sent_analyzer = SentimentIntensityAnalyzer()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 7, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "def sentiment_scores(docx):\n", 124 | " return sent_analyzer.polarity_scores(docx.text)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 8, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "Doc.set_extension(\"sentimenter\",getter=sentiment_scores)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 9, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "nlp = spacy.load('en')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 10, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "ex1 = nlp(\"This movie was very nice\")" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 11, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "{'neg': 0.0, 'neu': 0.564, 'pos': 0.436, 'compound': 0.4754}\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "# Calling\n", 169 | "print(ex1._.sentimenter)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 12, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "ex2 = nlp(\"I don't like onions\")" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 13, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "{'neg': 0.513, 'neu': 0.487, 'pos': 0.0, 'compound': -0.2755}\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "print(ex2._.sentimenter)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "# By Jesse JCharis\n", 205 | "# J-Secur1ty\n", 206 | "# Jesus Saves @JCharisTech\n" 207 | ] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | 
"version": "3.6.3" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 2 231 | } 232 | -------------------------------------------------------------------------------- /NLP_with_SpaCy/NLP with SpaCy- Adding Extensions Attributes in SpaCy(How to use sentiment analysis in SpaCy).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### NLP With SpaCy -Extending SpaCy\n", 8 | "+ Doc Document \n", 9 | "+ Tokens\n", 10 | "+ Span\n", 11 | "\n", 12 | "#### Usefulness\n", 13 | "+ Allows you to add extra functionality to SpaCy\n", 14 | " + eg sentiment analysis\n", 15 | "+ extend the API to become more accessible \n", 16 | "\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Doc\n", 24 | "![](spacy_pipeline.svg)\n", 25 | "\n", 26 | "#### Creating an Extension\n", 27 | "+ set_extension\n", 28 | "+ 3 Types \n", 29 | " - Attribute Extension\n", 30 | " - Property Extension (getter,setter)\n", 31 | " - Method Extension(method)\n", 32 | " \n", 33 | "#### Calling the extension\n", 34 | " ._\n", 35 | " (Doc._.myextension)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 1, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import spacy" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "nlp1 = spacy.load('en')" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "[('tagger', ),\n", 65 | " ('parser', ),\n", 66 | " ('ner', )]" 67 | ] 68 | }, 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "nlp1.pipeline" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "## Adding Extension\n", 85 | "from spacy.tokens import Doc\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "pip install nltk\n", 95 | "nltk.download()\n", 96 | "pip install twython" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "from nltk.sentiment.vader import SentimentIntensityAnalyzer" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 6, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "sent_analyzer = SentimentIntensityAnalyzer()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 7, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "def sentiment_scores(docx):\n", 124 | " return sent_analyzer.polarity_scores(docx.text)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 8, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "Doc.set_extension(\"sentimenter\",getter=sentiment_scores)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 9, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "nlp = spacy.load('en')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 10, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "ex1 = nlp(\"This movie was very nice\")" 152 | ] 153 | }, 154 | 
{ 155 | "cell_type": "code", 156 | "execution_count": 11, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "{'neg': 0.0, 'neu': 0.564, 'pos': 0.436, 'compound': 0.4754}\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "# Calling\n", 169 | "print(ex1._.sentimenter)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 12, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "ex2 = nlp(\"I don't like onions\")" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 13, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "{'neg': 0.513, 'neu': 0.487, 'pos': 0.0, 'compound': -0.2755}\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "print(ex2._.sentimenter)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "# By Jesse JCharis\n", 205 | "# J-Secur1ty\n", 206 | "# Jesus Saves @JCharisTech\n" 207 | ] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.6.3" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 2 231 | } 232 | -------------------------------------------------------------------------------- /Intent Classification With Rasa - Spacy/projects/default/model_20180602-072117/training_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "rasa_nlu_data": { 3 | "common_examples": [ 4 | { 5 | "intent": "greet", 6 | "text": "hey" 7 | }, 8 | { 9 | "intent": "greet", 10 | "text": "howdy" 11 | }, 12 | { 13 | "intent": "greet", 14 | "text": "hey there" 15 | }, 16 | { 17 | "intent": "greet", 18 | "text": "hello" 19 | }, 20 | { 21 | "intent": "greet", 22 | "text": "hi" 23 | }, 24 | { 25 | "intent": "greet", 26 | "text": "good morning" 27 | }, 28 | { 29 | "intent": "greet", 30 | "text": "good evening" 31 | }, 32 | { 33 | "intent": "greet", 34 | "text": "dear sir" 35 | }, 36 | { 37 | "intent": "affirm", 38 | "text": "yes" 39 | }, 40 | { 41 | "intent": "affirm", 42 | "text": "yep" 43 | }, 44 | { 45 | "intent": "affirm", 46 | "text": "yeah" 47 | }, 48 | { 49 | "intent": "affirm", 50 | "text": "indeed" 51 | }, 52 | { 53 | "intent": "affirm", 54 | "text": "that's right" 55 | }, 56 | { 57 | "intent": "affirm", 58 | "text": "ok" 59 | }, 60 | { 61 | "intent": "affirm", 62 | "text": "great" 63 | }, 64 | { 65 | "intent": "affirm", 66 | "text": "right, thank you" 67 | }, 68 | { 69 | "intent": "affirm", 70 | "text": "correct" 71 | }, 72 | { 73 | "intent": "affirm", 74 | "text": "great choice" 75 | }, 76 | { 77 | "intent": "affirm", 78 | "text": "sounds really good" 79 | }, 80 | { 81 | "intent": "restaurant_search", 82 | "text": "i'm looking for a place to eat" 83 | }, 84 | { 85 | "intent": "restaurant_search", 86 | "text": "I want to grab lunch" 87 | }, 88 | { 89 | "intent": "restaurant_search", 90 | "text": "I am searching for a dinner spot" 91 | }, 92 | { 93 | "intent": "restaurant_search", 94 | "entities": [ 95 | { 96 | "start": 31, 97 | "end": 36, 98 | 
"value": "north", 99 | "entity": "location" 100 | } 101 | ], 102 | "text": "i'm looking for a place in the north of town" 103 | }, 104 | { 105 | "intent": "restaurant_search", 106 | "entities": [ 107 | { 108 | "start": 8, 109 | "end": 15, 110 | "value": "chinese", 111 | "entity": "cuisine" 112 | } 113 | ], 114 | "text": "show me chinese restaurants" 115 | }, 116 | { 117 | "intent": "restaurant_search", 118 | "entities": [ 119 | { 120 | "start": 8, 121 | "end": 14, 122 | "value": "chinese", 123 | "entity": "cuisine" 124 | }, 125 | { 126 | "start": 34, 127 | "end": 39, 128 | "value": "north", 129 | "entity": "location" 130 | } 131 | ], 132 | "text": "show me chines restaurants in the north" 133 | }, 134 | { 135 | "intent": "restaurant_search", 136 | "entities": [ 137 | { 138 | "start": 31, 139 | "end": 37, 140 | "value": "centre", 141 | "entity": "location" 142 | }, 143 | { 144 | "start": 10, 145 | "end": 17, 146 | "value": "mexican", 147 | "entity": "cuisine" 148 | } 149 | ], 150 | "text": "show me a mexican place in the centre" 151 | }, 152 | { 153 | "intent": "restaurant_search", 154 | "entities": [ 155 | { 156 | "start": 20, 157 | "end": 26, 158 | "value": "indian", 159 | "entity": "cuisine" 160 | } 161 | ], 162 | "text": "i am looking for an indian spot called olaolaolaolaolaola" 163 | }, 164 | { 165 | "intent": "restaurant_search", 166 | "text": "search for restaurants" 167 | }, 168 | { 169 | "intent": "restaurant_search", 170 | "entities": [ 171 | { 172 | "start": 16, 173 | "end": 20, 174 | "value": "west", 175 | "entity": "location" 176 | } 177 | ], 178 | "text": "anywhere in the west" 179 | }, 180 | { 181 | "intent": "restaurant_search", 182 | "entities": [ 183 | { 184 | "start": 14, 185 | "end": 19, 186 | "value": "18328", 187 | "entity": "location" 188 | } 189 | ], 190 | "text": "anywhere near 18328" 191 | }, 192 | { 193 | "intent": "restaurant_search", 194 | "entities": [ 195 | { 196 | "start": 17, 197 | "end": 29, 198 | "value": "asian fusion", 199 | "entity": "cuisine" 200 | } 201 | ], 202 | "text": "I am looking for asian fusion food" 203 | }, 204 | { 205 | "intent": "restaurant_search", 206 | "entities": [ 207 | { 208 | "start": 29, 209 | "end": 34, 210 | "value": "29432", 211 | "entity": "location" 212 | } 213 | ], 214 | "text": "I am looking a restaurant in 29432" 215 | }, 216 | { 217 | "intent": "restaurant_search", 218 | "entities": [ 219 | { 220 | "start": 17, 221 | "end": 38, 222 | "value": "mexican indian fusion", 223 | "entity": "cuisine" 224 | } 225 | ], 226 | "text": "I am looking for mexican indian fusion" 227 | }, 228 | { 229 | "intent": "restaurant_search", 230 | "entities": [ 231 | { 232 | "start": 0, 233 | "end": 7, 234 | "value": "central", 235 | "entity": "location" 236 | }, 237 | { 238 | "start": 8, 239 | "end": 14, 240 | "value": "indian", 241 | "entity": "cuisine" 242 | } 243 | ], 244 | "text": "central indian restaurant" 245 | }, 246 | { 247 | "intent": "goodbye", 248 | "text": "bye" 249 | }, 250 | { 251 | "intent": "goodbye", 252 | "text": "goodbye" 253 | }, 254 | { 255 | "intent": "goodbye", 256 | "text": "good bye" 257 | }, 258 | { 259 | "intent": "goodbye", 260 | "text": "stop" 261 | }, 262 | { 263 | "intent": "goodbye", 264 | "text": "end" 265 | }, 266 | { 267 | "intent": "goodbye", 268 | "text": "farewell" 269 | }, 270 | { 271 | "intent": "goodbye", 272 | "text": "Bye bye" 273 | }, 274 | { 275 | "intent": "goodbye", 276 | "text": "have a good one" 277 | } 278 | ], 279 | "regex_features": [ 280 | { 281 | "name": "greet", 282 | "pattern": "hey[^\\s]*" 
283 | }, 284 | { 285 | "name": "zipcode", 286 | "pattern": "[0-9]{5}" 287 | } 288 | ], 289 | "entity_synonyms": [ 290 | { 291 | "value": "chinese", 292 | "synonyms": [ 293 | "Chinese", 294 | "Chines", 295 | "chines" 296 | ] 297 | }, 298 | { 299 | "value": "vegetarian", 300 | "synonyms": [ 301 | "veggie", 302 | "vegg" 303 | ] 304 | } 305 | ] 306 | } 307 | } -------------------------------------------------------------------------------- /Intent Classification With Rasa - Spacy/demo-rasa.json: -------------------------------------------------------------------------------- 1 | { 2 | "rasa_nlu_data": { 3 | "regex_features": [ 4 | { 5 | "name": "zipcode", 6 | "pattern": "[0-9]{5}" 7 | }, 8 | { 9 | "name": "greet", 10 | "pattern": "hey[^\\s]*" 11 | } 12 | ], 13 | "entity_synonyms": [ 14 | { 15 | "value": "chinese", 16 | "synonyms": ["Chinese", "Chines", "chines"] 17 | }, 18 | { 19 | "value": "vegetarian", 20 | "synonyms": ["veggie", "vegg"] 21 | } 22 | ], 23 | "common_examples": [ 24 | { 25 | "text": "hey", 26 | "intent": "greet", 27 | "entities": [] 28 | }, 29 | { 30 | "text": "howdy", 31 | "intent": "greet", 32 | "entities": [] 33 | }, 34 | { 35 | "text": "hey there", 36 | "intent": "greet", 37 | "entities": [] 38 | }, 39 | { 40 | "text": "hello", 41 | "intent": "greet", 42 | "entities": [] 43 | }, 44 | { 45 | "text": "hi", 46 | "intent": "greet", 47 | "entities": [] 48 | }, 49 | { 50 | "text": "good morning", 51 | "intent": "greet", 52 | "entities": [] 53 | }, 54 | { 55 | "text": "good evening", 56 | "intent": "greet", 57 | "entities": [] 58 | }, 59 | { 60 | "text": "dear sir", 61 | "intent": "greet", 62 | "entities": [] 63 | }, 64 | { 65 | "text": "yes", 66 | "intent": "affirm", 67 | "entities": [] 68 | }, 69 | { 70 | "text": "yep", 71 | "intent": "affirm", 72 | "entities": [] 73 | }, 74 | { 75 | "text": "yeah", 76 | "intent": "affirm", 77 | "entities": [] 78 | }, 79 | { 80 | "text": "indeed", 81 | "intent": "affirm", 82 | "entities": [] 83 | }, 84 | { 85 | "text": "that's right", 86 | "intent": "affirm", 87 | "entities": [] 88 | }, 89 | { 90 | "text": "ok", 91 | "intent": "affirm", 92 | "entities": [] 93 | }, 94 | { 95 | "text": "great", 96 | "intent": "affirm", 97 | "entities": [] 98 | }, 99 | { 100 | "text": "right, thank you", 101 | "intent": "affirm", 102 | "entities": [] 103 | }, 104 | { 105 | "text": "correct", 106 | "intent": "affirm", 107 | "entities": [] 108 | }, 109 | { 110 | "text": "great choice", 111 | "intent": "affirm", 112 | "entities": [] 113 | }, 114 | { 115 | "text": "sounds really good", 116 | "intent": "affirm", 117 | "entities": [] 118 | }, 119 | { 120 | "text": "i'm looking for a place to eat", 121 | "intent": "restaurant_search", 122 | "entities": [] 123 | }, 124 | { 125 | "text": "I want to grab lunch", 126 | "intent": "restaurant_search", 127 | "entities": [] 128 | }, 129 | { 130 | "text": "I am searching for a dinner spot", 131 | "intent": "restaurant_search", 132 | "entities": [] 133 | }, 134 | { 135 | "text": "i'm looking for a place in the north of town", 136 | "intent": "restaurant_search", 137 | "entities": [ 138 | { 139 | "start": 31, 140 | "end": 36, 141 | "value": "north", 142 | "entity": "location" 143 | } 144 | ] 145 | }, 146 | { 147 | "text": "show me chinese restaurants", 148 | "intent": "restaurant_search", 149 | "entities": [ 150 | { 151 | "start": 8, 152 | "end": 15, 153 | "value": "chinese", 154 | "entity": "cuisine" 155 | } 156 | ] 157 | }, 158 | { 159 | "text": "show me chines restaurants in the north", 160 | "intent": "restaurant_search", 161 | 
"entities": [ 162 | { 163 | "start": 8, 164 | "end": 14, 165 | "value": "chinese", 166 | "entity": "cuisine" 167 | }, 168 | { 169 | "start": 34, 170 | "end": 39, 171 | "value": "north", 172 | "entity": "location" 173 | } 174 | ] 175 | }, 176 | { 177 | "text": "show me a mexican place in the centre", 178 | "intent": "restaurant_search", 179 | "entities": [ 180 | { 181 | "start": 31, 182 | "end": 37, 183 | "value": "centre", 184 | "entity": "location" 185 | }, 186 | { 187 | "start": 10, 188 | "end": 17, 189 | "value": "mexican", 190 | "entity": "cuisine" 191 | } 192 | ] 193 | }, 194 | { 195 | "text": "i am looking for an indian spot called olaolaolaolaolaola", 196 | "intent": "restaurant_search", 197 | "entities": [ 198 | { 199 | "start": 20, 200 | "end": 26, 201 | "value": "indian", 202 | "entity": "cuisine" 203 | } 204 | ] 205 | }, { 206 | "text": "search for restaurants", 207 | "intent": "restaurant_search", 208 | "entities": [] 209 | }, 210 | { 211 | "text": "anywhere in the west", 212 | "intent": "restaurant_search", 213 | "entities": [ 214 | { 215 | "start": 16, 216 | "end": 20, 217 | "value": "west", 218 | "entity": "location" 219 | } 220 | ] 221 | }, 222 | { 223 | "text": "anywhere near 18328", 224 | "intent": "restaurant_search", 225 | "entities": [ 226 | { 227 | "start": 14, 228 | "end": 19, 229 | "value": "18328", 230 | "entity": "location" 231 | } 232 | ] 233 | }, 234 | { 235 | "text": "I am looking for asian fusion food", 236 | "intent": "restaurant_search", 237 | "entities": [ 238 | { 239 | "start": 17, 240 | "end": 29, 241 | "value": "asian fusion", 242 | "entity": "cuisine" 243 | } 244 | ] 245 | }, 246 | { 247 | "text": "I am looking a restaurant in 29432", 248 | "intent": "restaurant_search", 249 | "entities": [ 250 | { 251 | "start": 29, 252 | "end": 34, 253 | "value": "29432", 254 | "entity": "location" 255 | } 256 | ] 257 | }, 258 | { 259 | "text": "I am looking for mexican indian fusion", 260 | "intent": "restaurant_search", 261 | "entities": [ 262 | { 263 | "start": 17, 264 | "end": 38, 265 | "value": "mexican indian fusion", 266 | "entity": "cuisine" 267 | } 268 | ] 269 | }, 270 | { 271 | "text": "central indian restaurant", 272 | "intent": "restaurant_search", 273 | "entities": [ 274 | { 275 | "start": 0, 276 | "end": 7, 277 | "value": "central", 278 | "entity": "location" 279 | }, 280 | { 281 | "start": 8, 282 | "end": 14, 283 | "value": "indian", 284 | "entity": "cuisine" 285 | } 286 | ] 287 | }, 288 | { 289 | "text": "bye", 290 | "intent": "goodbye", 291 | "entities": [] 292 | }, 293 | { 294 | "text": "goodbye", 295 | "intent": "goodbye", 296 | "entities": [] 297 | }, 298 | { 299 | "text": "good bye", 300 | "intent": "goodbye", 301 | "entities": [] 302 | }, 303 | { 304 | "text": "stop", 305 | "intent": "goodbye", 306 | "entities": [] 307 | }, 308 | { 309 | "text": "end", 310 | "intent": "goodbye", 311 | "entities": [] 312 | }, 313 | { 314 | "text": "farewell", 315 | "intent": "goodbye", 316 | "entities": [] 317 | }, 318 | { 319 | "text": "Bye bye", 320 | "intent": "goodbye", 321 | "entities": [] 322 | }, 323 | { 324 | "text": "have a good one", 325 | "intent": "goodbye", 326 | "entities": [] 327 | } 328 | ] 329 | } 330 | } 331 | -------------------------------------------------------------------------------- /Intent Classification With Rasa - Spacy/rasa_dataset.json: -------------------------------------------------------------------------------- 1 | { 2 | "rasa_nlu_data": { 3 | "regex_features": [ 4 | { 5 | "name": "zipcode", 6 | "pattern": "[0-9]{5}" 7 | }, 8 | { 
9 | "name": "greet", 10 | "pattern": "hey[^\\s]*" 11 | } 12 | ], 13 | "entity_synonyms": [ 14 | { 15 | "value": "chinese", 16 | "synonyms": ["Chinese", "Chines", "chines"] 17 | }, 18 | { 19 | "value": "vegetarian", 20 | "synonyms": ["veggie", "vegg"] 21 | } 22 | ], 23 | "common_examples": [ 24 | { 25 | "text": "hey", 26 | "intent": "greet", 27 | "entities": [] 28 | }, 29 | { 30 | "text": "howdy", 31 | "intent": "greet", 32 | "entities": [] 33 | }, 34 | { 35 | "text": "hey there", 36 | "intent": "greet", 37 | "entities": [] 38 | }, 39 | { 40 | "text": "hello", 41 | "intent": "greet", 42 | "entities": [] 43 | }, 44 | { 45 | "text": "hi", 46 | "intent": "greet", 47 | "entities": [] 48 | }, 49 | { 50 | "text": "good morning", 51 | "intent": "greet", 52 | "entities": [] 53 | }, 54 | { 55 | "text": "good evening", 56 | "intent": "greet", 57 | "entities": [] 58 | }, 59 | { 60 | "text": "dear sir", 61 | "intent": "greet", 62 | "entities": [] 63 | }, 64 | { 65 | "text": "yes", 66 | "intent": "affirm", 67 | "entities": [] 68 | }, 69 | { 70 | "text": "yep", 71 | "intent": "affirm", 72 | "entities": [] 73 | }, 74 | { 75 | "text": "yeah", 76 | "intent": "affirm", 77 | "entities": [] 78 | }, 79 | { 80 | "text": "indeed", 81 | "intent": "affirm", 82 | "entities": [] 83 | }, 84 | { 85 | "text": "that's right", 86 | "intent": "affirm", 87 | "entities": [] 88 | }, 89 | { 90 | "text": "ok", 91 | "intent": "affirm", 92 | "entities": [] 93 | }, 94 | { 95 | "text": "great", 96 | "intent": "affirm", 97 | "entities": [] 98 | }, 99 | { 100 | "text": "right, thank you", 101 | "intent": "affirm", 102 | "entities": [] 103 | }, 104 | { 105 | "text": "correct", 106 | "intent": "affirm", 107 | "entities": [] 108 | }, 109 | { 110 | "text": "great choice", 111 | "intent": "affirm", 112 | "entities": [] 113 | }, 114 | { 115 | "text": "sounds really good", 116 | "intent": "affirm", 117 | "entities": [] 118 | }, 119 | { 120 | "text": "i'm looking for a place to eat", 121 | "intent": "restaurant_search", 122 | "entities": [] 123 | }, 124 | { 125 | "text": "I want to grab lunch", 126 | "intent": "restaurant_search", 127 | "entities": [] 128 | }, 129 | { 130 | "text": "I am searching for a dinner spot", 131 | "intent": "restaurant_search", 132 | "entities": [] 133 | }, 134 | { 135 | "text": "i'm looking for a place in the north of town", 136 | "intent": "restaurant_search", 137 | "entities": [ 138 | { 139 | "start": 31, 140 | "end": 36, 141 | "value": "north", 142 | "entity": "location" 143 | } 144 | ] 145 | }, 146 | { 147 | "text": "show me chinese restaurants", 148 | "intent": "restaurant_search", 149 | "entities": [ 150 | { 151 | "start": 8, 152 | "end": 15, 153 | "value": "chinese", 154 | "entity": "cuisine" 155 | } 156 | ] 157 | }, 158 | { 159 | "text": "show me chines restaurants in the north", 160 | "intent": "restaurant_search", 161 | "entities": [ 162 | { 163 | "start": 8, 164 | "end": 14, 165 | "value": "chinese", 166 | "entity": "cuisine" 167 | }, 168 | { 169 | "start": 34, 170 | "end": 39, 171 | "value": "north", 172 | "entity": "location" 173 | } 174 | ] 175 | }, 176 | { 177 | "text": "show me a mexican place in the centre", 178 | "intent": "restaurant_search", 179 | "entities": [ 180 | { 181 | "start": 31, 182 | "end": 37, 183 | "value": "centre", 184 | "entity": "location" 185 | }, 186 | { 187 | "start": 10, 188 | "end": 17, 189 | "value": "mexican", 190 | "entity": "cuisine" 191 | } 192 | ] 193 | }, 194 | { 195 | "text": "i am looking for an indian spot called olaolaolaolaolaola", 196 | "intent": 
"restaurant_search", 197 | "entities": [ 198 | { 199 | "start": 20, 200 | "end": 26, 201 | "value": "indian", 202 | "entity": "cuisine" 203 | } 204 | ] 205 | }, { 206 | "text": "search for restaurants", 207 | "intent": "restaurant_search", 208 | "entities": [] 209 | }, 210 | { 211 | "text": "anywhere in the west", 212 | "intent": "restaurant_search", 213 | "entities": [ 214 | { 215 | "start": 16, 216 | "end": 20, 217 | "value": "west", 218 | "entity": "location" 219 | } 220 | ] 221 | }, 222 | { 223 | "text": "anywhere near 18328", 224 | "intent": "restaurant_search", 225 | "entities": [ 226 | { 227 | "start": 14, 228 | "end": 19, 229 | "value": "18328", 230 | "entity": "location" 231 | } 232 | ] 233 | }, 234 | { 235 | "text": "I am looking for asian fusion food", 236 | "intent": "restaurant_search", 237 | "entities": [ 238 | { 239 | "start": 17, 240 | "end": 29, 241 | "value": "asian fusion", 242 | "entity": "cuisine" 243 | } 244 | ] 245 | }, 246 | { 247 | "text": "I am looking a restaurant in 29432", 248 | "intent": "restaurant_search", 249 | "entities": [ 250 | { 251 | "start": 29, 252 | "end": 34, 253 | "value": "29432", 254 | "entity": "location" 255 | } 256 | ] 257 | }, 258 | { 259 | "text": "I am looking for mexican indian fusion", 260 | "intent": "restaurant_search", 261 | "entities": [ 262 | { 263 | "start": 17, 264 | "end": 38, 265 | "value": "mexican indian fusion", 266 | "entity": "cuisine" 267 | } 268 | ] 269 | }, 270 | { 271 | "text": "central indian restaurant", 272 | "intent": "restaurant_search", 273 | "entities": [ 274 | { 275 | "start": 0, 276 | "end": 7, 277 | "value": "central", 278 | "entity": "location" 279 | }, 280 | { 281 | "start": 8, 282 | "end": 14, 283 | "value": "indian", 284 | "entity": "cuisine" 285 | } 286 | ] 287 | }, 288 | { 289 | "text": "bye", 290 | "intent": "goodbye", 291 | "entities": [] 292 | }, 293 | { 294 | "text": "goodbye", 295 | "intent": "goodbye", 296 | "entities": [] 297 | }, 298 | { 299 | "text": "good bye", 300 | "intent": "goodbye", 301 | "entities": [] 302 | }, 303 | { 304 | "text": "stop", 305 | "intent": "goodbye", 306 | "entities": [] 307 | }, 308 | { 309 | "text": "end", 310 | "intent": "goodbye", 311 | "entities": [] 312 | }, 313 | { 314 | "text": "farewell", 315 | "intent": "goodbye", 316 | "entities": [] 317 | }, 318 | { 319 | "text": "Bye bye", 320 | "intent": "goodbye", 321 | "entities": [] 322 | }, 323 | { 324 | "text": "have a good one", 325 | "intent": "goodbye", 326 | "entities": [] 327 | } 328 | ] 329 | } 330 | } 331 | -------------------------------------------------------------------------------- /Text Summarization with Sumy Python .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Text Summarization Using Sumy & Python\n", 8 | "##### Installation\n", 9 | "+ pip install sumy\n", 10 | "\n", 11 | "+ offers several algorithms and methods for summarization\n", 12 | " - Luhn – heurestic method\n", 13 | " - Latent Semantic Analysis\n", 14 | " - Edmundson heurestic method with previous statistic research\n", 15 | "\n", 16 | " - LexRank – Unsupervised approach inspired by algorithms PageRank and HITS\n", 17 | " - TextRank\n", 18 | " - SumBasic – Method that is often used as a baseline in the literature\n", 19 | " - KL-Sum – Method that greedily adds sentences to a summary so long as it decreases the KL Divergence. 
\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# Load Packages\n", 29 | "import sumy" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 4, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "from sumy.parsers.plaintext import PlaintextParser\n", 39 | "from sumy.nlp.tokenizers import Tokenizer\n", 40 | "from sumy.summarizers.lex_rank import LexRankSummarizer " 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "document1 =\"\"\"Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance on a specific task. Machine learning algorithms build a mathematical model of sample data, known as \"training data\", in order to make predictions or decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in the applications of email filtering, detection of network intruders, and computer vision, where it is infeasible to develop an algorithm of specific instructions for performing the task. Machine learning is closely related to computational statistics, which focuses on making predictions using computers. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a field of study within machine learning, and focuses on exploratory data analysis through unsupervised learning.In its application across business problems, machine learning is also referred to as predictive analytics.\"\"\"" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 8, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# For Strings\n", 59 | "parser = PlaintextParser.from_string(document1,Tokenizer(\"english\"))" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# For Files\n", 69 | "parser = PlaintextParser.from_file(file, Tokenizer(\"english\"))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Using LexRank\n", 77 | "+ unsupervised approach to text summarization based on graph-based centrality scoring of sentences. \n", 78 | "+ The main idea is that sentences \"recommend\" other similar sentences to the reader. 
Thus, if one sentence is very similar to many others, it will likely be a sentence of great importance\n", 79 | "+ Standalone pkg pip install lexrank" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 9, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "# Using LexRank\n", 89 | "summarizer = LexRankSummarizer()\n", 90 | "#Summarize the document with 2 sentences\n", 91 | "summary = summarizer(parser.document, 2) \n" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 10, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance on a specific task.\n", 104 | "Machine learning algorithms build a mathematical model of sample data, known as \"training data\", in order to make predictions or decisions without being explicitly programmed to perform the task.\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "for sentence in summary:\n", 110 | " print(sentence)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "#### Using Lung\n", 118 | "+ Based on frequency of most important words" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 11, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "from sumy.summarizers.luhn import LuhnSummarizer" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 12, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "summarizer_luhn = LuhnSummarizer()\n", 137 | "summary_1 =summarizer_luhn(parser.document,2)\n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 13, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "Machine learning algorithms build a mathematical model of sample data, known as \"training data\", in order to make predictions or decisions without being explicitly programmed to perform the task.\n", 150 | "Data mining is a field of study within machine learning, and focuses on exploratory data analysis through unsupervised learning.In its application across business problems, machine learning is also referred to as predictive analytics.\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "for sentence in summary_1:\n", 156 | " print(sentence)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "#### Using LSA\n", 164 | "+ Based on term frequency techniques with singular value decomposition to summarize texts." 
165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 15, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "from sumy.summarizers.lsa import LsaSummarizer\n" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 16, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "summarizer_lsa = LsaSummarizer()\n", 183 | "summary_2 =summarizer_lsa(parser.document,2)\n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 17, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance on a specific task.\n", 196 | "Machine learning is closely related to computational statistics, which focuses on making predictions using computers.\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "for sentence in summary_2:\n", 202 | " print(sentence)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 20, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "## Alternative Method using stopwords\n", 212 | "from sumy.nlp.stemmers import Stemmer\n", 213 | "from sumy.utils import get_stop_words\n", 214 | "summarizer_lsa2 = LsaSummarizer()\n", 215 | "summarizer_lsa2 = LsaSummarizer(Stemmer(\"english\"))\n", 216 | "summarizer_lsa2.stop_words = get_stop_words(\"english\")\n" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 21, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance on a specific task.\n", 229 | "The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "for sentence in summarizer_lsa2(parser.document,2):\n", 235 | " print(sentence)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 22, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "# Thanks\n", 245 | "# Jesse JCharis\n", 246 | "# J-Secur1ty\n", 247 | "# Jesus Saves@JCharisTech\n" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [] 256 | } 257 | ], 258 | "metadata": { 259 | "kernelspec": { 260 | "display_name": "Python 3", 261 | "language": "python", 262 | "name": "python3" 263 | }, 264 | "language_info": { 265 | "codemirror_mode": { 266 | "name": "ipython", 267 | "version": 3 268 | }, 269 | "file_extension": ".py", 270 | "mimetype": "text/x-python", 271 | "name": "python", 272 | "nbconvert_exporter": "python", 273 | "pygments_lexer": "ipython3", 274 | "version": "3.6.7" 275 | } 276 | }, 277 | "nbformat": 4, 278 | "nbformat_minor": 2 279 | } 280 | -------------------------------------------------------------------------------- /NLP_with_SpaCy/Automatic Redaction & Sanitization of Document Using Spacy NER.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## NLP with SpaCy\n", 8 | "### (Automatic Redaction/Sanitization of Documents Using SpaCy)\n", 9 | "\n", 10 | "#### 
Applications of Named Entity Recognition \n", 11 | "+ Sanitization is the process of removing sensitive information from a document or other message (or sometimes encrypting it), so that the document may be distributed to a broader audience\n", 12 | "#### Purpose of Sanitization/Redaction of Document\n", 13 | "- For anonymity of source in document\n", 14 | "- To ensure there is no sensitive or personally identifiable information in the document\n", 15 | "- Censorship\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | " ![](imageredacted.jpg)" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Load NLP Pkg\n", 32 | "import spacy" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# Create NLP object\n", 42 | "nlp = spacy.load('en')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "##### Procedure\n", 50 | "+ Using NER\n", 51 | "+ Locate Entities eg.Person or Place\n", 52 | "+ Replace with our word" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "ex1 = \"The reporter said that it was John Mark that gave him the news in London last year\"" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "docx1 = nlp(ex1)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 5, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "John Mark PERSON\n", 83 | "London GPE\n", 84 | "last year DATE\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "# Find Entities\n", 90 | "for ent in docx1.ents:\n", 91 | " print(ent.text,ent.label_)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 6, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "# Function to Sanitize/Redact Names\n", 101 | "def sanitize_names(text):\n", 102 | " docx = nlp(text)\n", 103 | " redacted_sentences = []\n", 104 | " for ent in docx.ents:\n", 105 | " ent.merge()\n", 106 | " for token in docx:\n", 107 | " if token.ent_type_ == 'PERSON':\n", 108 | " redacted_sentences.append(\"[REDACTED]\")\n", 109 | " else:\n", 110 | " redacted_sentences.append(token.string)\n", 111 | " return \"\".join(redacted_sentences)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 7, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/plain": [ 122 | "'The reporter said that it was John Mark that gave him the news in London last year'" 123 | ] 124 | }, 125 | "execution_count": 7, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "ex1" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 8, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "'The reporter said that it was [REDACTED]that gave him the news in London last year'" 143 | ] 144 | }, 145 | "execution_count": 8, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "# Redact the Names\n", 152 | "sanitize_names(ex1)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 9, 158 | "metadata": {}, 159 | "outputs": [], 160 | 
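One caveat about the sanitize_names cell above: ent.merge() and token.string come from older spaCy releases and are no longer available in spaCy 3.x, and building the result from token strings is also why "[REDACTED]that" above loses its space. A character-offset version sidesteps both issues — a minimal sketch (en_core_web_sm is an assumption; any installed English model works):

import spacy

nlp = spacy.load("en_core_web_sm")

def redact_entities(text, labels=("PERSON",)):
    doc = nlp(text)
    redacted = text
    # Replace spans from the end of the string so earlier offsets stay valid.
    for ent in reversed(doc.ents):
        if ent.label_ in labels:
            redacted = redacted[:ent.start_char] + "[REDACTED]" + redacted[ent.end_char:]
    return redacted

sentence = "The reporter said that it was John Mark that gave him the news in London last year"
print(redact_entities(sentence))                    # redact people
print(redact_entities(sentence, labels=("GPE",)))   # redact locations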
"source": [ 161 | "# Visualization of Entities\n", 162 | "from spacy import displacy" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 10, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "data": { 172 | "text/html": [ 173 | "
The reporter said that it was \n", 174 | "\n", 175 | " John Mark\n", 176 | " PERSON\n", 177 | "\n", 178 | " that gave him the news in \n", 179 | "\n", 180 | " London\n", 181 | " GPE\n", 182 | "\n", 183 | " \n", 184 | "\n", 185 | " last year\n", 186 | " DATE\n", 187 | "\n", 188 | "
" 189 | ], 190 | "text/plain": [ 191 | "" 192 | ] 193 | }, 194 | "metadata": {}, 195 | "output_type": "display_data" 196 | } 197 | ], 198 | "source": [ 199 | "displacy.render(nlp(ex1),style='ent',jupyter=True)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 11, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "# Apply the function and visualize it\n", 209 | "docx2 = sanitize_names(ex1)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 12, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/html": [ 220 | "
The reporter said that it was [REDACTED]that gave him the news in \n", 221 | "\n", 222 | " London\n", 223 | " GPE\n", 224 | "\n", 225 | " \n", 226 | "\n", 227 | " last year\n", 228 | " DATE\n", 229 | "\n", 230 | "
" 231 | ], 232 | "text/plain": [ 233 | "" 234 | ] 235 | }, 236 | "metadata": {}, 237 | "output_type": "display_data" 238 | } 239 | ], 240 | "source": [ 241 | "displacy.render(nlp(docx2),style='ent',jupyter=True)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "#### Redaction/Sanitization of Location/GPE" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 13, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "# Redaction of Location/GPE\n", 258 | "def sanitize_locations(text):\n", 259 | " docx = nlp(text)\n", 260 | " redacted_sentences = []\n", 261 | " for ent in docx.ents:\n", 262 | " ent.merge()\n", 263 | " for token in docx:\n", 264 | " if token.ent_type_ == 'GPE':\n", 265 | " redacted_sentences.append(\"[REDACTED]\")\n", 266 | " else:\n", 267 | " redacted_sentences.append(token.string)\n", 268 | " return \"\".join(redacted_sentences)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 14, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/plain": [ 279 | "'The reporter said that it was John Mark that gave him the news in [REDACTED]last year'" 280 | ] 281 | }, 282 | "execution_count": 14, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "sanitize_locations(ex1)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 15, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "## Thanks For Watching\n", 298 | "# Jesse JCharis\n", 299 | "# J-Secur1ty\n", 300 | "# Jesus Saves@JCharisTech" 301 | ] 302 | } 303 | ], 304 | "metadata": { 305 | "kernelspec": { 306 | "display_name": "Python 3", 307 | "language": "python", 308 | "name": "python3" 309 | }, 310 | "language_info": { 311 | "codemirror_mode": { 312 | "name": "ipython", 313 | "version": 3 314 | }, 315 | "file_extension": ".py", 316 | "mimetype": "text/x-python", 317 | "name": "python", 318 | "nbconvert_exporter": "python", 319 | "pygments_lexer": "ipython3", 320 | "version": "3.6.7" 321 | } 322 | }, 323 | "nbformat": 4, 324 | "nbformat_minor": 2 325 | } 326 | -------------------------------------------------------------------------------- /How to Find the Most Common Words Using Spacy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### How to Find the Most Common Words Using SpaCy & Python" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 7, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Load Our Packages\n", 17 | "import spacy\n", 18 | "from collections import Counter" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 8, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "nlp = spacy.load('en')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 9, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "docx = nlp(open('luke6.txt').read())" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 10, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "Blessings and Woes\n", 48 | "He went down with them and stood on a level place. 
A large crowd of his disciples was there and a great number of people from all over Judea, from Jerusalem, and from the coastal region around Tyre and Sidon, who had come to hear him and to be healed of their diseases. Those troubled by impure spirits were cured, and the people all tried to touch him, because power was coming from him and healing them all. Looking at his disciples, he said: \"Blessed are you who are poor, for yours is the kingdom of God. Blessed are you who hunger now, for you will be satisfied. Blessed are you who weep now, for you will laugh. Blessed are you when people hate you, when they exclude you and insult you and reject your name as evil, because of the Son of Man. \"Rejoice in that day and leap for joy, because great is your reward in heaven. For that is how their ancestors treated the prophets. \"But woe to you who are rich, for you have already received your comfort. Woe to you who are well fed now, for you will go hungry. Woe to you who laugh now, for you will mourn and weep. Woe to you when everyone speaks well of you, for that is how their ancestors treated the false prophets.\n", 49 | "Love for Enemies\n", 50 | "\"But to you who are listening I say: Love your enemies, do good to those who hate you, bless those who curse you, pray for those who mistreat you. If someone slaps you on one cheek, turn to them the other also. If someone takes your coat, do not withhold your shirt from them. Give to everyone who asks you, and if anyone takes what belongs to you, do not demand it back. Do to others as you would have them do to you. \"If you love those who love you, what credit is that to you? Even sinners love those who love them. And if you do good to those who are good to you, what credit is that to you? Even sinners do that. And if you lend to those from whom you expect repayment, what credit is that to you? Even sinners lend to sinners, expecting to be repaid in full. But love your enemies, do good to them, and lend to them without expecting to get anything back. Then your reward will be great, and you will be children of the Most High, because he is kind to the ungrateful and wicked. Be merciful, just as your Father is merciful.\n", 51 | "Judging Others\n", 52 | "\"Do not judge, and you will not be judged. Do not condemn, and you will not be condemned. Forgive, and you will be forgiven. Give, and it will be given to you. A good measure, pressed down, shaken together and running over, will be poured into your lap. For with the measure you use, it will be measured to you.\" He also told them this parable: \"Can the blind lead the blind? Will they not both fall into a pit? The student is not above the teacher, but everyone who is fully trained will be like their teacher. \"Why do you look at the speck of sawdust in your brother’s eye and pay no attention to the plank in your own eye? How can you say to your brother, 'Brother, let me take the speck out of your eye,'when you yourself fail to see the plank in your own eye? You hypocrite, first take the plank out of your eye, and then you will see clearly to remove the speck from your brother’s eye.\n", 53 | "A Tree and Its Fruit\n", 54 | "\"No good tree bears bad fruit, nor does a bad tree bear good fruit. Each tree is recognized by its own fruit. People do not pick figs from thornbushes, or grapes from briers. A good man brings good things out of the good stored up in his heart, and an evil man brings evil things out of the evil stored up in his heart. 
For the mouth speaks what the heart is full of.\n", 55 | "The Wise and Foolish Builders\n", 56 | "\"Why do you call me, 'Lord, Lord,'and do not do what I say? As for everyone who comes to me and hears my words and puts them into practice, I will show you what they are like. They are like a man building a house, who dug down deep and laid the foundation on rock. When a flood came, the torrent struck that house but could not shake it, because it was well built. But the one who hears my words and does not put them into practice is like a man who built a house on the ground without a foundation. The moment the torrent struck that house, it collapsed and its destruction was complete.\"" 57 | ] 58 | }, 59 | "execution_count": 10, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "docx" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 12, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# Remove Punct,Stop \n", 75 | "# Nouns\n", 76 | "nouns = [ token.text for token in docx if token.is_stop != True and token.is_punct !=True and token.pos_ == 'NOUN']\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 13, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "['Blessings',\n", 88 | " 'level',\n", 89 | " 'place',\n", 90 | " 'crowd',\n", 91 | " 'disciples',\n", 92 | " 'number',\n", 93 | " 'people',\n", 94 | " 'region',\n", 95 | " 'diseases',\n", 96 | " 'impure',\n", 97 | " 'spirits',\n", 98 | " 'people',\n", 99 | " 'power',\n", 100 | " 'disciples',\n", 101 | " 'kingdom',\n", 102 | " 'people',\n", 103 | " 'evil',\n", 104 | " 'day',\n", 105 | " 'leap',\n", 106 | " 'joy',\n", 107 | " 'reward',\n", 108 | " 'ancestors',\n", 109 | " 'prophets',\n", 110 | " 'woe',\n", 111 | " 'comfort',\n", 112 | " 'Woe',\n", 113 | " 'Woe',\n", 114 | " 'ancestors',\n", 115 | " 'prophets',\n", 116 | " 'enemies',\n", 117 | " 'good',\n", 118 | " 'cheek',\n", 119 | " 'coat',\n", 120 | " 'shirt',\n", 121 | " 'credit',\n", 122 | " 'sinners',\n", 123 | " 'good',\n", 124 | " 'credit',\n", 125 | " 'sinners',\n", 126 | " 'repayment',\n", 127 | " 'credit',\n", 128 | " 'sinners',\n", 129 | " 'sinners',\n", 130 | " 'enemies',\n", 131 | " 'good',\n", 132 | " 'reward',\n", 133 | " 'children',\n", 134 | " 'Others',\n", 135 | " 'measure',\n", 136 | " 'lap',\n", 137 | " 'measure',\n", 138 | " 'parable',\n", 139 | " 'pit',\n", 140 | " 'student',\n", 141 | " 'teacher',\n", 142 | " 'teacher',\n", 143 | " 'speck',\n", 144 | " 'sawdust',\n", 145 | " 'brotherâ€',\n", 146 | " 'eye',\n", 147 | " 'attention',\n", 148 | " 'plank',\n", 149 | " 'eye',\n", 150 | " 'brother',\n", 151 | " 'Brother',\n", 152 | " 'speck',\n", 153 | " 'plank',\n", 154 | " 'eye',\n", 155 | " 'plank',\n", 156 | " 'eye',\n", 157 | " 'speck',\n", 158 | " 'brotherâ€',\n", 159 | " 'eye',\n", 160 | " 'tree',\n", 161 | " 'fruit',\n", 162 | " 'tree',\n", 163 | " 'fruit',\n", 164 | " 'tree',\n", 165 | " 'fruit',\n", 166 | " 'People',\n", 167 | " 'figs',\n", 168 | " 'thornbushes',\n", 169 | " 'grapes',\n", 170 | " 'briers',\n", 171 | " 'man',\n", 172 | " 'things',\n", 173 | " 'good',\n", 174 | " 'heart',\n", 175 | " 'man',\n", 176 | " 'things',\n", 177 | " 'evil',\n", 178 | " 'heart',\n", 179 | " 'mouth',\n", 180 | " 'heart',\n", 181 | " 'words',\n", 182 | " 'practice',\n", 183 | " 'man',\n", 184 | " 'house',\n", 185 | " 'foundation',\n", 186 | " 'rock',\n", 187 | " 'flood',\n", 188 | " 'torrent',\n", 189 | " 'house',\n", 190 | " 'words',\n", 191 | " 
'practice',\n", 192 | " 'man',\n", 193 | " 'house',\n", 194 | " 'ground',\n", 195 | " 'foundation',\n", 196 | " 'moment',\n", 197 | " 'torrent',\n", 198 | " 'house',\n", 199 | " 'destruction']" 200 | ] 201 | }, 202 | "execution_count": 13, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "nouns" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 14, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "word_freq = Counter(nouns)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 15, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "common_nouns = word_freq.most_common(10)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 16, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "name": "stdout", 236 | "output_type": "stream", 237 | "text": [ 238 | "[('eye', 5), ('good', 4), ('sinners', 4), ('man', 4), ('house', 4), ('people', 3), ('credit', 3), ('speck', 3), ('plank', 3), ('tree', 3)]\n" 239 | ] 240 | } 241 | ], 242 | "source": [ 243 | "print(common_nouns)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "### Most Common Verbs\n", 258 | "+ Some stops words can also be verbs" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 17, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "# Remove Punct,Stop \n", 268 | "# verbs\n", 269 | "verbs = [ token.text for token in docx if token.is_punct !=True and token.pos_ == 'VERB']\n" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 18, 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "name": "stdout", 279 | "output_type": "stream", 280 | "text": [ 281 | "[('will', 15), ('is', 14), ('be', 12), ('do', 12), ('are', 11), ('love', 5), ('was', 4), ('Blessed', 4), ('say', 3), ('Do', 3)]\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "print(Counter(verbs).most_common(10))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 19, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "verbs_with_stopword = [ token.text for token in docx if token.is_stop != True and token.is_punct !=True and token.pos_ == 'VERB']" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 20, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "[('love', 5), ('Blessed', 4), ('Do', 3), ('lend', 3), ('weep', 2), ('laugh', 2), ('hate', 2), ('treated', 2), ('speaks', 2), ('takes', 2)]\n" 308 | ] 309 | } 310 | ], 311 | "source": [ 312 | "print(Counter(verbs_with_stopword).most_common(10))" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [] 321 | } 322 | ], 323 | "metadata": { 324 | "kernelspec": { 325 | "display_name": "Python 3", 326 | "language": "python", 327 | "name": "python3" 328 | }, 329 | "language_info": { 330 | "codemirror_mode": { 331 | "name": "ipython", 332 | "version": 3 333 | }, 334 | "file_extension": ".py", 335 | "mimetype": "text/x-python", 336 | "name": "python", 337 | "nbconvert_exporter": "python", 338 | "pygments_lexer": "ipython3", 339 | "version": "3.6.3" 340 | } 341 | }, 342 | "nbformat": 4, 343 | 
"nbformat_minor": 2 344 | } 345 | -------------------------------------------------------------------------------- /NLP_with_SpaCy/How to Find the Most Common Words Using Spacy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### How to Find the Most Common Words Using SpaCy & Python" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 7, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Load Our Packages\n", 17 | "import spacy\n", 18 | "from collections import Counter" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 8, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "nlp = spacy.load('en')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 9, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "docx = nlp(open('luke6.txt').read())" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 10, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "Blessings and Woes\n", 48 | "He went down with them and stood on a level place. A large crowd of his disciples was there and a great number of people from all over Judea, from Jerusalem, and from the coastal region around Tyre and Sidon, who had come to hear him and to be healed of their diseases. Those troubled by impure spirits were cured, and the people all tried to touch him, because power was coming from him and healing them all. Looking at his disciples, he said: \"Blessed are you who are poor, for yours is the kingdom of God. Blessed are you who hunger now, for you will be satisfied. Blessed are you who weep now, for you will laugh. Blessed are you when people hate you, when they exclude you and insult you and reject your name as evil, because of the Son of Man. \"Rejoice in that day and leap for joy, because great is your reward in heaven. For that is how their ancestors treated the prophets. \"But woe to you who are rich, for you have already received your comfort. Woe to you who are well fed now, for you will go hungry. Woe to you who laugh now, for you will mourn and weep. Woe to you when everyone speaks well of you, for that is how their ancestors treated the false prophets.\n", 49 | "Love for Enemies\n", 50 | "\"But to you who are listening I say: Love your enemies, do good to those who hate you, bless those who curse you, pray for those who mistreat you. If someone slaps you on one cheek, turn to them the other also. If someone takes your coat, do not withhold your shirt from them. Give to everyone who asks you, and if anyone takes what belongs to you, do not demand it back. Do to others as you would have them do to you. \"If you love those who love you, what credit is that to you? Even sinners love those who love them. And if you do good to those who are good to you, what credit is that to you? Even sinners do that. And if you lend to those from whom you expect repayment, what credit is that to you? Even sinners lend to sinners, expecting to be repaid in full. But love your enemies, do good to them, and lend to them without expecting to get anything back. Then your reward will be great, and you will be children of the Most High, because he is kind to the ungrateful and wicked. Be merciful, just as your Father is merciful.\n", 51 | "Judging Others\n", 52 | "\"Do not judge, and you will not be judged. Do not condemn, and you will not be condemned. 
Forgive, and you will be forgiven. Give, and it will be given to you. A good measure, pressed down, shaken together and running over, will be poured into your lap. For with the measure you use, it will be measured to you.\" He also told them this parable: \"Can the blind lead the blind? Will they not both fall into a pit? The student is not above the teacher, but everyone who is fully trained will be like their teacher. \"Why do you look at the speck of sawdust in your brother’s eye and pay no attention to the plank in your own eye? How can you say to your brother, 'Brother, let me take the speck out of your eye,'when you yourself fail to see the plank in your own eye? You hypocrite, first take the plank out of your eye, and then you will see clearly to remove the speck from your brother’s eye.\n", 53 | "A Tree and Its Fruit\n", 54 | "\"No good tree bears bad fruit, nor does a bad tree bear good fruit. Each tree is recognized by its own fruit. People do not pick figs from thornbushes, or grapes from briers. A good man brings good things out of the good stored up in his heart, and an evil man brings evil things out of the evil stored up in his heart. For the mouth speaks what the heart is full of.\n", 55 | "The Wise and Foolish Builders\n", 56 | "\"Why do you call me, 'Lord, Lord,'and do not do what I say? As for everyone who comes to me and hears my words and puts them into practice, I will show you what they are like. They are like a man building a house, who dug down deep and laid the foundation on rock. When a flood came, the torrent struck that house but could not shake it, because it was well built. But the one who hears my words and does not put them into practice is like a man who built a house on the ground without a foundation. The moment the torrent struck that house, it collapsed and its destruction was complete.\"" 57 | ] 58 | }, 59 | "execution_count": 10, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "docx" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 12, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# Remove Punct,Stop \n", 75 | "# Nouns\n", 76 | "nouns = [ token.text for token in docx if token.is_stop != True and token.is_punct !=True and token.pos_ == 'NOUN']\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 13, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "['Blessings',\n", 88 | " 'level',\n", 89 | " 'place',\n", 90 | " 'crowd',\n", 91 | " 'disciples',\n", 92 | " 'number',\n", 93 | " 'people',\n", 94 | " 'region',\n", 95 | " 'diseases',\n", 96 | " 'impure',\n", 97 | " 'spirits',\n", 98 | " 'people',\n", 99 | " 'power',\n", 100 | " 'disciples',\n", 101 | " 'kingdom',\n", 102 | " 'people',\n", 103 | " 'evil',\n", 104 | " 'day',\n", 105 | " 'leap',\n", 106 | " 'joy',\n", 107 | " 'reward',\n", 108 | " 'ancestors',\n", 109 | " 'prophets',\n", 110 | " 'woe',\n", 111 | " 'comfort',\n", 112 | " 'Woe',\n", 113 | " 'Woe',\n", 114 | " 'ancestors',\n", 115 | " 'prophets',\n", 116 | " 'enemies',\n", 117 | " 'good',\n", 118 | " 'cheek',\n", 119 | " 'coat',\n", 120 | " 'shirt',\n", 121 | " 'credit',\n", 122 | " 'sinners',\n", 123 | " 'good',\n", 124 | " 'credit',\n", 125 | " 'sinners',\n", 126 | " 'repayment',\n", 127 | " 'credit',\n", 128 | " 'sinners',\n", 129 | " 'sinners',\n", 130 | " 'enemies',\n", 131 | " 'good',\n", 132 | " 'reward',\n", 133 | " 'children',\n", 134 | " 'Others',\n", 135 | " 'measure',\n", 136 | " 
'lap',\n", 137 | " 'measure',\n", 138 | " 'parable',\n", 139 | " 'pit',\n", 140 | " 'student',\n", 141 | " 'teacher',\n", 142 | " 'teacher',\n", 143 | " 'speck',\n", 144 | " 'sawdust',\n", 145 | " 'brotherâ€',\n", 146 | " 'eye',\n", 147 | " 'attention',\n", 148 | " 'plank',\n", 149 | " 'eye',\n", 150 | " 'brother',\n", 151 | " 'Brother',\n", 152 | " 'speck',\n", 153 | " 'plank',\n", 154 | " 'eye',\n", 155 | " 'plank',\n", 156 | " 'eye',\n", 157 | " 'speck',\n", 158 | " 'brotherâ€',\n", 159 | " 'eye',\n", 160 | " 'tree',\n", 161 | " 'fruit',\n", 162 | " 'tree',\n", 163 | " 'fruit',\n", 164 | " 'tree',\n", 165 | " 'fruit',\n", 166 | " 'People',\n", 167 | " 'figs',\n", 168 | " 'thornbushes',\n", 169 | " 'grapes',\n", 170 | " 'briers',\n", 171 | " 'man',\n", 172 | " 'things',\n", 173 | " 'good',\n", 174 | " 'heart',\n", 175 | " 'man',\n", 176 | " 'things',\n", 177 | " 'evil',\n", 178 | " 'heart',\n", 179 | " 'mouth',\n", 180 | " 'heart',\n", 181 | " 'words',\n", 182 | " 'practice',\n", 183 | " 'man',\n", 184 | " 'house',\n", 185 | " 'foundation',\n", 186 | " 'rock',\n", 187 | " 'flood',\n", 188 | " 'torrent',\n", 189 | " 'house',\n", 190 | " 'words',\n", 191 | " 'practice',\n", 192 | " 'man',\n", 193 | " 'house',\n", 194 | " 'ground',\n", 195 | " 'foundation',\n", 196 | " 'moment',\n", 197 | " 'torrent',\n", 198 | " 'house',\n", 199 | " 'destruction']" 200 | ] 201 | }, 202 | "execution_count": 13, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "nouns" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 14, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "word_freq = Counter(nouns)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 15, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "common_nouns = word_freq.most_common(10)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 16, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "name": "stdout", 236 | "output_type": "stream", 237 | "text": [ 238 | "[('eye', 5), ('good', 4), ('sinners', 4), ('man', 4), ('house', 4), ('people', 3), ('credit', 3), ('speck', 3), ('plank', 3), ('tree', 3)]\n" 239 | ] 240 | } 241 | ], 242 | "source": [ 243 | "print(common_nouns)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "### Most Common Verbs\n", 258 | "+ Some stops words can also be verbs" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 17, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "# Remove Punct,Stop \n", 268 | "# verbs\n", 269 | "verbs = [ token.text for token in docx if token.is_punct !=True and token.pos_ == 'VERB']\n" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 18, 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "name": "stdout", 279 | "output_type": "stream", 280 | "text": [ 281 | "[('will', 15), ('is', 14), ('be', 12), ('do', 12), ('are', 11), ('love', 5), ('was', 4), ('Blessed', 4), ('say', 3), ('Do', 3)]\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "print(Counter(verbs).most_common(10))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 19, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "verbs_with_stopword = [ token.text for token in docx if 
token.is_stop != True and token.is_punct !=True and token.pos_ == 'VERB']" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 20, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "[('love', 5), ('Blessed', 4), ('Do', 3), ('lend', 3), ('weep', 2), ('laugh', 2), ('hate', 2), ('treated', 2), ('speaks', 2), ('takes', 2)]\n" 308 | ] 309 | } 310 | ], 311 | "source": [ 312 | "print(Counter(verbs_with_stopword).most_common(10))" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [] 321 | } 322 | ], 323 | "metadata": { 324 | "kernelspec": { 325 | "display_name": "Python 3", 326 | "language": "python", 327 | "name": "python3" 328 | }, 329 | "language_info": { 330 | "codemirror_mode": { 331 | "name": "ipython", 332 | "version": 3 333 | }, 334 | "file_extension": ".py", 335 | "mimetype": "text/x-python", 336 | "name": "python", 337 | "nbconvert_exporter": "python", 338 | "pygments_lexer": "ipython3", 339 | "version": "3.6.3" 340 | } 341 | }, 342 | "nbformat": 4, 343 | "nbformat_minor": 2 344 | } 345 | -------------------------------------------------------------------------------- /NLP_with_Polyglot/NLP with Polyglot .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Natural Language Processing with Polyglot\n", 8 | "\n", 9 | "#### Installation on Unix\n", 10 | "+ sudo apt-get install python-numpy libicu-dev\n", 11 | "+ pip install polyglot\n", 12 | "\n", 13 | "#### Installation on Windows\n", 14 | "\n", 15 | "##### Download the PyCLD2 and PyICU From \n", 16 | " - https://www.lfd.uci.edu/~gohlke/pythonlibs/\n", 17 | "- pip install pycld2-0.31-cp36-cp36m-win_amd64.whl\n", 18 | "- pip install PyICU-1.9.8-cp36-cp36m-win_amd64.whl\n", 19 | "- pip install Morfessor-2.0.4-py2.py3-none-any.whl\n", 20 | "- git clone https://github.com/aboSamoor/polyglot.git\n", 21 | "- python setup.py install\n", 22 | "\n", 23 | "\n", 24 | "- polyglot download embeddings2.en\n", 25 | "- polyglot download ner2.en\n", 26 | "- polyglot download sentiment2.en\n", 27 | "- polyglot download pos2.en\n", 28 | "- polyglot download morph2.en\n", 29 | "- polyglot download transliteration2.ar\n", 30 | "\n", 31 | "#### Uses and Application\n", 32 | "+ Fundamentals or Basics of NLP\n", 33 | "+ Transliteration\n", 34 | "+ Named Entity Recognition\n", 35 | "+ Sentiment Analysis\n", 36 | "\n", 37 | "##### NB similar learning curve like TextBlob API" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "#### Tokenization\n", 45 | "+ Splitting text into words" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 47, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# Load packages\n", 55 | "import polyglot\n", 56 | "from polyglot.text import Text,Word" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 48, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# Word Tokens\n", 66 | "docx = Text(u\"He likes reading and painting\")\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 49, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "WordList(['He', 'likes', 'reading', 'and', 'painting'])" 78 | ] 79 | }, 80 | "execution_count": 49, 81 | "metadata": {}, 82 
| "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "docx.words" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 50, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "docx2 = Text(u\"He exclaimed, 'what're you doing? Reading?'.\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 51, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "WordList(['He', 'exclaimed', ',', \"'\", \"what're\", 'you', 'doing', '?', 'Reading', '?', \"'\", '.'])" 107 | ] 108 | }, 109 | "execution_count": 51, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "docx2.words" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 52, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# Sentence tokens\n", 125 | "docx3 = Text(u\"He likes reading and painting.He exclaimed, 'what're you doing? Reading?'.\")" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 53, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "[Sentence(\"He likes reading and painting.He exclaimed, 'what're you doing?\"),\n", 137 | " Sentence(\"Reading?'.\")]" 138 | ] 139 | }, 140 | "execution_count": 53, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "docx3.sentences" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "#### Parts of Speech Tagging\n", 161 | "+ polyglot download embeddings2.la\n", 162 | "+ pos_tags\n" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 54, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "data": { 172 | "text/plain": [ 173 | "Text(\"He likes reading and painting\")" 174 | ] 175 | }, 176 | "execution_count": 54, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "docx" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 55, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "[('He', 'PRON'),\n", 194 | " ('likes', 'VERB'),\n", 195 | " ('reading', 'VERB'),\n", 196 | " ('and', 'CONJ'),\n", 197 | " ('painting', 'NOUN')]" 198 | ] 199 | }, 200 | "execution_count": 55, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "docx.pos_tags\n", 207 | " " 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "#### Language Detection\n", 215 | "+ polyglot.detect\n", 216 | "+ language.name\n", 217 | "+ language.code" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 56, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/plain": [ 228 | "Text(\"He likes reading and painting\")" 229 | ] 230 | }, 231 | "execution_count": 56, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "docx" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 57, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "data": { 247 | "text/plain": [ 248 | "'English'" 249 | ] 250 | }, 251 | "execution_count": 57, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 
256 | "source": [ 257 | "docx.language.name" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 58, 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "data": { 267 | "text/plain": [ 268 | "'en'" 269 | ] 270 | }, 271 | "execution_count": 58, 272 | "metadata": {}, 273 | "output_type": "execute_result" 274 | } 275 | ], 276 | "source": [ 277 | "docx.language.code" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 59, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "from polyglot.detect import Detector" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 60, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "en_text = \"He is a student \"\n", 296 | "fr_text = \"Il est un étudiant\"\n", 297 | "ru_text = \"Он студент\"" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 67, 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "name": "stderr", 307 | "output_type": "stream", 308 | "text": [ 309 | "Detector is not able to detect the language reliably.\n", 310 | "Detector is not able to detect the language reliably.\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "detect_en = Detector(en_text)\n", 316 | "detect_fr = Detector(fr_text)\n", 317 | "detect_ru = Detector(ru_text)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 63, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "name": "stdout", 327 | "output_type": "stream", 328 | "text": [ 329 | "name: English code: en confidence: 94.0 read bytes: 704\n" 330 | ] 331 | } 332 | ], 333 | "source": [ 334 | "print(detect_en.language)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 66, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "name": "stdout", 344 | "output_type": "stream", 345 | "text": [ 346 | "name: French code: fr confidence: 95.0 read bytes: 870\n" 347 | ] 348 | } 349 | ], 350 | "source": [ 351 | "print(detect_fr.language)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 68, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "name: Serbian code: sr confidence: 95.0 read bytes: 614\n" 364 | ] 365 | } 366 | ], 367 | "source": [ 368 | "print(detect_ru.language)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "#### Sentiment Analysis\n", 383 | "+ polarity" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 71, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "docx4 = Text(u\"He hates reading and playing\")" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 69, 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "text/plain": [ 403 | "Text(\"He likes reading and painting\")" 404 | ] 405 | }, 406 | "execution_count": 69, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "docx" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 70, 418 | "metadata": {}, 419 | "outputs": [ 420 | { 421 | "data": { 422 | "text/plain": [ 423 | "1.0" 424 | ] 425 | }, 426 | "execution_count": 70, 427 | "metadata": {}, 428 | "output_type": "execute_result" 429 | } 430 | ], 431 | "source": [ 432 | 
"docx.polarity" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 72, 438 | "metadata": {}, 439 | "outputs": [ 440 | { 441 | "data": { 442 | "text/plain": [ 443 | "-1.0" 444 | ] 445 | }, 446 | "execution_count": 72, 447 | "metadata": {}, 448 | "output_type": "execute_result" 449 | } 450 | ], 451 | "source": [ 452 | "docx4.polarity" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "#### Named Entities\n", 460 | "+ entities" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 73, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "docx5 = Text(u\"John Jones was a FBI detector\")" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 74, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/plain": [ 480 | "[I-PER(['John', 'Jones']), I-ORG(['FBI'])]" 481 | ] 482 | }, 483 | "execution_count": 74, 484 | "metadata": {}, 485 | "output_type": "execute_result" 486 | } 487 | ], 488 | "source": [ 489 | "docx5.entities" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": {}, 495 | "source": [ 496 | "#### Morphology\n", 497 | "+ morpheme is the smallest grammatical unit in a language. \n", 498 | "+ morpheme may or may not stand alone, word, by definition, is freestanding. \n", 499 | "+ morphemes" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 75, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "docx6 = Text(u\"preprocessing\")" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 76, 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "data": { 518 | "text/plain": [ 519 | "WordList(['pre', 'process', 'ing'])" 520 | ] 521 | }, 522 | "execution_count": 76, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | } 526 | ], 527 | "source": [ 528 | "docx6.morphemes" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "#### Transliteration" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": 77, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "# Load \n", 545 | "from polyglot.transliteration import Transliterator\n", 546 | "translit = Transliterator(source_lang='en',target_lang='fr')" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 78, 552 | "metadata": {}, 553 | "outputs": [ 554 | { 555 | "data": { 556 | "text/plain": [ 557 | "'working'" 558 | ] 559 | }, 560 | "execution_count": 78, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "translit.transliterate(u\"working\")" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "# Jesse JCharis\n", 583 | "# J-Secur1ty\n", 584 | "# Jesus Saves @JCharisTect" 585 | ] 586 | } 587 | ], 588 | "metadata": { 589 | "kernelspec": { 590 | "display_name": "Python 3", 591 | "language": "python", 592 | "name": "python3" 593 | }, 594 | "language_info": { 595 | "codemirror_mode": { 596 | "name": "ipython", 597 | "version": 3 598 | }, 599 | "file_extension": ".py", 600 | "mimetype": "text/x-python", 601 | "name": "python", 602 | "nbconvert_exporter": "python", 603 | "pygments_lexer": "ipython3", 604 | 
"version": "3.6.7" 605 | } 606 | }, 607 | "nbformat": 4, 608 | "nbformat_minor": 2 609 | } 610 | -------------------------------------------------------------------------------- /Intent Classification With Rasa - Spacy/Intent Classification With Rasa NLU and SpaCy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Intent Classification with Rasa NLU and SpaCy \n", 8 | "+ + A Libary for intent recognition and entity extraction based on SpaCy and Sklearn\n", 9 | "\n", 10 | "##### NLP = NLU+NLG+ More\n", 11 | "+ NLP = understand,process,interprete everyday human language\n", 12 | "+ NLU = unstructured inputs and convert them into a structured form that a machine can understand and act upon\n", 13 | "\n", 14 | "#### Uses\n", 15 | "+ Chatbot task\n", 16 | "+ NL understanding\n", 17 | "+ Intent classification\n", 18 | "\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "![alt text](nlu_nlp_explain.png \"Title\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "#### Installation\n", 33 | "+ pip install rasa_nlu\n", 34 | "+ python -m rasa_nlu.server &\n", 35 | "+ sklearn_crfsuite\n", 36 | "\n", 37 | "#### using spacy as backend\n", 38 | "+ pip install rasa_nlu[spacy]\n", 39 | "+ python -m spacy download en_core_web_md\n", 40 | "+ python -m spacy link en_core_web_md en\n", 41 | " \n", 42 | " = = Dataset = =\n", 43 | "+ demo-rasa.json\n", 44 | "+ config_spacy.yaml" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 15, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Load the Packages\n", 54 | "from rasa_nlu.training_data import load_data\n", 55 | "from rasa_nlu.config import RasaNLUModelConfig\n", 56 | "from rasa_nlu.model import Trainer\n", 57 | "from rasa_nlu import config" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 16, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "{\n", 70 | " \"rasa_nlu_data\": {\n", 71 | " \"regex_features\": [\n", 72 | " {\n", 73 | " \"name\": \"zipcode\",\n", 74 | " \"pattern\": \"[0-9]{5}\"\n", 75 | " },\n", 76 | " {\n", 77 | " \"name\": \"greet\",\n", 78 | " \"pattern\": \"hey[^\\\\s]*\"\n", 79 | " }\n", 80 | " ],\n", 81 | " \"entity_synonyms\": [\n", 82 | " {\n", 83 | " \"value\": \"chinese\",\n", 84 | " \"synonyms\": [\"Chinese\", \"Chines\", \"chines\"]\n", 85 | " },\n", 86 | " {\n", 87 | " \"value\": \"vegetarian\",\n", 88 | " \"synonyms\": [\"veggie\", \"vegg\"]\n", 89 | " }\n", 90 | " ],\n", 91 | " \"common_examples\": [\n", 92 | " {\n", 93 | " \"text\": \"hey\", \n", 94 | " \"intent\": \"greet\", \n", 95 | " \"entities\": []\n", 96 | " }, \n", 97 | " {\n", 98 | " \"text\": \"howdy\", \n", 99 | " \"intent\": \"greet\", \n", 100 | " \"entities\": []\n", 101 | " }, \n", 102 | " {\n", 103 | " \"text\": \"hey there\",\n", 104 | " \"intent\": \"greet\", \n", 105 | " \"entities\": []\n", 106 | " }, \n", 107 | " {\n", 108 | " \"text\": \"hello\", \n", 109 | " \"intent\": \"greet\", \n", 110 | " \"entities\": []\n", 111 | " }, \n", 112 | " {\n", 113 | " \"text\": \"hi\", \n", 114 | " \"intent\": \"greet\", \n", 115 | " \"entities\": []\n", 116 | " },\n", 117 | " {\n", 118 | " \"text\": \"good morning\",\n", 119 | " \"intent\": \"greet\",\n", 120 | " \"entities\": []\n", 121 | " },\n", 122 | " {\n", 123 | " 
\"text\": \"good evening\",\n", 124 | " \"intent\": \"greet\",\n", 125 | " \"entities\": []\n", 126 | " },\n", 127 | " {\n", 128 | " \"text\": \"dear sir\",\n", 129 | " \"intent\": \"greet\",\n", 130 | " \"entities\": []\n", 131 | " },\n", 132 | " {\n", 133 | " \"text\": \"yes\", \n", 134 | " \"intent\": \"affirm\", \n", 135 | " \"entities\": []\n", 136 | " }, \n", 137 | " {\n", 138 | " \"text\": \"yep\", \n", 139 | " \"intent\": \"affirm\", \n", 140 | " \"entities\": []\n", 141 | " }, \n", 142 | " {\n", 143 | " \"text\": \"yeah\", \n", 144 | " \"intent\": \"affirm\", \n", 145 | " \"entities\": []\n", 146 | " },\n", 147 | " {\n", 148 | " \"text\": \"indeed\",\n", 149 | " \"intent\": \"affirm\",\n", 150 | " \"entities\": []\n", 151 | " },\n", 152 | " {\n", 153 | " \"text\": \"that's right\",\n", 154 | " \"intent\": \"affirm\",\n", 155 | " \"entities\": []\n", 156 | " },\n", 157 | " {\n", 158 | " \"text\": \"ok\",\n", 159 | " \"intent\": \"affirm\",\n", 160 | " \"entities\": []\n", 161 | " },\n", 162 | " {\n", 163 | " \"text\": \"great\",\n", 164 | " \"intent\": \"affirm\",\n", 165 | " \"entities\": []\n", 166 | " },\n", 167 | " {\n", 168 | " \"text\": \"right, thank you\",\n", 169 | " \"intent\": \"affirm\",\n", 170 | " \"entities\": []\n", 171 | " },\n", 172 | " {\n", 173 | " \"text\": \"correct\",\n", 174 | " \"intent\": \"affirm\",\n", 175 | " \"entities\": []\n", 176 | " },\n", 177 | " {\n", 178 | " \"text\": \"great choice\",\n", 179 | " \"intent\": \"affirm\",\n", 180 | " \"entities\": []\n", 181 | " },\n", 182 | " {\n", 183 | " \"text\": \"sounds really good\",\n", 184 | " \"intent\": \"affirm\",\n", 185 | " \"entities\": []\n", 186 | " },\n", 187 | " {\n", 188 | " \"text\": \"i'm looking for a place to eat\",\n", 189 | " \"intent\": \"restaurant_search\",\n", 190 | " \"entities\": []\n", 191 | " },\n", 192 | " {\n", 193 | " \"text\": \"I want to grab lunch\",\n", 194 | " \"intent\": \"restaurant_search\",\n", 195 | " \"entities\": []\n", 196 | " },\n", 197 | " {\n", 198 | " \"text\": \"I am searching for a dinner spot\",\n", 199 | " \"intent\": \"restaurant_search\",\n", 200 | " \"entities\": []\n", 201 | " },\n", 202 | " {\n", 203 | " \"text\": \"i'm looking for a place in the north of town\",\n", 204 | " \"intent\": \"restaurant_search\",\n", 205 | " \"entities\": [\n", 206 | " {\n", 207 | " \"start\": 31,\n", 208 | " \"end\": 36,\n", 209 | " \"value\": \"north\",\n", 210 | " \"entity\": \"location\"\n", 211 | " }\n", 212 | " ]\n", 213 | " },\n", 214 | " {\n", 215 | " \"text\": \"show me chinese restaurants\",\n", 216 | " \"intent\": \"restaurant_search\",\n", 217 | " \"entities\": [\n", 218 | " {\n", 219 | " \"start\": 8,\n", 220 | " \"end\": 15,\n", 221 | " \"value\": \"chinese\",\n", 222 | " \"entity\": \"cuisine\"\n", 223 | " }\n", 224 | " ]\n", 225 | " },\n", 226 | " {\n", 227 | " \"text\": \"show me chines restaurants in the north\",\n", 228 | " \"intent\": \"restaurant_search\",\n", 229 | " \"entities\": [\n", 230 | " {\n", 231 | " \"start\": 8,\n", 232 | " \"end\": 14,\n", 233 | " \"value\": \"chinese\",\n", 234 | " \"entity\": \"cuisine\"\n", 235 | " },\n", 236 | " {\n", 237 | " \"start\": 34,\n", 238 | " \"end\": 39,\n", 239 | " \"value\": \"north\",\n", 240 | " \"entity\": \"location\"\n", 241 | " }\n", 242 | " ]\n", 243 | " },\n", 244 | " {\n", 245 | " \"text\": \"show me a mexican place in the centre\", \n", 246 | " \"intent\": \"restaurant_search\", \n", 247 | " \"entities\": [\n", 248 | " {\n", 249 | " \"start\": 31, \n", 250 | " \"end\": 37, \n", 251 | " \"value\": 
\"centre\", \n", 252 | " \"entity\": \"location\"\n", 253 | " }, \n", 254 | " {\n", 255 | " \"start\": 10, \n", 256 | " \"end\": 17, \n", 257 | " \"value\": \"mexican\", \n", 258 | " \"entity\": \"cuisine\"\n", 259 | " }\n", 260 | " ]\n", 261 | " },\n", 262 | " {\n", 263 | " \"text\": \"i am looking for an indian spot called olaolaolaolaolaola\",\n", 264 | " \"intent\": \"restaurant_search\",\n", 265 | " \"entities\": [\n", 266 | " {\n", 267 | " \"start\": 20,\n", 268 | " \"end\": 26,\n", 269 | " \"value\": \"indian\",\n", 270 | " \"entity\": \"cuisine\"\n", 271 | " }\n", 272 | " ]\n", 273 | " }, {\n", 274 | " \"text\": \"search for restaurants\",\n", 275 | " \"intent\": \"restaurant_search\",\n", 276 | " \"entities\": []\n", 277 | " },\n", 278 | " {\n", 279 | " \"text\": \"anywhere in the west\",\n", 280 | " \"intent\": \"restaurant_search\",\n", 281 | " \"entities\": [\n", 282 | " {\n", 283 | " \"start\": 16,\n", 284 | " \"end\": 20,\n", 285 | " \"value\": \"west\",\n", 286 | " \"entity\": \"location\"\n", 287 | " }\n", 288 | " ]\n", 289 | " },\n", 290 | " {\n", 291 | " \"text\": \"anywhere near 18328\",\n", 292 | " \"intent\": \"restaurant_search\",\n", 293 | " \"entities\": [\n", 294 | " {\n", 295 | " \"start\": 14,\n", 296 | " \"end\": 19,\n", 297 | " \"value\": \"18328\",\n", 298 | " \"entity\": \"location\"\n", 299 | " }\n", 300 | " ]\n", 301 | " },\n", 302 | " {\n", 303 | " \"text\": \"I am looking for asian fusion food\",\n", 304 | " \"intent\": \"restaurant_search\",\n", 305 | " \"entities\": [\n", 306 | " {\n", 307 | " \"start\": 17,\n", 308 | " \"end\": 29,\n", 309 | " \"value\": \"asian fusion\",\n", 310 | " \"entity\": \"cuisine\"\n", 311 | " }\n", 312 | " ]\n", 313 | " },\n", 314 | " {\n", 315 | " \"text\": \"I am looking a restaurant in 29432\",\n", 316 | " \"intent\": \"restaurant_search\",\n", 317 | " \"entities\": [\n", 318 | " {\n", 319 | " \"start\": 29,\n", 320 | " \"end\": 34,\n", 321 | " \"value\": \"29432\",\n", 322 | " \"entity\": \"location\"\n", 323 | " }\n", 324 | " ]\n", 325 | " },\n", 326 | " {\n", 327 | " \"text\": \"I am looking for mexican indian fusion\",\n", 328 | " \"intent\": \"restaurant_search\",\n", 329 | " \"entities\": [\n", 330 | " {\n", 331 | " \"start\": 17,\n", 332 | " \"end\": 38,\n", 333 | " \"value\": \"mexican indian fusion\",\n", 334 | " \"entity\": \"cuisine\"\n", 335 | " }\n", 336 | " ]\n", 337 | " },\n", 338 | " {\n", 339 | " \"text\": \"central indian restaurant\",\n", 340 | " \"intent\": \"restaurant_search\",\n", 341 | " \"entities\": [\n", 342 | " {\n", 343 | " \"start\": 0,\n", 344 | " \"end\": 7,\n", 345 | " \"value\": \"central\",\n", 346 | " \"entity\": \"location\"\n", 347 | " },\n", 348 | " {\n", 349 | " \"start\": 8,\n", 350 | " \"end\": 14,\n", 351 | " \"value\": \"indian\",\n", 352 | " \"entity\": \"cuisine\"\n", 353 | " }\n", 354 | " ]\n", 355 | " },\n", 356 | " {\n", 357 | " \"text\": \"bye\", \n", 358 | " \"intent\": \"goodbye\", \n", 359 | " \"entities\": []\n", 360 | " }, \n", 361 | " {\n", 362 | " \"text\": \"goodbye\", \n", 363 | " \"intent\": \"goodbye\", \n", 364 | " \"entities\": []\n", 365 | " }, \n", 366 | " {\n", 367 | " \"text\": \"good bye\", \n", 368 | " \"intent\": \"goodbye\", \n", 369 | " \"entities\": []\n", 370 | " }, \n", 371 | " {\n", 372 | " \"text\": \"stop\", \n", 373 | " \"intent\": \"goodbye\", \n", 374 | " \"entities\": []\n", 375 | " }, \n", 376 | " {\n", 377 | " \"text\": \"end\", \n", 378 | " \"intent\": \"goodbye\", \n", 379 | " \"entities\": []\n", 380 | " },\n", 381 | " {\n", 382 | " 
\"text\": \"farewell\",\n", 383 | " \"intent\": \"goodbye\",\n", 384 | " \"entities\": []\n", 385 | " },\n", 386 | " {\n", 387 | " \"text\": \"Bye bye\",\n", 388 | " \"intent\": \"goodbye\",\n", 389 | " \"entities\": []\n", 390 | " },\n", 391 | " {\n", 392 | " \"text\": \"have a good one\",\n", 393 | " \"intent\": \"goodbye\",\n", 394 | " \"entities\": []\n", 395 | " }\n", 396 | " ]\n", 397 | " }\n", 398 | "}\n" 399 | ] 400 | } 401 | ], 402 | "source": [ 403 | "# Load Data \n", 404 | "!cat rasa_dataset.json" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 17, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "# Loading DataSet\n", 414 | "train_data = load_data('rasa_dataset.json')" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 18, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [ 423 | "# Config Backend using Sklearn and Spacy\n", 424 | "trainer = Trainer(config.load(\"config_spacy.yaml\"))" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "#### Content on Config\n", 432 | " language: \"en\"\n", 433 | " pipeline: \"spacy_sklearn\"\n", 434 | "\n", 435 | " =======================\n", 436 | "\n", 437 | " language: \"en\"\n", 438 | "\n", 439 | " pipeline:\n", 440 | " - name: \"nlp_spacy\"\n", 441 | " - name: \"tokenizer_spacy\"\n", 442 | " - name: \"intent_entity_featurizer_regex\"\n", 443 | " - name: \"intent_featurizer_spacy\"\n", 444 | " - name: \"ner_crf\"\n", 445 | " - name: \"ner_synonyms\"\n", 446 | " - name: \"intent_classifier_sklearn\"" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 19, 452 | "metadata": {}, 453 | "outputs": [ 454 | { 455 | "name": "stdout", 456 | "output_type": "stream", 457 | "text": [ 458 | "Fitting 2 folds for each of 6 candidates, totalling 12 fits\n" 459 | ] 460 | }, 461 | { 462 | "name": "stderr", 463 | "output_type": "stream", 464 | "text": [ 465 | "[Parallel(n_jobs=1)]: Done 12 out of 12 | elapsed: 0.3s finished\n" 466 | ] 467 | }, 468 | { 469 | "data": { 470 | "text/plain": [ 471 | "" 472 | ] 473 | }, 474 | "execution_count": 19, 475 | "metadata": {}, 476 | "output_type": "execute_result" 477 | } 478 | ], 479 | "source": [ 480 | "# Training Data\n", 481 | "trainer.train(train_data)" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 20, 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [ 490 | "# Returns the directory the model is stored in (Creat a folder to store model in)\n", 491 | "model_directory = trainer.persist('/projects/')" 492 | ] 493 | }, 494 | { 495 | "cell_type": "markdown", 496 | "metadata": {}, 497 | "source": [ 498 | "#### Entity Extraction With SpaCy" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 22, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [ 507 | "import spacy\n", 508 | "nlp = spacy.load('en')" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 23, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "docx = nlp(u\"I am looking for an Italian Restaurant where I can eat\")" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 24, 523 | "metadata": {}, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "value Italian entity NORP start 20 end 27\n" 530 | ] 531 | } 532 | ], 533 | "source": [ 534 | "for word in docx.ents:\n", 535 | " 
print(\"value\",word.text,\"entity\",word.label_,\"start\",word.start_char,\"end\",word.end_char)" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "#### Making Predictions With Model\n", 543 | "+ Interpreter.load().parse()" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 21, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "from rasa_nlu.model import Metadata, Interpreter" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 25, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "# where `model_directory points to the folder the model is persisted in\n", 562 | "interpreter = Interpreter.load(model_directory)" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 26, 568 | "metadata": {}, 569 | "outputs": [ 570 | { 571 | "data": { 572 | "text/plain": [ 573 | "{'intent': {'name': 'restaurant_search', 'confidence': 0.7455215289019911},\n", 574 | " 'entities': [{'start': 20,\n", 575 | " 'end': 27,\n", 576 | " 'value': 'italian',\n", 577 | " 'entity': 'cuisine',\n", 578 | " 'confidence': 0.6636828413532201,\n", 579 | " 'extractor': 'ner_crf'}],\n", 580 | " 'intent_ranking': [{'name': 'restaurant_search',\n", 581 | " 'confidence': 0.7455215289019911},\n", 582 | " {'name': 'affirm', 'confidence': 0.15019642212447237},\n", 583 | " {'name': 'greet', 'confidence': 0.058736824115844515},\n", 584 | " {'name': 'goodbye', 'confidence': 0.045545224857692024}],\n", 585 | " 'text': 'I am looking for an Italian Restaurant where I can eat'}" 586 | ] 587 | }, 588 | "execution_count": 26, 589 | "metadata": {}, 590 | "output_type": "execute_result" 591 | } 592 | ], 593 | "source": [ 594 | "# Prediction of Intent\n", 595 | "interpreter.parse(u\"I am looking for an Italian Restaurant where I can eat\")" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 27, 601 | "metadata": {}, 602 | "outputs": [ 603 | { 604 | "data": { 605 | "text/plain": [ 606 | "{'intent': {'name': 'restaurant_search', 'confidence': 0.6874972430877329},\n", 607 | " 'entities': [{'start': 10,\n", 608 | " 'end': 17,\n", 609 | " 'value': 'african',\n", 610 | " 'entity': 'cuisine',\n", 611 | " 'confidence': 0.6470976966769572,\n", 612 | " 'extractor': 'ner_crf'}],\n", 613 | " 'intent_ranking': [{'name': 'restaurant_search',\n", 614 | " 'confidence': 0.6874972430877329},\n", 615 | " {'name': 'goodbye', 'confidence': 0.12400667696797882},\n", 616 | " {'name': 'affirm', 'confidence': 0.11357435021080386},\n", 617 | " {'name': 'greet', 'confidence': 0.07492172973348454}],\n", 618 | " 'text': 'I want an African Spot to eat'}" 619 | ] 620 | }, 621 | "execution_count": 27, 622 | "metadata": {}, 623 | "output_type": "execute_result" 624 | } 625 | ], 626 | "source": [ 627 | "interpreter.parse(u\"I want an African Spot to eat\")" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 28, 633 | "metadata": {}, 634 | "outputs": [ 635 | { 636 | "data": { 637 | "text/plain": [ 638 | "{'intent': {'name': 'greet', 'confidence': 0.44328419685532383},\n", 639 | " 'entities': [],\n", 640 | " 'intent_ranking': [{'name': 'greet', 'confidence': 0.44328419685532383},\n", 641 | " {'name': 'goodbye', 'confidence': 0.31245698090344237},\n", 642 | " {'name': 'affirm', 'confidence': 0.1257434275305043},\n", 643 | " {'name': 'restaurant_search', 'confidence': 0.11851539471072912}],\n", 644 | " 'text': 'Good morning World'}" 645 | ] 646 | }, 647 | "execution_count": 
28, 648 | "metadata": {}, 649 | "output_type": "execute_result" 650 | } 651 | ], 652 | "source": [ 653 | "interpreter.parse(u\"Good morning World\")" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": null, 659 | "metadata": {}, 660 | "outputs": [], 661 | "source": [] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "metadata": {}, 667 | "outputs": [], 668 | "source": [ 669 | "### Credits Rasa_nlu\n", 670 | "#### By Jesse JCharis\n", 671 | "#### Jesus Saves @ JCharisTec" 672 | ] 673 | } 674 | ], 675 | "metadata": { 676 | "kernelspec": { 677 | "display_name": "Python 3", 678 | "language": "python", 679 | "name": "python3" 680 | }, 681 | "language_info": { 682 | "codemirror_mode": { 683 | "name": "ipython", 684 | "version": 3 685 | }, 686 | "file_extension": ".py", 687 | "mimetype": "text/x-python", 688 | "name": "python", 689 | "nbconvert_exporter": "python", 690 | "pygments_lexer": "ipython3", 691 | "version": "3.5.2" 692 | } 693 | }, 694 | "nbformat": 4, 695 | "nbformat_minor": 2 696 | } 697 | -------------------------------------------------------------------------------- /Training the Named Entity Recognizer in SpaCy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Natural Language Processing With SpaCy\n", 8 | "![title](SpaCy_logo.png)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "#### Training the Named Entity Recognizer\n", 16 | "##### Updating our NER\n", 17 | "+ Load the model\n", 18 | " + spacy.load('en')\n", 19 | " - Disable existing pipe line (nlp.disable_pipes)\n", 20 | " + spacy.blank('en')\n", 21 | " - Added Entity Recognizer to Pipeline\n", 22 | "+ Shuffle and loop over the examples\n", 23 | " - update the model (nlp.update)\n", 24 | "+ Save the trained model (nlp.to_disk)\n", 25 | "+ Test" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# Load Packages\n", 35 | "from __future__ import unicode_literals, print_function\n", 36 | "\n", 37 | "import plac # wrapper over argparse\n", 38 | "import random\n", 39 | "from pathlib import Path\n", 40 | "import spacy\n", 41 | "from tqdm import tqdm # loading bar" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "nlp1 = spacy.load('en')" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "docx1 = nlp1(u\"Who was Kofi Annan?\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "for token in docx1.ents:\n", 69 | " print(token.text,token.start_char, token.end_char,token.label_)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "docx2 = nlp1(u\"Who is Steve Jobs?\")" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "for token in docx2.ents:\n", 88 | " print(token.text,token.start_char, token.end_char,token.label_)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | 
"docx3 = nlp1(u\"Who is Shaka Khan?\")" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "# training data\n", 107 | "TRAIN_DATA = [\n", 108 | " ('Who is Kofi Annan?', {\n", 109 | " 'entities': [(8, 18, 'PERSON')]\n", 110 | " }),\n", 111 | " ('Who is Steve Jobs?', {\n", 112 | " 'entities': [(7, 17, 'PERSON')]\n", 113 | " }),\n", 114 | " ('I like London and Berlin.', {\n", 115 | " 'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]\n", 116 | " })\n", 117 | "]" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "## plac is wrapper for argparser \n", 127 | "@plac.annotations(\n", 128 | " model=(\"Model name. Defaults to blank 'en' model.\", \"option\", \"m\", str),\n", 129 | " output_dir=(\"C:\\Users\\This PC\\Documents\\JLabs\\JFlow\", \"option\", \"o\", Path),\n", 130 | " n_iter=(\"Number of training iterations\", \"option\", \"n\", int))" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "# Define our variables\n", 140 | "model = None\n", 141 | "output_dir=Path(\"C:\\\\Users\\\\This PC\\\\Documents\\\\JLabs\\\\JFlow\")\n", 142 | "n_iter=100" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "#### Load the model" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "if model is not None:\n", 159 | " nlp = spacy.load(model) # load existing spaCy model\n", 160 | " print(\"Loaded model '%s'\" % model)\n", 161 | "else:\n", 162 | " nlp = spacy.blank('en') # create blank Language class\n", 163 | " print(\"Created blank 'en' model\")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "#### Set Up the Pipeline" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "# create the built-in pipeline components and add them to the pipeline\n", 180 | " # nlp.create_pipe works for built-ins that are registered with spaCy\n", 181 | "if 'ner' not in nlp.pipe_names:\n", 182 | " ner = nlp.create_pipe('ner')\n", 183 | " nlp.add_pipe(ner, last=True)\n", 184 | "# otherwise, get it so we can add labels\n", 185 | "else:\n", 186 | " ner = nlp.get_pipe('ner')" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "#### Train the Recognizer\n", 194 | "+ Add labels,Annotate them\n", 195 | "+ Pipes\n", 196 | "+ Begin_training()" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "\n", 206 | " # add labels\n", 207 | "for _, annotations in TRAIN_DATA:\n", 208 | " for ent in annotations.get('entities'):\n", 209 | " ner.add_label(ent[2])\n", 210 | "\n", 211 | " # get names of other pipes to disable them during training\n", 212 | "other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']\n", 213 | "with nlp.disable_pipes(*other_pipes): # only train NER\n", 214 | " optimizer = nlp.begin_training()\n", 215 | " for itn in range(n_iter):\n", 216 | " random.shuffle(TRAIN_DATA)\n", 217 | " losses = {}\n", 218 | " for text, annotations in tqdm(TRAIN_DATA):\n", 219 | " nlp.update(\n", 220 | " [text], # 
batch of texts\n", 221 | " [annotations], # batch of annotations\n", 222 | " drop=0.5, # dropout - make it harder to memorise data\n", 223 | " sgd=optimizer, # callable to update weights\n", 224 | " losses=losses)\n", 225 | " print(losses)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "#### Test the trained model" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "# test the trained model\n", 242 | "for text, _ in TRAIN_DATA:\n", 243 | " doc = nlp(text)\n", 244 | " print('Entities', [(ent.text, ent.label_) for ent in doc.ents])\n", 245 | " print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])\n" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "#### Save the Model" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "# save model to output directory\n", 262 | "if output_dir is not None:\n", 263 | " output_dir = Path(output_dir)\n", 264 | " if not output_dir.exists():\n", 265 | " output_dir.mkdir()\n", 266 | " nlp.to_disk(output_dir)\n", 267 | " print(\"Saved model to\", output_dir)\n", 268 | "\n", 269 | " " 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "#### Test The Saved Model\n", 277 | "+ NB Output Directory" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "# test the saved model\n", 287 | "print(\"Loading from\", output_dir)\n", 288 | "nlp2 = spacy.load(output_dir)\n", 289 | "for text, _ in TRAIN_DATA:\n", 290 | " doc = nlp2(text)\n", 291 | " print('Entities', [(ent.text, ent.label_) for ent in doc.ents])\n", 292 | " print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "### Adding Additional Entity Types\n" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "### Natural Language Processing With SpaCy\n", 307 | "![title](SpaCy_logo.png)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "#### Training the Named Entity Recognizer (NER)\n", 315 | "##### Adding An Additional Entity (NER)\n", 316 | "+ Load the model\n", 317 | " + spacy.load('en')\n", 318 | " - Disable existing pipe line (nlp.disable_pipes)\n", 319 | " + spacy.blank('en')\n", 320 | " - Added Entity Recognizer to Pipeline\n", 321 | "+ Add a Label eg(ner.add_label(LABEL) & (nlp.begin_training())\n", 322 | "+ Shuffle and loop over the examples\n", 323 | " - update the model (nlp.update)\n", 324 | "+ Save the trained model (nlp.to_disk)\n", 325 | "+ Test" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 78, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "from __future__ import unicode_literals, print_function\n", 335 | "\n", 336 | "import plac\n", 337 | "import random\n", 338 | "from pathlib import Path\n", 339 | "import spacy" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 79, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "# new entity label\n", 349 | "LABEL = 'ANIMAL'" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 80, 
355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "TRAIN_DATA = [\n", 359 | " (\"Horses are too tall and they pretend to care about your feelings\", {\n", 360 | " 'entities': [(0, 6, 'ANIMAL')]\n", 361 | " }),\n", 362 | "\n", 363 | " (\"Do they bite?\", {\n", 364 | " 'entities': []\n", 365 | " }),\n", 366 | "\n", 367 | " (\"horses are too tall and they pretend to care about your feelings\", {\n", 368 | " 'entities': [(0, 6, 'ANIMAL')]\n", 369 | " }),\n", 370 | "\n", 371 | " (\"horses pretend to care about your feelings\", {\n", 372 | " 'entities': [(0, 6, 'ANIMAL')]\n", 373 | " }),\n", 374 | "\n", 375 | " (\"they pretend to care about your feelings, those horses\", {\n", 376 | " 'entities': [(48, 54, 'ANIMAL')]\n", 377 | " }),\n", 378 | "\n", 379 | " (\"horses?\", {\n", 380 | " 'entities': [(0, 6, 'ANIMAL')]\n", 381 | " })\n", 382 | "]" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 82, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "\n", 392 | "@plac.annotations(\n", 393 | " model=(\"Model name. Defaults to blank 'en' model.\", \"option\", \"m\", str),\n", 394 | " new_model_name=(\"New model name for model meta.\", \"option\", \"nm\", str),\n", 395 | " output_dir=(\"Optional output directory\", \"option\", \"o\", Path),\n", 396 | " n_iter=(\"Number of training iterations\", \"option\", \"n\", int))\n", 397 | "\n", 398 | "\n", 399 | "def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):\n", 400 | " \"\"\"Set up the pipeline and entity recognizer, and train the new entity.\"\"\"\n", 401 | " if model is not None:\n", 402 | " nlp = spacy.load(model) # load existing spaCy model\n", 403 | " print(\"Loaded model '%s'\" % model)\n", 404 | " else:\n", 405 | " nlp = spacy.blank('en') # create blank Language class\n", 406 | " print(\"Created blank 'en' model\")\n", 407 | " # Add entity recognizer to model if it's not in the pipeline\n", 408 | " # nlp.create_pipe works for built-ins that are registered with spaCy\n", 409 | " if 'ner' not in nlp.pipe_names:\n", 410 | " ner = nlp.create_pipe('ner')\n", 411 | " nlp.add_pipe(ner)\n", 412 | " # otherwise, get it, so we can add labels to it\n", 413 | " else:\n", 414 | " ner = nlp.get_pipe('ner')\n", 415 | "\n", 416 | " ner.add_label(LABEL) # add new entity label to entity recognizer\n", 417 | " if model is None:\n", 418 | " optimizer = nlp.begin_training()\n", 419 | " else:\n", 420 | " # Note that 'begin_training' initializes the models, so it'll zero out\n", 421 | " # existing entity types.\n", 422 | " optimizer = nlp.entity.create_optimizer()\n", 423 | "\n", 424 | " # get names of other pipes to disable them during training\n", 425 | " other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']\n", 426 | " with nlp.disable_pipes(*other_pipes): # only train NER\n", 427 | " for itn in range(n_iter):\n", 428 | " random.shuffle(TRAIN_DATA)\n", 429 | " losses = {}\n", 430 | " for text, annotations in tqdm(TRAIN_DATA):\n", 431 | " nlp.update([text], [annotations], sgd=optimizer, drop=0.35,\n", 432 | " losses=losses)\n", 433 | " print(losses)\n", 434 | "\n", 435 | " # test the trained model\n", 436 | " test_text = 'Do you like horses?'\n", 437 | " doc = nlp(test_text)\n", 438 | " print(\"Entities in '%s'\" % test_text)\n", 439 | " for ent in doc.ents:\n", 440 | " print(ent.label_, ent.text)\n", 441 | "\n", 442 | " # save model to output directory\n", 443 | " if output_dir is not None:\n", 444 | " output_dir = Path(output_dir)\n", 445 | " if not output_dir.exists():\n", 
446 | " output_dir.mkdir()\n", 447 | " nlp.meta['name'] = new_model_name # rename model\n", 448 | " nlp.to_disk(output_dir)\n", 449 | " print(\"Saved model to\", output_dir)\n", 450 | "\n", 451 | " # test the saved model\n", 452 | " print(\"Loading from\", output_dir)\n", 453 | " nlp2 = spacy.load(output_dir)\n", 454 | " doc2 = nlp2(test_text)\n", 455 | " for ent in doc2.ents:\n", 456 | " print(ent.label_, ent.text)\n", 457 | "\n", 458 | "\n", 459 | "# if __name__ == '__main__':\n", 460 | "# plac.call(main)" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 83, 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "name": "stdout", 470 | "output_type": "stream", 471 | "text": [ 472 | "Created blank 'en' model\n", 473 | "Warning: Unnamed vectors -- this won't allow multiple vectors models to be loaded. (Shape: (0, 0))\n" 474 | ] 475 | }, 476 | { 477 | "name": "stderr", 478 | "output_type": "stream", 479 | "text": [ 480 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:07<00:00, 1.22s/it]\n" 481 | ] 482 | }, 483 | { 484 | "name": "stdout", 485 | "output_type": "stream", 486 | "text": [ 487 | "{'ner': 26.770396717498016}\n" 488 | ] 489 | }, 490 | { 491 | "name": "stderr", 492 | "output_type": "stream", 493 | "text": [ 494 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:06<00:00, 1.02s/it]\n" 495 | ] 496 | }, 497 | { 498 | "name": "stdout", 499 | "output_type": "stream", 500 | "text": [ 501 | "{'ner': 8.593518038099443}\n" 502 | ] 503 | }, 504 | { 505 | "name": "stderr", 506 | "output_type": "stream", 507 | "text": [ 508 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" 509 | ] 510 | }, 511 | { 512 | "name": "stdout", 513 | "output_type": "stream", 514 | "text": [ 515 | "{'ner': 4.161424036550985}\n" 516 | ] 517 | }, 518 | { 519 | "name": "stderr", 520 | "output_type": "stream", 521 | "text": [ 522 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" 523 | ] 524 | }, 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "{'ner': 3.8918851538918418}\n" 530 | ] 531 | }, 532 | { 533 | "name": "stderr", 534 | "output_type": "stream", 535 | "text": [ 536 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.30it/s]\n" 537 | ] 538 | }, 539 | { 540 | "name": "stdout", 541 | "output_type": "stream", 542 | "text": [ 543 | "{'ner': 2.01546711932046}\n" 544 | ] 545 | }, 546 | { 547 | "name": "stderr", 548 | "output_type": "stream", 549 | "text": [ 550 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.31it/s]\n" 551 | ] 552 | }, 553 | { 554 | "name": "stdout", 555 | "output_type": "stream", 556 | "text": [ 557 | "{'ner': 0.000131435854561013}\n" 558 | ] 559 | }, 560 | { 561 | "name": "stderr", 562 | "output_type": "stream", 563 | "text": [ 564 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.32it/s]\n" 565 | ] 566 | }, 567 | { 568 | "name": "stdout", 569 | "output_type": "stream", 570 | "text": [ 571 | "{'ner': 1.3692610842225425e-07}\n" 572 | ] 573 | }, 574 | { 575 | "name": "stderr", 576 | "output_type": "stream", 577 | "text": [ 578 | 
"100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.08it/s]\n" 579 | ] 580 | }, 581 | { 582 | "name": "stdout", 583 | "output_type": "stream", 584 | "text": [ 585 | "{'ner': 0.019683124967466954}\n" 586 | ] 587 | }, 588 | { 589 | "name": "stderr", 590 | "output_type": "stream", 591 | "text": [ 592 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.07it/s]\n" 593 | ] 594 | }, 595 | { 596 | "name": "stdout", 597 | "output_type": "stream", 598 | "text": [ 599 | "{'ner': 2.078213820644416e-12}\n" 600 | ] 601 | }, 602 | { 603 | "name": "stderr", 604 | "output_type": "stream", 605 | "text": [ 606 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.11it/s]\n" 607 | ] 608 | }, 609 | { 610 | "name": "stdout", 611 | "output_type": "stream", 612 | "text": [ 613 | "{'ner': 1.5424355623930257e-05}\n" 614 | ] 615 | }, 616 | { 617 | "name": "stderr", 618 | "output_type": "stream", 619 | "text": [ 620 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" 621 | ] 622 | }, 623 | { 624 | "name": "stdout", 625 | "output_type": "stream", 626 | "text": [ 627 | "{'ner': 0.34855798227363266}\n" 628 | ] 629 | }, 630 | { 631 | "name": "stderr", 632 | "output_type": "stream", 633 | "text": [ 634 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" 635 | ] 636 | }, 637 | { 638 | "name": "stdout", 639 | "output_type": "stream", 640 | "text": [ 641 | "{'ner': 1.2020330928745637e-21}\n" 642 | ] 643 | }, 644 | { 645 | "name": "stderr", 646 | "output_type": "stream", 647 | "text": [ 648 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.23it/s]\n" 649 | ] 650 | }, 651 | { 652 | "name": "stdout", 653 | "output_type": "stream", 654 | "text": [ 655 | "{'ner': 1.1364459848434984e-19}\n" 656 | ] 657 | }, 658 | { 659 | "name": "stderr", 660 | "output_type": "stream", 661 | "text": [ 662 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.01it/s]\n" 663 | ] 664 | }, 665 | { 666 | "name": "stdout", 667 | "output_type": "stream", 668 | "text": [ 669 | "{'ner': 5.07038899221475e-16}\n" 670 | ] 671 | }, 672 | { 673 | "name": "stderr", 674 | "output_type": "stream", 675 | "text": [ 676 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.07it/s]\n" 677 | ] 678 | }, 679 | { 680 | "name": "stdout", 681 | "output_type": "stream", 682 | "text": [ 683 | "{'ner': 7.756965635961777e-18}\n" 684 | ] 685 | }, 686 | { 687 | "name": "stderr", 688 | "output_type": "stream", 689 | "text": [ 690 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.21it/s]\n" 691 | ] 692 | }, 693 | { 694 | "name": "stdout", 695 | "output_type": "stream", 696 | "text": [ 697 | "{'ner': 4.682540175328388e-13}\n" 698 | ] 699 | }, 700 | { 701 | "name": "stderr", 702 | "output_type": "stream", 703 | "text": [ 704 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.17it/s]\n" 705 | ] 706 | }, 707 | { 708 | "name": "stdout", 709 | "output_type": "stream", 710 | "text": [ 711 | "{'ner': 4.9982126736537605e-14}\n" 712 | ] 713 | }, 
714 | { 715 | "name": "stderr", 716 | "output_type": "stream", 717 | "text": [ 718 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.15it/s]\n" 719 | ] 720 | }, 721 | { 722 | "name": "stdout", 723 | "output_type": "stream", 724 | "text": [ 725 | "{'ner': 5.766438963914882e-17}\n" 726 | ] 727 | }, 728 | { 729 | "name": "stderr", 730 | "output_type": "stream", 731 | "text": [ 732 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.25it/s]\n" 733 | ] 734 | }, 735 | { 736 | "name": "stdout", 737 | "output_type": "stream", 738 | "text": [ 739 | "{'ner': 4.4997379863434744e-20}\n" 740 | ] 741 | }, 742 | { 743 | "name": "stderr", 744 | "output_type": "stream", 745 | "text": [ 746 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" 747 | ] 748 | }, 749 | { 750 | "name": "stdout", 751 | "output_type": "stream", 752 | "text": [ 753 | "{'ner': 1.4565571602945852e-16}\n", 754 | "Entities in 'Do you like horses?'\n", 755 | "ANIMAL horses\n" 756 | ] 757 | } 758 | ], 759 | "source": [ 760 | "# Run our Function\n", 761 | "main()" 762 | ] 763 | }, 764 | { 765 | "cell_type": "code", 766 | "execution_count": null, 767 | "metadata": {}, 768 | "outputs": [], 769 | "source": [ 770 | "# Our model was able to recognize horses as ANIMAL" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": null, 776 | "metadata": {}, 777 | "outputs": [], 778 | "source": [] 779 | } 780 | ], 781 | "metadata": { 782 | "kernelspec": { 783 | "display_name": "Python 3", 784 | "language": "python", 785 | "name": "python3" 786 | }, 787 | "language_info": { 788 | "codemirror_mode": { 789 | "name": "ipython", 790 | "version": 3 791 | }, 792 | "file_extension": ".py", 793 | "mimetype": "text/x-python", 794 | "name": "python", 795 | "nbconvert_exporter": "python", 796 | "pygments_lexer": "ipython3", 797 | "version": "3.6.6" 798 | } 799 | }, 800 | "nbformat": 4, 801 | "nbformat_minor": 2 802 | } 803 | -------------------------------------------------------------------------------- /NLP_with_SpaCy/Training the Named Entity Recognizer in SpaCy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Natural Language Processing With SpaCy\n", 8 | "![title](SpaCy_logo.png)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "#### Training the Named Entity Recognizer\n", 16 | "##### Updating our NER\n", 17 | "+ Load the model\n", 18 | " + spacy.load('en')\n", 19 | " - Disable existing pipe line (nlp.disable_pipes)\n", 20 | " + spacy.blank('en')\n", 21 | " - Added Entity Recognizer to Pipeline\n", 22 | "+ Shuffle and loop over the examples\n", 23 | " - update the model (nlp.update)\n", 24 | "+ Save the trained model (nlp.to_disk)\n", 25 | "+ Test" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# Load Packages\n", 35 | "from __future__ import unicode_literals, print_function\n", 36 | "\n", 37 | "import plac # wrapper over argparse\n", 38 | "import random\n", 39 | "from pathlib import Path\n", 40 | "import spacy\n", 41 | "from tqdm import tqdm # loading bar" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | 
"outputs": [], 49 | "source": [ 50 | "nlp1 = spacy.load('en')" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "docx1 = nlp1(u\"Who was Kofi Annan?\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "for token in docx1.ents:\n", 69 | " print(token.text,token.start_char, token.end_char,token.label_)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "docx2 = nlp1(u\"Who is Steve Jobs?\")" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "for token in docx2.ents:\n", 88 | " print(token.text,token.start_char, token.end_char,token.label_)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "docx3 = nlp1(u\"Who is Shaka Khan?\")" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "# training data\n", 107 | "TRAIN_DATA = [\n", 108 | " ('Who is Kofi Annan?', {\n", 109 | " 'entities': [(8, 18, 'PERSON')]\n", 110 | " }),\n", 111 | " ('Who is Steve Jobs?', {\n", 112 | " 'entities': [(7, 17, 'PERSON')]\n", 113 | " }),\n", 114 | " ('I like London and Berlin.', {\n", 115 | " 'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]\n", 116 | " })\n", 117 | "]" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "## plac is wrapper for argparser \n", 127 | "@plac.annotations(\n", 128 | " model=(\"Model name. 
Defaults to blank 'en' model.\", \"option\", \"m\", str),\n", 129 | " output_dir=(\"C:\\Users\\This PC\\Documents\\JLabs\\JFlow\", \"option\", \"o\", Path),\n", 130 | " n_iter=(\"Number of training iterations\", \"option\", \"n\", int))" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "# Define our variables\n", 140 | "model = None\n", 141 | "output_dir=Path(\"C:\\\\Users\\\\This PC\\\\Documents\\\\JLabs\\\\JFlow\")\n", 142 | "n_iter=100" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "#### Load the model" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "if model is not None:\n", 159 | " nlp = spacy.load(model) # load existing spaCy model\n", 160 | " print(\"Loaded model '%s'\" % model)\n", 161 | "else:\n", 162 | " nlp = spacy.blank('en') # create blank Language class\n", 163 | " print(\"Created blank 'en' model\")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "#### Set Up the Pipeline" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "# create the built-in pipeline components and add them to the pipeline\n", 180 | " # nlp.create_pipe works for built-ins that are registered with spaCy\n", 181 | "if 'ner' not in nlp.pipe_names:\n", 182 | " ner = nlp.create_pipe('ner')\n", 183 | " nlp.add_pipe(ner, last=True)\n", 184 | "# otherwise, get it so we can add labels\n", 185 | "else:\n", 186 | " ner = nlp.get_pipe('ner')" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "#### Train the Recognizer\n", 194 | "+ Add labels,Annotate them\n", 195 | "+ Pipes\n", 196 | "+ Begin_training()" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "\n", 206 | " # add labels\n", 207 | "for _, annotations in TRAIN_DATA:\n", 208 | " for ent in annotations.get('entities'):\n", 209 | " ner.add_label(ent[2])\n", 210 | "\n", 211 | " # get names of other pipes to disable them during training\n", 212 | "other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']\n", 213 | "with nlp.disable_pipes(*other_pipes): # only train NER\n", 214 | " optimizer = nlp.begin_training()\n", 215 | " for itn in range(n_iter):\n", 216 | " random.shuffle(TRAIN_DATA)\n", 217 | " losses = {}\n", 218 | " for text, annotations in tqdm(TRAIN_DATA):\n", 219 | " nlp.update(\n", 220 | " [text], # batch of texts\n", 221 | " [annotations], # batch of annotations\n", 222 | " drop=0.5, # dropout - make it harder to memorise data\n", 223 | " sgd=optimizer, # callable to update weights\n", 224 | " losses=losses)\n", 225 | " print(losses)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "#### Test the trained model" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "# test the trained model\n", 242 | "for text, _ in TRAIN_DATA:\n", 243 | " doc = nlp(text)\n", 244 | " print('Entities', [(ent.text, ent.label_) for ent in doc.ents])\n", 245 | " print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])\n" 246 | ] 247 | }, 248 | { 249 | 
"cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "#### Save the Model" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "# save model to output directory\n", 262 | "if output_dir is not None:\n", 263 | " output_dir = Path(output_dir)\n", 264 | " if not output_dir.exists():\n", 265 | " output_dir.mkdir()\n", 266 | " nlp.to_disk(output_dir)\n", 267 | " print(\"Saved model to\", output_dir)\n", 268 | "\n", 269 | " " 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "#### Test The Saved Model\n", 277 | "+ NB Output Directory" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "# test the saved model\n", 287 | "print(\"Loading from\", output_dir)\n", 288 | "nlp2 = spacy.load(output_dir)\n", 289 | "for text, _ in TRAIN_DATA:\n", 290 | " doc = nlp2(text)\n", 291 | " print('Entities', [(ent.text, ent.label_) for ent in doc.ents])\n", 292 | " print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "### Adding Additional Entity Types\n" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "### Natural Language Processing With SpaCy\n", 307 | "![title](SpaCy_logo.png)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "#### Training the Named Entity Recognizer (NER)\n", 315 | "##### Adding An Additional Entity (NER)\n", 316 | "+ Load the model\n", 317 | " + spacy.load('en')\n", 318 | " - Disable existing pipe line (nlp.disable_pipes)\n", 319 | " + spacy.blank('en')\n", 320 | " - Added Entity Recognizer to Pipeline\n", 321 | "+ Add a Label eg(ner.add_label(LABEL) & (nlp.begin_training())\n", 322 | "+ Shuffle and loop over the examples\n", 323 | " - update the model (nlp.update)\n", 324 | "+ Save the trained model (nlp.to_disk)\n", 325 | "+ Test" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 78, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "from __future__ import unicode_literals, print_function\n", 335 | "\n", 336 | "import plac\n", 337 | "import random\n", 338 | "from pathlib import Path\n", 339 | "import spacy" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 79, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "# new entity label\n", 349 | "LABEL = 'ANIMAL'" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 80, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "TRAIN_DATA = [\n", 359 | " (\"Horses are too tall and they pretend to care about your feelings\", {\n", 360 | " 'entities': [(0, 6, 'ANIMAL')]\n", 361 | " }),\n", 362 | "\n", 363 | " (\"Do they bite?\", {\n", 364 | " 'entities': []\n", 365 | " }),\n", 366 | "\n", 367 | " (\"horses are too tall and they pretend to care about your feelings\", {\n", 368 | " 'entities': [(0, 6, 'ANIMAL')]\n", 369 | " }),\n", 370 | "\n", 371 | " (\"horses pretend to care about your feelings\", {\n", 372 | " 'entities': [(0, 6, 'ANIMAL')]\n", 373 | " }),\n", 374 | "\n", 375 | " (\"they pretend to care about your feelings, those horses\", {\n", 376 | " 'entities': [(48, 54, 'ANIMAL')]\n", 377 | " }),\n", 378 | "\n", 379 | " (\"horses?\", {\n", 380 
| " 'entities': [(0, 6, 'ANIMAL')]\n", 381 | " })\n", 382 | "]" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 82, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "\n", 392 | "@plac.annotations(\n", 393 | " model=(\"Model name. Defaults to blank 'en' model.\", \"option\", \"m\", str),\n", 394 | " new_model_name=(\"New model name for model meta.\", \"option\", \"nm\", str),\n", 395 | " output_dir=(\"Optional output directory\", \"option\", \"o\", Path),\n", 396 | " n_iter=(\"Number of training iterations\", \"option\", \"n\", int))\n", 397 | "\n", 398 | "\n", 399 | "def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):\n", 400 | " \"\"\"Set up the pipeline and entity recognizer, and train the new entity.\"\"\"\n", 401 | " if model is not None:\n", 402 | " nlp = spacy.load(model) # load existing spaCy model\n", 403 | " print(\"Loaded model '%s'\" % model)\n", 404 | " else:\n", 405 | " nlp = spacy.blank('en') # create blank Language class\n", 406 | " print(\"Created blank 'en' model\")\n", 407 | " # Add entity recognizer to model if it's not in the pipeline\n", 408 | " # nlp.create_pipe works for built-ins that are registered with spaCy\n", 409 | " if 'ner' not in nlp.pipe_names:\n", 410 | " ner = nlp.create_pipe('ner')\n", 411 | " nlp.add_pipe(ner)\n", 412 | " # otherwise, get it, so we can add labels to it\n", 413 | " else:\n", 414 | " ner = nlp.get_pipe('ner')\n", 415 | "\n", 416 | " ner.add_label(LABEL) # add new entity label to entity recognizer\n", 417 | " if model is None:\n", 418 | " optimizer = nlp.begin_training()\n", 419 | " else:\n", 420 | " # Note that 'begin_training' initializes the models, so it'll zero out\n", 421 | " # existing entity types.\n", 422 | " optimizer = nlp.entity.create_optimizer()\n", 423 | "\n", 424 | " # get names of other pipes to disable them during training\n", 425 | " other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']\n", 426 | " with nlp.disable_pipes(*other_pipes): # only train NER\n", 427 | " for itn in range(n_iter):\n", 428 | " random.shuffle(TRAIN_DATA)\n", 429 | " losses = {}\n", 430 | " for text, annotations in tqdm(TRAIN_DATA):\n", 431 | " nlp.update([text], [annotations], sgd=optimizer, drop=0.35,\n", 432 | " losses=losses)\n", 433 | " print(losses)\n", 434 | "\n", 435 | " # test the trained model\n", 436 | " test_text = 'Do you like horses?'\n", 437 | " doc = nlp(test_text)\n", 438 | " print(\"Entities in '%s'\" % test_text)\n", 439 | " for ent in doc.ents:\n", 440 | " print(ent.label_, ent.text)\n", 441 | "\n", 442 | " # save model to output directory\n", 443 | " if output_dir is not None:\n", 444 | " output_dir = Path(output_dir)\n", 445 | " if not output_dir.exists():\n", 446 | " output_dir.mkdir()\n", 447 | " nlp.meta['name'] = new_model_name # rename model\n", 448 | " nlp.to_disk(output_dir)\n", 449 | " print(\"Saved model to\", output_dir)\n", 450 | "\n", 451 | " # test the saved model\n", 452 | " print(\"Loading from\", output_dir)\n", 453 | " nlp2 = spacy.load(output_dir)\n", 454 | " doc2 = nlp2(test_text)\n", 455 | " for ent in doc2.ents:\n", 456 | " print(ent.label_, ent.text)\n", 457 | "\n", 458 | "\n", 459 | "# if __name__ == '__main__':\n", 460 | "# plac.call(main)" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 83, 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "name": "stdout", 470 | "output_type": "stream", 471 | "text": [ 472 | "Created blank 'en' model\n", 473 | "Warning: Unnamed vectors -- this won't allow 
multiple vectors models to be loaded. (Shape: (0, 0))

    Training output (the stderr stream shows a tqdm progress bar of 6/6 training
    examples per pass, repeated for each iteration; the stdout stream prints the
    losses dict once per iteration, with the NER loss falling from about 26.77
    toward zero over 20 iterations):

    {'ner': 26.770396717498016}
    {'ner': 8.593518038099443}
    {'ner': 4.161424036550985}
    {'ner': 3.8918851538918418}
    {'ner': 2.01546711932046}
    {'ner': 0.000131435854561013}
    {'ner': 1.3692610842225425e-07}
    {'ner': 0.019683124967466954}
    {'ner': 2.078213820644416e-12}
    {'ner': 1.5424355623930257e-05}
    {'ner': 0.34855798227363266}
    {'ner': 1.2020330928745637e-21}
    {'ner': 1.1364459848434984e-19}
    {'ner': 5.07038899221475e-16}
    {'ner': 7.756965635961777e-18}
    {'ner': 4.682540175328388e-13}
    {'ner': 4.9982126736537605e-14}
    {'ner': 5.766438963914882e-17}
    {'ner': 4.4997379863434744e-20}
    {'ner': 1.4565571602945852e-16}

    Final test output:

    Entities in 'Do you like horses?'
    ANIMAL horses

    Remaining code cells:

    # Run our Function
    main()

    # Our model was able to recognize horses as ANIMAL

    Notebook metadata: Python 3 kernel (Python 3.6.6), nbformat 4, nbformat_minor 2.
--------------------------------------------------------------------------------