├── img ├── re.png ├── vs.png ├── 3 PAV.pdf ├── hood.png ├── span.png ├── span2.png ├── table.png ├── token.png ├── builtin.png ├── models.png ├── components.png ├── tokenizer.png ├── trainning1.png ├── components2.png ├── data-struct.png ├── dep_example.png └── whathappens.png ├── example ├── tokenizer └── meta.json ├── exercises ├── country_text.txt ├── iphone.json ├── gadgets.json ├── tweets.json ├── bookquotes.json ├── countries.json └── capitals.json ├── requirements.txt ├── README.md ├── Chapter_04.ipynb ├── Chapter_03.ipynb └── Chapter_01.ipynb /img/re.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/re.png -------------------------------------------------------------------------------- /img/vs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/vs.png -------------------------------------------------------------------------------- /img/3 PAV.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/3 PAV.pdf -------------------------------------------------------------------------------- /img/hood.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/hood.png -------------------------------------------------------------------------------- /img/span.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/span.png -------------------------------------------------------------------------------- /img/span2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/span2.png -------------------------------------------------------------------------------- /img/table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/table.png -------------------------------------------------------------------------------- /img/token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/token.png -------------------------------------------------------------------------------- /img/builtin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/builtin.png -------------------------------------------------------------------------------- /img/models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/models.png -------------------------------------------------------------------------------- /example/tokenizer: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/example/tokenizer -------------------------------------------------------------------------------- /img/components.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/components.png -------------------------------------------------------------------------------- /img/tokenizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/tokenizer.png -------------------------------------------------------------------------------- /img/trainning1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/trainning1.png -------------------------------------------------------------------------------- /img/components2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/components2.png -------------------------------------------------------------------------------- /img/data-struct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/data-struct.png -------------------------------------------------------------------------------- /img/dep_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/dep_example.png -------------------------------------------------------------------------------- /img/whathappens.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/img/whathappens.png -------------------------------------------------------------------------------- /exercises/country_text.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cristianasp/spacy-notebook/HEAD/exercises/country_text.txt -------------------------------------------------------------------------------- /exercises/iphone.json: -------------------------------------------------------------------------------- 1 | [ 2 | "How to preorder the iPhone X", 3 | "iPhone X is coming", 4 | "Should I pay $1,000 for the iPhone X?", 5 | "The iPhone 8 reviews are here", 6 | "Your iPhone goes up to 11 today", 7 | "I need a new phone! Any tips?" 
8 | ] -------------------------------------------------------------------------------- /example/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"en", 3 | "name":"model", 4 | "version":"0.0.0", 5 | "spacy_version":">=2.0.16", 6 | "description":"", 7 | "author":"", 8 | "email":"", 9 | "url":"", 10 | "license":"", 11 | "vectors":{ 12 | "width":0, 13 | "vectors":0, 14 | "keys":0, 15 | "name":null 16 | }, 17 | "pipeline":[ 18 | 19 | ] 20 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | spacy==2.1.3 2 | 3 | https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0 4 | https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz#egg=en_core_web_md==2.1.0 5 | https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.1.0/pt_core_news_sm-2.1.0.tar.gz#egg=pt_core_news_sm-2.1.0 -------------------------------------------------------------------------------- /exercises/gadgets.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["How to preorder the iPhone X", { "entities": [[20, 28, "GADGET"]] }], 3 | ["iPhone X is coming", { "entities": [[0, 8, "GADGET"]] }], 4 | ["Should I pay $1,000 for the iPhone X?", { "entities": [[28, 36, "GADGET"]] }], 5 | ["The iPhone 8 reviews are here", { "entities": [[4, 12, "GADGET"]] }], 6 | ["Your iPhone goes up to 11 today", { "entities": [[5, 11, "GADGET"]] }], 7 | ["I need a new phone! Any tips?", { "entities": [] }] 8 | ] -------------------------------------------------------------------------------- /exercises/tweets.json: -------------------------------------------------------------------------------- 1 | [ 2 | "McDonalds is my favorite restaurant.", 3 | "Here I thought @McDonalds only had precooked burgers but it seems they only have not cooked ones?? I have no time to get sick..", 4 | "People really still eat McDonalds :(", 5 | "The McDonalds in Spain has chicken wings. My heart is so happy ", 6 | "@McDonalds Please bring back the most delicious fast food sandwich of all times!!....The Arch Deluxe :P", 7 | "please hurry and open. I WANT A #McRib SANDWICH SO BAD! 
:D", 8 | "This morning i made a terrible decision by gettin mcdonalds and now my stomach is payin for it" 9 | ] -------------------------------------------------------------------------------- /exercises/bookquotes.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.", 4 | { "author": "Franz Kafka", "book": "Metamorphosis" } 5 | ], 6 | [ 7 | "I know not all that may be coming, but be it what it will, I'll go to it laughing.", 8 | { "author": "Herman Melville", "book": "Moby-Dick or, The Whale" } 9 | ], 10 | [ 11 | "It was the best of times, it was the worst of times.", 12 | { "author": "Charles Dickens", "book": "A Tale of Two Cities" } 13 | ], 14 | [ 15 | "The only people for me are the mad ones, the ones who are mad to live, mad to talk, mad to be saved, desirous of everything at the same time, the ones who never yawn or say a commonplace thing, but burn, burn, burn like fabulous yellow roman candles exploding like spiders across the stars.", 16 | { "author": "Jack Kerouac", "book": "On the Road" } 17 | ], 18 | [ 19 | "It was a bright cold day in April, and the clocks were striking thirteen.", 20 | { "author": "George Orwell", "book": "1984" } 21 | ], 22 | [ 23 | "Nowadays people know the price of everything and the value of nothing.", 24 | { "author": "Oscar Wilde", "book": "The Picture Of Dorian Gray" } 25 | ] 26 | ] 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Notebook about NLP Spacy course 2 | 3 | ## About this notebook 4 | 5 | This notebook is based on Ines Montani's free online course, available at: 6 | 7 | https://course.spacy.io/ 8 | 9 | https://github.com/ines/spacy-course 10 | 11 | ## How to install spacy: 12 | 13 | The instructions are in the link below : 14 | 15 | https://spacy.io/usage 16 | 17 | To install spacy, please use: 18 | 19 |
conda install -c conda-forge spacy
20 | 21 | This notebook is tested with version 2.1.3 22 | 23 | As of May 4th, 2019, installing spaCy via "conda install -c conda-forge spacy" delivers version 2.0.8, so I used "pip install -U spacy" to get version 2.1.3 on my computer. 24 | 25 | To check the library version, use the commands below: 26 | 27 |
import spacy
28 | print(spacy.__version__)
29 | 
30 | 31 | 32 | ## How to download the models: 33 | 34 | https://github.com/explosion/spacy-models/releases/ 35 | 36 | To download a model: 37 | 38 |
python -m spacy download pt
39 | python -m spacy download en
40 | 
41 | 42 | or... 43 |
python -m spacy download en_core_web_sm
44 | python -m spacy download pt_core_news_sm
45 | python -m spacy download en_core_web_md
46 | 
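A downloaded package can also be loaded directly by its full name. A minimal sanity check (assuming en_core_web_sm was downloaded as above):

import spacy
# Load the small English model by its package name
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence.")
print([token.text for token in doc])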
47 | 48 | To link a model to refer to it more easily: 49 | 50 |
python -m spacy link en_core_web_md en
51 | python -m spacy link pt_core_news_sm pt
52 | 
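Once a model is linked, it can be loaded by its shortcut name. A minimal sketch (assuming the en_core_web_md and pt_core_news_sm models were downloaded and linked as shown above):

import spacy
nlp_en = spacy.load("en")  # resolves to en_core_web_md via the shortcut link
nlp_pt = spacy.load("pt")  # resolves to pt_core_news_sm via the shortcut link
print(nlp_en.pipe_names)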
53 | 54 | 55 | ## Using nbextensions 56 | 57 | I highly recommend installing the Table of Contents extension from nbextensions; it makes navigating between sections much easier. 58 | 59 | Instructions can be found here: 60 | 61 | https://jupyter-contrib-nbextensions.readthedocs.io/en/latest/install.html 62 | 63 |
conda install -c conda-forge jupyter_contrib_nbextensions
64 |
jupyter nbextension enable toc2
65 | 66 | ## Additional information 67 | 68 | If you have issues rendering these notebooks, you can try nbviewer 69 | 70 | https://nbviewer.jupyter.org/github/Cristianasp/spacy/blob/master/Chapter_01.ipynb 71 | 72 | https://nbviewer.jupyter.org/github/Cristianasp/spacy/blob/master/Chapter_02.ipynb 73 | 74 | https://nbviewer.jupyter.org/github/Cristianasp/spacy/blob/master/Chapter_03.ipynb 75 | 76 | https://nbviewer.jupyter.org/github/Cristianasp/spacy/blob/master/Chapter_04.ipynb 77 | 78 | ## Or you want to use binder ? 79 | 80 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Cristianasp/spacy/master) 81 | 82 | 83 | ## LICENSE 84 | 85 | The materials are licensed under CC BY-SA. 86 | 87 | See here for details: [https://github.com/ines/spacy-course/blob/master/LICENSE] 88 | -------------------------------------------------------------------------------- /exercises/countries.json: -------------------------------------------------------------------------------- 1 | [ 2 | "Afghanistan", 3 | "\u00c5land Islands", 4 | "Albania", 5 | "Algeria", 6 | "American Samoa", 7 | "Andorra", 8 | "Angola", 9 | "Anguilla", 10 | "Antarctica", 11 | "Antigua and Barbuda", 12 | "Argentina", 13 | "Armenia", 14 | "Aruba", 15 | "Australia", 16 | "Austria", 17 | "Azerbaijan", 18 | "Bahamas", 19 | "Bahrain", 20 | "Bangladesh", 21 | "Barbados", 22 | "Belarus", 23 | "Belgium", 24 | "Belize", 25 | "Benin", 26 | "Bermuda", 27 | "Bhutan", 28 | "Bolivia (Plurinational State of)", 29 | "Bonaire, Sint Eustatius and Saba", 30 | "Bosnia and Herzegovina", 31 | "Botswana", 32 | "Bouvet Island", 33 | "Brazil", 34 | "British Indian Ocean Territory", 35 | "United States Minor Outlying Islands", 36 | "Virgin Islands (British)", 37 | "Virgin Islands (U.S.)", 38 | "Brunei Darussalam", 39 | "Bulgaria", 40 | "Burkina Faso", 41 | "Burundi", 42 | "Cambodia", 43 | "Cameroon", 44 | "Canada", 45 | "Cabo Verde", 46 | "Cayman Islands", 47 | "Central African Republic", 48 | "Chad", 49 | "Chile", 50 | "China", 51 | "Christmas Island", 52 | "Cocos (Keeling) Islands", 53 | "Colombia", 54 | "Comoros", 55 | "Congo", 56 | "Congo (Democratic Republic of the)", 57 | "Cook Islands", 58 | "Costa Rica", 59 | "Croatia", 60 | "Cuba", 61 | "Cura\u00e7ao", 62 | "Cyprus", 63 | "Czech Republic", 64 | "Denmark", 65 | "Djibouti", 66 | "Dominica", 67 | "Dominican Republic", 68 | "Ecuador", 69 | "Egypt", 70 | "El Salvador", 71 | "Equatorial Guinea", 72 | "Eritrea", 73 | "Estonia", 74 | "Ethiopia", 75 | "Falkland Islands (Malvinas)", 76 | "Faroe Islands", 77 | "Fiji", 78 | "Finland", 79 | "France", 80 | "French Guiana", 81 | "French Polynesia", 82 | "French Southern Territories", 83 | "Gabon", 84 | "Gambia", 85 | "Georgia", 86 | "Germany", 87 | "Ghana", 88 | "Gibraltar", 89 | "Greece", 90 | "Greenland", 91 | "Grenada", 92 | "Guadeloupe", 93 | "Guam", 94 | "Guatemala", 95 | "Guernsey", 96 | "Guinea", 97 | "Guinea-Bissau", 98 | "Guyana", 99 | "Haiti", 100 | "Heard Island and McDonald Islands", 101 | "Holy See", 102 | "Honduras", 103 | "Hong Kong", 104 | "Hungary", 105 | "Iceland", 106 | "India", 107 | "Indonesia", 108 | "C\u00f4te d'Ivoire", 109 | "Iran (Islamic Republic of)", 110 | "Iraq", 111 | "Ireland", 112 | "Isle of Man", 113 | "Israel", 114 | "Italy", 115 | "Jamaica", 116 | "Japan", 117 | "Jersey", 118 | "Jordan", 119 | "Kazakhstan", 120 | "Kenya", 121 | "Kiribati", 122 | "Kuwait", 123 | "Kyrgyzstan", 124 | "Lao People's Democratic Republic", 125 | "Latvia", 126 | "Lebanon", 127 | "Lesotho", 128 | "Liberia", 129 | "Libya", 130 | 
"Liechtenstein", 131 | "Lithuania", 132 | "Luxembourg", 133 | "Macao", 134 | "Macedonia (the former Yugoslav Republic of)", 135 | "Madagascar", 136 | "Malawi", 137 | "Malaysia", 138 | "Maldives", 139 | "Mali", 140 | "Malta", 141 | "Marshall Islands", 142 | "Martinique", 143 | "Mauritania", 144 | "Mauritius", 145 | "Mayotte", 146 | "Mexico", 147 | "Micronesia (Federated States of)", 148 | "Moldova (Republic of)", 149 | "Monaco", 150 | "Mongolia", 151 | "Montenegro", 152 | "Montserrat", 153 | "Morocco", 154 | "Mozambique", 155 | "Myanmar", 156 | "Namibia", 157 | "Nauru", 158 | "Nepal", 159 | "Netherlands", 160 | "New Caledonia", 161 | "New Zealand", 162 | "Nicaragua", 163 | "Niger", 164 | "Nigeria", 165 | "Niue", 166 | "Norfolk Island", 167 | "Korea (Democratic People's Republic of)", 168 | "Northern Mariana Islands", 169 | "Norway", 170 | "Oman", 171 | "Pakistan", 172 | "Palau", 173 | "Palestine, State of", 174 | "Panama", 175 | "Papua New Guinea", 176 | "Paraguay", 177 | "Peru", 178 | "Philippines", 179 | "Pitcairn", 180 | "Poland", 181 | "Portugal", 182 | "Puerto Rico", 183 | "Qatar", 184 | "Republic of Kosovo", 185 | "R\u00e9union", 186 | "Romania", 187 | "Russian Federation", 188 | "Rwanda", 189 | "Saint Barth\u00e9lemy", 190 | "Saint Helena, Ascension and Tristan da Cunha", 191 | "Saint Kitts and Nevis", 192 | "Saint Lucia", 193 | "Saint Martin (French part)", 194 | "Saint Pierre and Miquelon", 195 | "Saint Vincent and the Grenadines", 196 | "Samoa", 197 | "San Marino", 198 | "Sao Tome and Principe", 199 | "Saudi Arabia", 200 | "Senegal", 201 | "Serbia", 202 | "Seychelles", 203 | "Sierra Leone", 204 | "Singapore", 205 | "Sint Maarten (Dutch part)", 206 | "Slovakia", 207 | "Slovenia", 208 | "Solomon Islands", 209 | "Somalia", 210 | "South Africa", 211 | "South Georgia and the South Sandwich Islands", 212 | "Korea (Republic of)", 213 | "South Sudan", 214 | "Spain", 215 | "Sri Lanka", 216 | "Sudan", 217 | "Suriname", 218 | "Svalbard and Jan Mayen", 219 | "Swaziland", 220 | "Sweden", 221 | "Switzerland", 222 | "Syrian Arab Republic", 223 | "Taiwan", 224 | "Tajikistan", 225 | "Tanzania, United Republic of", 226 | "Thailand", 227 | "Timor-Leste", 228 | "Togo", 229 | "Tokelau", 230 | "Tonga", 231 | "Trinidad and Tobago", 232 | "Tunisia", 233 | "Turkey", 234 | "Turkmenistan", 235 | "Turks and Caicos Islands", 236 | "Tuvalu", 237 | "Uganda", 238 | "Ukraine", 239 | "United Arab Emirates", 240 | "United Kingdom of Great Britain and Northern Ireland", 241 | "United States of America", 242 | "Uruguay", 243 | "Uzbekistan", 244 | "Vanuatu", 245 | "Venezuela (Bolivarian Republic of)", 246 | "Viet Nam", 247 | "Wallis and Futuna", 248 | "Western Sahara", 249 | "Yemen", 250 | "Zambia", 251 | "Zimbabwe" 252 | ] -------------------------------------------------------------------------------- /exercises/capitals.json: -------------------------------------------------------------------------------- 1 | { 2 | "Afghanistan":"Kabul", 3 | "\u00c5land Islands":"Mariehamn", 4 | "Albania":"Tirana", 5 | "Algeria":"Algiers", 6 | "American Samoa":"Pago Pago", 7 | "Andorra":"Andorra la Vella", 8 | "Angola":"Luanda", 9 | "Anguilla":"The Valley", 10 | "Antarctica":"", 11 | "Antigua and Barbuda":"Saint John's", 12 | "Argentina":"Buenos Aires", 13 | "Armenia":"Yerevan", 14 | "Aruba":"Oranjestad", 15 | "Australia":"Canberra", 16 | "Austria":"Vienna", 17 | "Azerbaijan":"Baku", 18 | "Bahamas":"Nassau", 19 | "Bahrain":"Manama", 20 | "Bangladesh":"Dhaka", 21 | "Barbados":"Bridgetown", 22 | "Belarus":"Minsk", 23 | 
"Belgium":"Brussels", 24 | "Belize":"Belmopan", 25 | "Benin":"Porto-Novo", 26 | "Bermuda":"Hamilton", 27 | "Bhutan":"Thimphu", 28 | "Bolivia (Plurinational State of)":"Sucre", 29 | "Bonaire, Sint Eustatius and Saba":"Kralendijk", 30 | "Bosnia and Herzegovina":"Sarajevo", 31 | "Botswana":"Gaborone", 32 | "Bouvet Island":"", 33 | "Brazil":"Bras\u00edlia", 34 | "British Indian Ocean Territory":"Diego Garcia", 35 | "United States Minor Outlying Islands":"", 36 | "Virgin Islands (British)":"Road Town", 37 | "Virgin Islands (U.S.)":"Charlotte Amalie", 38 | "Brunei Darussalam":"Bandar Seri Begawan", 39 | "Bulgaria":"Sofia", 40 | "Burkina Faso":"Ouagadougou", 41 | "Burundi":"Bujumbura", 42 | "Cambodia":"Phnom Penh", 43 | "Cameroon":"Yaound\u00e9", 44 | "Canada":"Ottawa", 45 | "Cabo Verde":"Praia", 46 | "Cayman Islands":"George Town", 47 | "Central African Republic":"Bangui", 48 | "Chad":"N'Djamena", 49 | "Chile":"Santiago", 50 | "China":"Beijing", 51 | "Christmas Island":"Flying Fish Cove", 52 | "Cocos (Keeling) Islands":"West Island", 53 | "Colombia":"Bogot\u00e1", 54 | "Comoros":"Moroni", 55 | "Congo":"Brazzaville", 56 | "Congo (Democratic Republic of the)":"Kinshasa", 57 | "Cook Islands":"Avarua", 58 | "Costa Rica":"San Jos\u00e9", 59 | "Croatia":"Zagreb", 60 | "Cuba":"Havana", 61 | "Cura\u00e7ao":"Willemstad", 62 | "Cyprus":"Nicosia", 63 | "Czech Republic":"Prague", 64 | "Denmark":"Copenhagen", 65 | "Djibouti":"Djibouti", 66 | "Dominica":"Roseau", 67 | "Dominican Republic":"Santo Domingo", 68 | "Ecuador":"Quito", 69 | "Egypt":"Cairo", 70 | "El Salvador":"San Salvador", 71 | "Equatorial Guinea":"Malabo", 72 | "Eritrea":"Asmara", 73 | "Estonia":"Tallinn", 74 | "Ethiopia":"Addis Ababa", 75 | "Falkland Islands (Malvinas)":"Stanley", 76 | "Faroe Islands":"T\u00f3rshavn", 77 | "Fiji":"Suva", 78 | "Finland":"Helsinki", 79 | "France":"Paris", 80 | "French Guiana":"Cayenne", 81 | "French Polynesia":"Papeet\u0113", 82 | "French Southern Territories":"Port-aux-Fran\u00e7ais", 83 | "Gabon":"Libreville", 84 | "Gambia":"Banjul", 85 | "Georgia":"Tbilisi", 86 | "Germany":"Berlin", 87 | "Ghana":"Accra", 88 | "Gibraltar":"Gibraltar", 89 | "Greece":"Athens", 90 | "Greenland":"Nuuk", 91 | "Grenada":"St. George's", 92 | "Guadeloupe":"Basse-Terre", 93 | "Guam":"Hag\u00e5t\u00f1a", 94 | "Guatemala":"Guatemala City", 95 | "Guernsey":"St. 
Peter Port", 96 | "Guinea":"Conakry", 97 | "Guinea-Bissau":"Bissau", 98 | "Guyana":"Georgetown", 99 | "Haiti":"Port-au-Prince", 100 | "Heard Island and McDonald Islands":"", 101 | "Holy See":"Rome", 102 | "Honduras":"Tegucigalpa", 103 | "Hong Kong":"City of Victoria", 104 | "Hungary":"Budapest", 105 | "Iceland":"Reykjav\u00edk", 106 | "India":"New Delhi", 107 | "Indonesia":"Jakarta", 108 | "C\u00f4te d'Ivoire":"Yamoussoukro", 109 | "Iran (Islamic Republic of)":"Tehran", 110 | "Iraq":"Baghdad", 111 | "Ireland":"Dublin", 112 | "Isle of Man":"Douglas", 113 | "Israel":"Jerusalem", 114 | "Italy":"Rome", 115 | "Jamaica":"Kingston", 116 | "Japan":"Tokyo", 117 | "Jersey":"Saint Helier", 118 | "Jordan":"Amman", 119 | "Kazakhstan":"Astana", 120 | "Kenya":"Nairobi", 121 | "Kiribati":"South Tarawa", 122 | "Kuwait":"Kuwait City", 123 | "Kyrgyzstan":"Bishkek", 124 | "Lao People's Democratic Republic":"Vientiane", 125 | "Latvia":"Riga", 126 | "Lebanon":"Beirut", 127 | "Lesotho":"Maseru", 128 | "Liberia":"Monrovia", 129 | "Libya":"Tripoli", 130 | "Liechtenstein":"Vaduz", 131 | "Lithuania":"Vilnius", 132 | "Luxembourg":"Luxembourg", 133 | "Macao":"", 134 | "Macedonia (the former Yugoslav Republic of)":"Skopje", 135 | "Madagascar":"Antananarivo", 136 | "Malawi":"Lilongwe", 137 | "Malaysia":"Kuala Lumpur", 138 | "Maldives":"Mal\u00e9", 139 | "Mali":"Bamako", 140 | "Malta":"Valletta", 141 | "Marshall Islands":"Majuro", 142 | "Martinique":"Fort-de-France", 143 | "Mauritania":"Nouakchott", 144 | "Mauritius":"Port Louis", 145 | "Mayotte":"Mamoudzou", 146 | "Mexico":"Mexico City", 147 | "Micronesia (Federated States of)":"Palikir", 148 | "Moldova (Republic of)":"Chi\u0219in\u0103u", 149 | "Monaco":"Monaco", 150 | "Mongolia":"Ulan Bator", 151 | "Montenegro":"Podgorica", 152 | "Montserrat":"Plymouth", 153 | "Morocco":"Rabat", 154 | "Mozambique":"Maputo", 155 | "Myanmar":"Naypyidaw", 156 | "Namibia":"Windhoek", 157 | "Nauru":"Yaren", 158 | "Nepal":"Kathmandu", 159 | "Netherlands":"Amsterdam", 160 | "New Caledonia":"Noum\u00e9a", 161 | "New Zealand":"Wellington", 162 | "Nicaragua":"Managua", 163 | "Niger":"Niamey", 164 | "Nigeria":"Abuja", 165 | "Niue":"Alofi", 166 | "Norfolk Island":"Kingston", 167 | "Korea (Democratic People's Republic of)":"Pyongyang", 168 | "Northern Mariana Islands":"Saipan", 169 | "Norway":"Oslo", 170 | "Oman":"Muscat", 171 | "Pakistan":"Islamabad", 172 | "Palau":"Ngerulmud", 173 | "Palestine, State of":"Ramallah", 174 | "Panama":"Panama City", 175 | "Papua New Guinea":"Port Moresby", 176 | "Paraguay":"Asunci\u00f3n", 177 | "Peru":"Lima", 178 | "Philippines":"Manila", 179 | "Pitcairn":"Adamstown", 180 | "Poland":"Warsaw", 181 | "Portugal":"Lisbon", 182 | "Puerto Rico":"San Juan", 183 | "Qatar":"Doha", 184 | "Republic of Kosovo":"Pristina", 185 | "R\u00e9union":"Saint-Denis", 186 | "Romania":"Bucharest", 187 | "Russian Federation":"Moscow", 188 | "Rwanda":"Kigali", 189 | "Saint Barth\u00e9lemy":"Gustavia", 190 | "Saint Helena, Ascension and Tristan da Cunha":"Jamestown", 191 | "Saint Kitts and Nevis":"Basseterre", 192 | "Saint Lucia":"Castries", 193 | "Saint Martin (French part)":"Marigot", 194 | "Saint Pierre and Miquelon":"Saint-Pierre", 195 | "Saint Vincent and the Grenadines":"Kingstown", 196 | "Samoa":"Apia", 197 | "San Marino":"City of San Marino", 198 | "Sao Tome and Principe":"S\u00e3o Tom\u00e9", 199 | "Saudi Arabia":"Riyadh", 200 | "Senegal":"Dakar", 201 | "Serbia":"Belgrade", 202 | "Seychelles":"Victoria", 203 | "Sierra Leone":"Freetown", 204 | "Singapore":"Singapore", 205 | "Sint 
Maarten (Dutch part)":"Philipsburg", 206 | "Slovakia":"Bratislava", 207 | "Slovenia":"Ljubljana", 208 | "Solomon Islands":"Honiara", 209 | "Somalia":"Mogadishu", 210 | "South Africa":"Pretoria", 211 | "South Georgia and the South Sandwich Islands":"King Edward Point", 212 | "Korea (Republic of)":"Seoul", 213 | "South Sudan":"Juba", 214 | "Spain":"Madrid", 215 | "Sri Lanka":"Colombo", 216 | "Sudan":"Khartoum", 217 | "Suriname":"Paramaribo", 218 | "Svalbard and Jan Mayen":"Longyearbyen", 219 | "Swaziland":"Lobamba", 220 | "Sweden":"Stockholm", 221 | "Switzerland":"Bern", 222 | "Syrian Arab Republic":"Damascus", 223 | "Taiwan":"Taipei", 224 | "Tajikistan":"Dushanbe", 225 | "Tanzania, United Republic of":"Dodoma", 226 | "Thailand":"Bangkok", 227 | "Timor-Leste":"Dili", 228 | "Togo":"Lom\u00e9", 229 | "Tokelau":"Fakaofo", 230 | "Tonga":"Nuku'alofa", 231 | "Trinidad and Tobago":"Port of Spain", 232 | "Tunisia":"Tunis", 233 | "Turkey":"Ankara", 234 | "Turkmenistan":"Ashgabat", 235 | "Turks and Caicos Islands":"Cockburn Town", 236 | "Tuvalu":"Funafuti", 237 | "Uganda":"Kampala", 238 | "Ukraine":"Kiev", 239 | "United Arab Emirates":"Abu Dhabi", 240 | "United Kingdom of Great Britain and Northern Ireland":"London", 241 | "United States of America":"Washington, D.C.", 242 | "Uruguay":"Montevideo", 243 | "Uzbekistan":"Tashkent", 244 | "Vanuatu":"Port Vila", 245 | "Venezuela (Bolivarian Republic of)":"Caracas", 246 | "Viet Nam":"Hanoi", 247 | "Wallis and Futuna":"Mata-Utu", 248 | "Western Sahara":"El Aai\u00fan", 249 | "Yemen":"Sana'a", 250 | "Zambia":"Lusaka", 251 | "Zimbabwe":"Harare" 252 | } -------------------------------------------------------------------------------- /Chapter_04.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Training a neural network model\n", 8 | "\n", 9 | "https://course.spacy.io/chapter4\n", 10 | "\n", 11 | "In this chapter, you'll learn how to update spaCy's statistical models to customize them for your use case – for example, to predict a new entity type in online comments. You'll write your own training loop from scratch, and understand the basics of how training works, along with tips and tricks that can make your custom NLP projects more successful.\n" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Training and updating models\n", 19 | "\n", 20 | "Welcome to the final chapter, which is about one of the most exciting aspects of modern NLP: training your own models!\n", 21 | "\n", 22 | "In this lesson, you'll learn about training and updating spaCy's neural network models and the data you need for it – focusing specifically on the named entity recognizer." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Why updating the model?\n", 30 | "\n", 31 | "* Better results on your specific domain\n", 32 | "* Learn classification schemes specifically for your problem\n", 33 | "* Essential for text classification\n", 34 | "* Very useful for named entity recognition\n", 35 | "* Less critical for part-of-speech tagging and dependency parsing\n", 36 | "\n", 37 | "Before we get starting with explaining how, it's worth taking a second to ask ourselves: Why would we want to update the model with our own examples? 
Why can't we just rely on pre-trained models?\n", 38 | "\n", 39 | "Statistical models make predictions based on the examples they were trained on.\n", 40 | "\n", 41 | "You can usually make the model more accurate by showing it examples from your domain.\n", 42 | "\n", 43 | "You often also want to predict categories specific to your problem, so the model needs to learn about them.\n", 44 | "\n", 45 | "This is essential for text classification, very useful for entity recognition and a little less critical for tagging and parsing." 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### How training works\n", 53 | "\n", 54 | "1. Initialize the model weights randomly with nlp.begin_training\n", 55 | "\n", 56 | "2. Predict a few examples with the current weights by calling nlp.update\n", 57 | "\n", 58 | "3. Compare prediction with true labels\n", 59 | "\n", 60 | "4. Calculate how to change weights to improve predictions\n", 61 | "\n", 62 | "5. Update weights slightly\n", 63 | "\n", 64 | "6. Go back to 2.\n", 65 | "\n", 66 | "spaCy supports updating existing models with more examples, and training new models.\n", 67 | "\n", 68 | "If we're not starting with a pre-trained model, we first initialize the weights randomly.\n", 69 | "\n", 70 | "Next, we call nlp dot update, which predicts a batch of examples with the current weights.\n", 71 | "\n", 72 | "The model then checks the predictions against the correct answers, and decides how to change the weights to achieve better predictions next time.\n", 73 | "\n", 74 | "Finally, we make a small correction to the current weights and move on to the next batch of examples.\n", 75 | "\n", 76 | "We continue calling nlp dot update for each batch of examples in the data.\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "Here's an illustration showing the process.\n", 84 | "\n", 85 | "\n", 86 | "\n", 87 | "The training data are the examples we want to update the model with.\n", 88 | "\n", 89 | "The text should be a sentence, paragraph or longer document. For the best results, it should be similar to what the model will see at runtime.\n", 90 | "\n", 91 | "The label is what we want the model to predict. This can be a text category, or an entity span and its type.\n", 92 | "\n", 93 | "The gradient is how we should change the model to reduce the current error. It's computed when we compare the predicted label to the true label.\n", 94 | "\n", 95 | "After training, we can then save out an updated model and use it in our application." 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "* Training data: Examples and their annotations.\n", 103 | "\n", 104 | "* Text: The input text the model should predict a label for.\n", 105 | "\n", 106 | "* Label: The label the model should predict.\n", 107 | "\n", 108 | "* Gradient: How to change the weights.\n", 109 | "\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "### Training the entity recognizer\n", 117 | "\n", 118 | "Let's look at an example for a specific component: the entity recognizer.\n", 119 | "\n", 120 | "The entity recognizer takes a document and predicts phrases and their labels. 
This means that the training data needs to include texts, the entities they contain, and the entity labels.\n", 121 | "\n", 122 | "Entities can't overlap, so each token can only be part of one entity.\n", 123 | "\n", 124 | "Because the entity recognizer predicts entities in context, it also needs to be trained on entities and their surrounding context.\n", 125 | "\n", 126 | "The easiest way to do this is to show the model a text and a list of character offsets. For example, \"iPhone X\" is a gadget, starts at character 0 and ends at character 8.\n", 127 | "\n", 128 | "It's also very important for the model to learn words that aren't entities.\n", 129 | "\n", 130 | "In this case, the list of span annotations will be empty.\n", 131 | "\n", 132 | "Our goal is to teach the model to recognize new entities in similar contexts, even if they weren't in the training data." 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "* The entity recognizer tags words and phrases in context\n", 140 | "\n", 141 | "* Each token can only be part of one entity\n", 142 | "\n", 143 | "* Examples need to come with context\n", 144 | "\n", 145 | "(\"iPhone X is coming\", {'entities': [(0, 8, 'GADGET')]})\n", 146 | "\n", 147 | "* Texts with no entities are also important\n", 148 | "\n", 149 | "(\"I need a new phone! Any tips?\", {'entities': []})\n", 150 | "\n", 151 | "* Goal: teach the model to generalize" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### The training data\n", 159 | "\n", 160 | "The training data tells the model what we want it to predict. This could be texts and named entities we want to recognize, or tokens and their correct part-of-speech tags.\n", 161 | "\n", 162 | "To update an existing model, we can start with a few hundred to a few thousand examples.\n", 163 | "\n", 164 | "To train a new category we may need up to a million.\n", 165 | "\n", 166 | "spaCy's pre-trained English models for instance were trained on 2 million words labelled with part-of-speech tags, dependencies and named entities.\n", 167 | "\n", 168 | "Training data is usually created by humans who assign labels to texts.\n", 169 | "\n", 170 | "This is a lot of work, but can be semi-automated – for example, using spaCy's Matcher." 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "* Examples of what we want the model to predict in context\n", 178 | "\n", 179 | "* Update an existing model: a few hundred to a few thousand examples\n", 180 | " \n", 181 | "* Train a new category: a few thousand to a million examples\n", 182 | " \n", 183 | "* spaCy's English models: 2 million words\n", 184 | "\n", 185 | "* Usually created manually by human annotators\n", 186 | "\n", 187 | "* Can be semi-automated – for example, using spaCy's Matcher!" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "Now it's time to get started and prepare the training data. Let's look at some examples and create a small dataset for a new entity type." 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "## Purpose of training" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "While spaCy comes with a range of pre-trained models to predict linguistic annotations, you almost always want to fine-tune them with more examples. 
You can do this by training them with more labelled data.\n", 209 | "\n", 210 | "What does training not help with?\n", 211 | "\n", 212 | "( ) Improve model accuracy on your data.\n", 213 | "\n", 214 | "( ) Learn new classification schemes.\n", 215 | "\n", 216 | "(X) Discover patterns in unlabelled data.\n", 217 | "\n" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "## Creating training data\n", 225 | "\n", 226 | "spaCy’s rule-based *Matcher* is a great way to quickly create training data for named entity models. A list of sentences is available as the variable *TEXTS*. You can print it the IPython shell to inspect it. We want to find all mentions of different iPhone models, so we can create training data to teach a model to recognize them as 'GADGET'.\n", 227 | "\n", 228 | "Write a pattern for two tokens whose lowercase forms match 'iphone' and 'x'.\n", 229 | "Write a pattern for two tokens: one token whose lowercase form matches 'iphone' and an optional digit using the '?' operator." 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 7, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "import json\n", 239 | "from spacy.matcher import Matcher\n", 240 | "from spacy.lang.en import English\n", 241 | "\n", 242 | "with open(\"exercises/iphone.json\") as f:\n", 243 | " TEXTS = json.loads(f.read())\n", 244 | "\n", 245 | "nlp = English()\n", 246 | "matcher = Matcher(nlp.vocab)\n", 247 | "\n", 248 | "# Two tokens whose lowercase forms match 'iphone' and 'x'\n", 249 | "pattern1 = [{\"LOWER\": \"iphone\"}, {\"LOWER\": \"x\"}]\n", 250 | "\n", 251 | "# Token whose lowercase form matches 'iphone' and an optional digit\n", 252 | "pattern2 = [{\"LOWER\": \"iphone\"}, {\"IS_DIGIT\": True, \"OP\": \"?\"}]\n", 253 | "\n", 254 | "# Add patterns to the matcher\n", 255 | "matcher.add(\"GADGET\", None, pattern1, pattern2)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Let’s use the match patterns we’ve created in the previous exercise to bootstrap a set of training examples. A list of sentences is available as the variable TEXTS.\n", 263 | "\n", 264 | "* Create a doc object for each text using nlp.pipe.\n", 265 | "* Match on the doc and create a list of matched spans.\n", 266 | "* Get (start character, end character, label) tuples of matched spans.\n", 267 | "* Format each example as a tuple of the text and a dict, mapping 'entities' to the entity tuples.\n", 268 | "* Append the example to TRAINING_DATA and inspect the printed data." 
269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 24, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "document: How to preorder the iPhone X\n", 281 | "[iPhone X, iPhone]\n", 282 | "[(20, 28, 'GADGET'), (20, 26, 'GADGET')]\n", 283 | "('How to preorder the iPhone X', {'entities': [(20, 28, 'GADGET'), (20, 26, 'GADGET')]})\n", 284 | "\n", 285 | "document: iPhone X is coming\n", 286 | "[iPhone X, iPhone]\n", 287 | "[(0, 8, 'GADGET'), (0, 6, 'GADGET')]\n", 288 | "('iPhone X is coming', {'entities': [(0, 8, 'GADGET'), (0, 6, 'GADGET')]})\n", 289 | "\n", 290 | "document: Should I pay $1,000 for the iPhone X?\n", 291 | "[iPhone X, iPhone]\n", 292 | "[(28, 36, 'GADGET'), (28, 34, 'GADGET')]\n", 293 | "('Should I pay $1,000 for the iPhone X?', {'entities': [(28, 36, 'GADGET'), (28, 34, 'GADGET')]})\n", 294 | "\n", 295 | "document: The iPhone 8 reviews are here\n", 296 | "[iPhone 8]\n", 297 | "[(4, 12, 'GADGET')]\n", 298 | "('The iPhone 8 reviews are here', {'entities': [(4, 12, 'GADGET')]})\n", 299 | "\n", 300 | "document: Your iPhone goes up to 11 today\n", 301 | "[iPhone]\n", 302 | "[(5, 11, 'GADGET')]\n", 303 | "('Your iPhone goes up to 11 today', {'entities': [(5, 11, 'GADGET')]})\n", 304 | "\n", 305 | "document: I need a new phone! Any tips?\n", 306 | "[]\n", 307 | "[]\n", 308 | "('I need a new phone! Any tips?', {'entities': []})\n", 309 | "\n", 310 | "('How to preorder the iPhone X', {'entities': [(20, 28, 'GADGET'), (20, 26, 'GADGET')]})\n", 311 | "('iPhone X is coming', {'entities': [(0, 8, 'GADGET'), (0, 6, 'GADGET')]})\n", 312 | "('Should I pay $1,000 for the iPhone X?', {'entities': [(28, 36, 'GADGET'), (28, 34, 'GADGET')]})\n", 313 | "('The iPhone 8 reviews are here', {'entities': [(4, 12, 'GADGET')]})\n", 314 | "('Your iPhone goes up to 11 today', {'entities': [(5, 11, 'GADGET')]})\n", 315 | "('I need a new phone! 
Any tips?', {'entities': []})\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "import json\n", 321 | "from spacy.matcher import Matcher\n", 322 | "from spacy.lang.en import English\n", 323 | "\n", 324 | "with open(\"exercises/iphone.json\") as f:\n", 325 | " TEXTS = json.loads(f.read())\n", 326 | "\n", 327 | "nlp = English()\n", 328 | "matcher = Matcher(nlp.vocab)\n", 329 | "pattern1 = [{\"LOWER\": \"iphone\"}, {\"LOWER\": \"x\"}]\n", 330 | "pattern2 = [{\"LOWER\": \"iphone\"}, {\"IS_DIGIT\": True, \"OP\": \"?\"}]\n", 331 | "matcher.add(\"GADGET\", None, pattern1, pattern2)\n", 332 | "\n", 333 | "TRAINING_DATA = []\n", 334 | "\n", 335 | "# Create a Doc object for each text in TEXTS\n", 336 | "for doc in nlp.pipe(TEXTS):\n", 337 | " print(\"document: \"+doc.text)\n", 338 | " # Match on the doc and create a list of matched spans\n", 339 | " spans = [doc[start:end] for match_id, start, end in matcher(doc)]\n", 340 | " print(spans)\n", 341 | " # Get (start character, end character, label) tuples of matches\n", 342 | " entities = [(span.start_char, span.end_char, \"GADGET\") for span in spans]\n", 343 | " print(entities)\n", 344 | " # Format the matches as a (doc.text, entities) tuple\n", 345 | " training_example = (doc.text, {\"entities\": entities})\n", 346 | " print(training_example)\n", 347 | " # Append the example to the training data\n", 348 | " TRAINING_DATA.append(training_example)\n", 349 | " print()\n", 350 | "print(*TRAINING_DATA, sep=\"\\n\")" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "## The training loop\n", 358 | "\n", 359 | "While some other libraries give you one method that takes care of training a model, spaCy gives you full control over the training loop." 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "### The steps of a training loop:\n", 367 | " \n", 368 | "1 Loop for a number of times.\n", 369 | "\n", 370 | "2 Shuffle the training data.\n", 371 | "\n", 372 | "3 Divide the data into batches.\n", 373 | "\n", 374 | "4 Update the model for each batch.\n", 375 | "\n", 376 | "5 Save the updated model." 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "The training loop is a series of steps that's performed to train or update a model.\n", 384 | "\n", 385 | "We usually need to perform it several times, for multiple iterations, so that the model can learn from it effectively. If we want to train for 10 iterations, we need to loop 10 times.\n", 386 | "\n", 387 | "To prevent the model from getting stuck in a suboptimal solution, we randomly shuffle the data for each iteration. This is a very common strategy when doing stochastic gradient descent.\n", 388 | "\n", 389 | "Next, we divide the training data into batches of several examples, also known as minibatching. This makes it easier to make a more accurate estimate of the gradient.\n", 390 | "\n", 391 | "Finally, we update the model for each batch, and start the loop again until we've reached the last iteration.\n", 392 | "\n", 393 | "We can then save the model to a directory and use it in spaCy." 
394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "### Recap: how training works\n", 401 | "\n", 402 | "\n", 403 | "\n", 404 | "* Training data: Examples and their annotations.\n", 405 | "* Text: The input text the model should predict a label for.\n", 406 | "* Label: The label the model should predict.\n", 407 | "* Gradient: How to change the weights." 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": {}, 413 | "source": [ 414 | "To recap:\n", 415 | "\n", 416 | "The training data are the examples we want to update the model with.\n", 417 | "\n", 418 | "The text should be a sentence, paragraph or longer document. For the best results, it should be similar to what the model will see at runtime.\n", 419 | "\n", 420 | "The label is what we want the model to predict. This can be a text category, or an entity span and its type.\n", 421 | "\n", 422 | "The gradient is how we should change the model to reduce the current error. It's computed when we compare the predicted label to the true label." 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "### Example loop\n", 430 | "\n", 431 | "Here's an example.\n", 432 | "\n", 433 | "Let's imagine we have a list of training examples consisting of texts and entity annotations.\n", 434 | "\n", 435 | "We want to loop for 10 iterations, so we're iterating over a range of 10.\n", 436 | "\n", 437 | "Next, we use the random module to randomly shuffle the training data.\n", 438 | "\n", 439 | "We then use spaCy's minibatch utility function to divide the examples into batches.\n", 440 | "\n", 441 | "For each batch, we get the texts and annotations and call the nlp dot update method to update the model.\n", 442 | "\n", 443 | "Finally, we call the nlp dot to disk method to save the trained model to a directory." 
444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "'''\n", 453 | "TRAINING_DATA = [\n", 454 | " (\"How to preorder the iPhone X\", {'entities': [(20, 28, 'GADGET')]})\n", 455 | " # And many more examples...\n", 456 | "]\n", 457 | "'''" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 24, 463 | "metadata": { 464 | "collapsed": true 465 | }, 466 | "outputs": [ 467 | { 468 | "data": { 469 | "text/plain": [ 470 | "'\\n# Loop for 10 iterations\\nfor i in range(10):\\n # Shuffle the training data\\n random.shuffle(TRAINING_DATA)\\n # Create batches and iterate over them\\n for batch in spacy.util.minibatch(TRAINING_DATA):\\n # Split the batch in texts and annotations\\n texts = [text for text, annotation in batch]\\n annotations = [annotation for text, annotation in batch]\\n # Update the model\\n nlp.update(texts, annotations)\\n\\n# Save the model\\nnlp.to_disk(\"example\")\\n'" 471 | ] 472 | }, 473 | "execution_count": 24, 474 | "metadata": {}, 475 | "output_type": "execute_result" 476 | } 477 | ], 478 | "source": [ 479 | "'''\n", 480 | "# Loop for 10 iterations\n", 481 | "for i in range(10):\n", 482 | " # Shuffle the training data\n", 483 | " random.shuffle(TRAINING_DATA)\n", 484 | " # Create batches and iterate over them\n", 485 | " for batch in spacy.util.minibatch(TRAINING_DATA):\n", 486 | " # Split the batch in texts and annotations\n", 487 | " texts = [text for text, annotation in batch]\n", 488 | " annotations = [annotation for text, annotation in batch]\n", 489 | " # Update the model\n", 490 | " nlp.update(texts, annotations)\n", 491 | "\n", 492 | "# Save the model\n", 493 | "nlp.to_disk(\"example\")\n", 494 | "'''" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": {}, 500 | "source": [ 501 | "### Update an existing model\n", 502 | "\n", 503 | "* Improve the predictions on new data\n", 504 | "* Especially useful to improve existing categories, like PERSON\n", 505 | "* Also possible to add new categories\n", 506 | "* Be careful and make sure the model doesn't \"forget\" the old ones" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "spaCy lets you update an existing pre-trained model with more data – for example, to improve its predictions on different texts.\n", 514 | "\n", 515 | "This is especially useful if you want to improve categories the model already knows, like \"person\" or \"organization\".\n", 516 | "\n", 517 | "You can also update a model to add new categories.\n", 518 | "\n", 519 | "Just make sure to always update it with examples of the new category and examples of the other categories it previously predicted correctly. Otherwise improving the new category might hurt the other categories." 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "### Setting up a new pipeline from scratch\n", 527 | "\n", 528 | "In this example, we start off with a blank English model using the spacy dot blank method. 
The blank model doesn't have any pipeline components, only the language data and tokenization rules.\n", 529 | "\n", 530 | "We then create a blank entity recognizer and add it to the pipeline.\n", 531 | "\n", 532 | "Using the \"add label\" method, we can add new string labels to the model.\n", 533 | "\n", 534 | "We can now call nlp dot begin training to initialize the model with random weights.\n", 535 | "\n", 536 | "To get better accuracy, we want to loop over the examples more than once and randomly shuffle the data on each iteration.\n", 537 | "\n", 538 | "On each iteration, we divide the examples into batches using spaCy's minibatch utility function. Each example consists of a text and its annotations.\n", 539 | "\n", 540 | "Finally, we update the model with the texts and annotations and continue the loop." 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 25, 546 | "metadata": { 547 | "collapsed": true 548 | }, 549 | "outputs": [ 550 | { 551 | "data": { 552 | "text/plain": [ 553 | "\"\\n# Start with blank English model\\nnlp = spacy.blank('en')\\n# Create blank entity recognizer and add it to the pipeline\\nner = nlp.create_pipe('ner')\\nnlp.add_pipe(ner)\\n# Add a new label\\nner.add_label('GADGET')\\n\\n# Start the training\\nnlp.begin_training()\\n# Train for 10 iterations\\nfor itn in range(10):\\n random.shuffle(examples)\\n # Divide examples into batches\\n for batch in spacy.util.minibatch(examples, size=2):\\n texts = [text for text, annotation in batch]\\n annotations = [annotation for text, annotation in batch]\\n # Update the model\\n nlp.update(texts, annotations)\\n \"" 554 | ] 555 | }, 556 | "execution_count": 25, 557 | "metadata": {}, 558 | "output_type": "execute_result" 559 | } 560 | ], 561 | "source": [ 562 | "'''\n", 563 | "# Start with blank English model\n", 564 | "nlp = spacy.blank('en')\n", 565 | "# Create blank entity recognizer and add it to the pipeline\n", 566 | "ner = nlp.create_pipe('ner')\n", 567 | "nlp.add_pipe(ner)\n", 568 | "# Add a new label\n", 569 | "ner.add_label('GADGET')\n", 570 | "\n", 571 | "# Start the training\n", 572 | "nlp.begin_training()\n", 573 | "# Train for 10 iterations\n", 574 | "for itn in range(10):\n", 575 | " random.shuffle(examples)\n", 576 | " # Divide examples into batches\n", 577 | " for batch in spacy.util.minibatch(examples, size=2):\n", 578 | " texts = [text for text, annotation in batch]\n", 579 | " annotations = [annotation for text, annotation in batch]\n", 580 | " # Update the model\n", 581 | " nlp.update(texts, annotations)\n", 582 | " '''" 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "metadata": {}, 588 | "source": [ 589 | "Time to practice! Now that you've seen the training loop, let's use the data created in the previous exercise to update a model." 
590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "metadata": {}, 595 | "source": [ 596 | "## Setting up the pipeline\n", 597 | "\n", 598 | "In this exercise, you’ll prepare a spaCy pipeline to train the entity recognizer to recognize 'GADGET' entities in a text – for example, “iPhone X”.\n", 599 | "\n", 600 | "* Create a blank 'en' model, for example using the spacy.blank method.\n", 601 | "* Create a new entity recognizer using nlp.create_pipe and add it to the pipeline.\n", 602 | "* Add the new label 'GADGET' to the entity recognizer using the add_label method on the pipeline component.\n" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "import spacy\n", 612 | "\n", 613 | "# Create a blank 'en' model\n", 614 | "nlp = spacy.blank(\"en\")\n", 615 | "\n", 616 | "# Create a new entity recognizer and add it to the pipeline\n", 617 | "ner = nlp.create_pipe(\"ner\")\n", 618 | "nlp.add_pipe(ner)\n", 619 | "\n", 620 | "# Add the label 'GADGET' to the entity recognizer\n", 621 | "ner.add_label(\"GADGET\")" 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": {}, 627 | "source": [ 628 | "## Building a training loop\n", 629 | "\n", 630 | "Let’s write a simple training loop from scratch!\n", 631 | "\n", 632 | "The pipeline you’ve created in the previous exercise is available as the nlp object. It already contains the entity recognizer with the added label 'GADGET'.\n", 633 | "\n", 634 | "The small set of labelled examples that you’ve created previously is available as TRAINING_DATA. To see the examples, you can print them in your script.\n", 635 | "\n", 636 | "Call nlp.begin_training, create a training loop for 10 iterations and shuffle the training data.\n", 637 | "Create batches of training data using spacy.util.minibatch and iterate over the batches.\n", 638 | "Convert the (text, annotations) tuples in the batch to lists of texts and annotations.\n", 639 | "For each batch, use nlp.update to update the model with the texts and annotations." 
640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 26, 645 | "metadata": {}, 646 | "outputs": [ 647 | { 648 | "name": "stdout", 649 | "output_type": "stream", 650 | "text": [ 651 | "2.2457692833\n", 652 | "3.2140585109\n", 653 | "3.4722888768\n", 654 | "0.0000015177\n", 655 | "0.5200842448\n", 656 | "0.5200924764\n", 657 | "0.0000000003\n", 658 | "0.0000000077\n", 659 | "0.0000000077\n", 660 | "0.0000000000\n", 661 | "0.0000077896\n", 662 | "0.0000088767\n", 663 | "0.0000000349\n", 664 | "0.0000000349\n", 665 | "0.0000000354\n", 666 | "0.0000000000\n", 667 | "0.0000000003\n", 668 | "0.0000000003\n", 669 | "0.0000000000\n", 670 | "0.0000000000\n", 671 | "0.0000000001\n", 672 | "0.0000000001\n", 673 | "0.0000000002\n", 674 | "0.0000000002\n", 675 | "0.0000000006\n", 676 | "0.0000000006\n", 677 | "0.0000000006\n", 678 | "0.0000000000\n", 679 | "0.0000000000\n", 680 | "0.0000000050\n" 681 | ] 682 | } 683 | ], 684 | "source": [ 685 | "import spacy\n", 686 | "import random\n", 687 | "import json\n", 688 | " \n", 689 | "with open(\"exercises/gadgets.json\") as f:\n", 690 | " TRAINING_DATA = json.loads(f.read())\n", 691 | "\n", 692 | "nlp = spacy.blank(\"en\")\n", 693 | "ner = nlp.create_pipe(\"ner\")\n", 694 | "nlp.add_pipe(ner)\n", 695 | "ner.add_label(\"GADGET\")\n", 696 | "\n", 697 | "nlp.vocab.vectors.name = 'example'\n", 698 | "\n", 699 | "# Start the training\n", 700 | "nlp.begin_training()\n", 701 | "\n", 702 | "# Loop for 10 iterations\n", 703 | "for itn in range(10):\n", 704 | " # Shuffle the training data\n", 705 | " random.shuffle(TRAINING_DATA)\n", 706 | " losses = {}\n", 707 | "\n", 708 | " # Batch the examples and iterate over them\n", 709 | " for batch in spacy.util.minibatch(TRAINING_DATA, size=2):\n", 710 | " texts = [text for text, entities in batch]\n", 711 | " annotations = [entities for text, entities in batch]\n", 712 | "\n", 713 | " # Update the model\n", 714 | " nlp.update(texts, annotations, losses=losses)\n", 715 | " print(\"{0:.10f}\".format(losses['ner']) )" 716 | ] 717 | }, 718 | { 719 | "cell_type": "markdown", 720 | "metadata": {}, 721 | "source": [ 722 | "## Exploring the model\n", 723 | "\n", 724 | "Let’s see how the model performs on unseen data! To speed things up a little, we already ran a trained model for the label 'GADGET' over some text. Here are some of the results:\n", 725 | "\n", 726 | "" 727 | ] 728 | }, 729 | { 730 | "cell_type": "markdown", 731 | "metadata": {}, 732 | "source": [ 733 | "Out of all the entities in the texts, how many did the model get correct? \n", 734 | "\n", 735 | "Keep in mind that incomplete entity spans count as mistakes, too! Tip: Count the number of entities that the model should have predicted. Then count the number of entities it actually predicted correctly and divide it by the number of total correct entities.\n", 736 | "\n", 737 | "( ) 45%\n", 738 | "\n", 739 | "( ) 60%\n", 740 | "\n", 741 | "(X) 70%\n", 742 | "\n", 743 | "( ) 90%" 744 | ] 745 | }, 746 | { 747 | "cell_type": "markdown", 748 | "metadata": {}, 749 | "source": [ 750 | "## Training best practices" 751 | ] 752 | }, 753 | { 754 | "cell_type": "markdown", 755 | "metadata": {}, 756 | "source": [ 757 | "When you start running your own experiments, you might find that a lot of things just don't work the way you want them to. 
And that's okay.\n", 758 | "\n", 759 | "Training models is an iterative process, and you have to try different things until you find out what works best.\n", 760 | "\n", 761 | "In this lesson, I'll be sharing some best practices and things to keep in mind when training your own models.\n", 762 | "\n", 763 | "Let's take a look at some of the problems you may come across." 764 | ] 765 | }, 766 | { 767 | "cell_type": "markdown", 768 | "metadata": {}, 769 | "source": [ 770 | "### Problem 1: Models can \"forget\" things\n", 771 | "\n", 772 | "* Existing model can overfit on new data\n", 773 | "e.g.: if you only update it with WEBSITE, it can \"unlearn\" what a PERSON is\n", 774 | "\n", 775 | "* Also known as \"catastrophic forgetting\" problem\n", 776 | "\n", 777 | "Statistical models can learn lots of things – but it doesn't mean that they won't unlearn them.\n", 778 | "\n", 779 | "If you're updating an existing model with new data, especially new labels, it can overfit and adjust too much to the new examples.\n", 780 | "\n", 781 | "For instance, if you're only updating it with examples of \"website\", it may \"forget\" other labels it previously predicted correctly – like \"person\".\n", 782 | "\n", 783 | "This is also known as the catastrophic forgetting problem." 784 | ] 785 | }, 786 | { 787 | "cell_type": "markdown", 788 | "metadata": {}, 789 | "source": [ 790 | "### Solution 1: Mix in previously correct predictions\n", 791 | "\n", 792 | "To prevent this, make sure to always mix in examples of what the model previously got correct.\n", 793 | "\n", 794 | "If you're training a new category \"website\", also include examples of \"person\".\n", 795 | "\n", 796 | "spaCy can help you with this. You can create those additional examples by running the existing model over data and extracting the entity spans you care about.\n", 797 | "\n", 798 | "You can then mix those examples in with your existing data and update the model with annotations of all labels.\n", 799 | "\n", 800 | "For example, if you're training WEBSITE, also include examples of PERSON\n", 801 | "\n", 802 | "Run existing spaCy model over data and extract all other relevant entities\n", 803 | "\n", 804 | "**BAD:**\n", 805 | "\n", 806 | "TRAINING_DATA = [\n", 807 | " ('Reddit is a website', {'entities': [(0, 6, 'WEBSITE')]})\n", 808 | "]\n", 809 | "\n", 810 | "**GOOD:**\n", 811 | "\n", 812 | "TRAINING_DATA = [\n", 813 | " ('Reddit is a website', {'entities': [(0, 6, 'WEBSITE')]}),\n", 814 | " ('Obama is a person', {'entities': [(0, 5, 'PERSON')]})\n", 815 | "]" 816 | ] 817 | }, 818 | { 819 | "cell_type": "markdown", 820 | "metadata": {}, 821 | "source": [ 822 | "### Problem 2: Models can't learn everything\n", 823 | "\n", 824 | "Another common problem is that your model just won't learn what you want it to.\n", 825 | "\n", 826 | "spaCy's models make predictions based on the local context – for example, for named entities, the surrounding words are most important.\n", 827 | "\n", 828 | "If the decision is difficult to make based on the context, the model can struggle to learn it.\n", 829 | "\n", 830 | "The label scheme also needs to be consistent and not too specific.\n", 831 | "\n", 832 | "For example, it may be very difficult to teach a model to predict whether something is adult clothing or children's clothing based on the context. 
However, just predicting the label \"clothing\" may work better.\n", 833 | "\n", 834 | "spaCy's models make predictions based on local context:\n", 835 | "\n", 836 | "* Model can struggle to learn if decision is difficult to make based on context\n", 837 | "\n", 838 | "* Label scheme needs to be consistent and not too specific\n", 839 | "\n", 840 | "For example: CLOTHING is better than ADULT_CLOTHING and CHILDRENS_CLOTHING" 841 | ] 842 | }, 843 | { 844 | "cell_type": "markdown", 845 | "metadata": {}, 846 | "source": [ 847 | "### Solution 2: Plan your label scheme carefully\n", 848 | "\n", 849 | "Before you start training and updating models, it's worth taking a step back and planning your label scheme.\n", 850 | "\n", 851 | "Try to pick categories that are reflected in the local context and make them more generic if possible.\n", 852 | "\n", 853 | "You can always add a rule-based system later to go from generic to specific.\n", 854 | "\n", 855 | "Generic categories like \"clothing\" or \"band\" are both easier to label and easier to learn.\n", 856 | "\n", 857 | "Pick categories that are reflected in local context:\n", 858 | " \n", 859 | "* More generic is better than too specific\n", 860 | "\n", 861 | "* Use rules to go from generic labels to specific categories\n", 862 | "\n", 863 | "**BAD:**\n", 864 | "\n", 865 | "LABELS = ['ADULT_SHOES', 'CHILDRENS_SHOES', 'BANDS_I_LIKE']\n", 866 | " \n", 867 | "**GOOD:**\n", 868 | "\n", 869 | "LABELS = ['CLOTHING', 'BAND']" 870 | ] 871 | }, 872 | { 873 | "cell_type": "markdown", 874 | "metadata": {}, 875 | "source": [ 876 | "## Good data vs. bad data\n", 877 | "\n", 878 | "Here’s an excerpt from a training set that labels the entity type TOURIST_DESTINATION in traveler reviews." 879 | ] 880 | }, 881 | { 882 | "cell_type": "code", 883 | "execution_count": 32, 884 | "metadata": {}, 885 | "outputs": [], 886 | "source": [ 887 | "TRAINING_DATA = [\n", 888 | " (\n", 889 | " \"i went to amsterdem last year and the canals were beautiful\",\n", 890 | " {\"entities\": [(10, 19, \"TOURIST_DESTINATION\")]},\n", 891 | " ),\n", 892 | " (\n", 893 | " \"You should visit Paris once in your life, but the Eiffel Tower is kinda boring\",\n", 894 | " {\"entities\": [(17, 22, \"TOURIST_DESTINATION\")]},\n", 895 | " ),\n", 896 | " (\"There's also a Paris in Arkansas, lol\", {\"entities\": []}),\n", 897 | " (\n", 898 | " \"Berlin is perfect for summer holiday: lots of parks, great nightlife, cheap beer!\",\n", 899 | " {\"entities\": [(0, 6, \"TOURIST_DESTINATION\")]},\n", 900 | " ),\n", 901 | "]" 902 | ] 903 | }, 904 | { 905 | "cell_type": "markdown", 906 | "metadata": {}, 907 | "source": [ 908 | "Why is this data and label scheme problematic? \n", 909 | "\n", 910 | "( X ) Whether a place is a tourist destination is a subjective judgement and not a definitive category. It will be very difficult for the entity recognizer to learn.\n", 911 | "\n", 912 | "( ) Paris and Arkansas should also be labelled as tourist destinations for consistency. Otherwise, the model will be confused.\n", 913 | "\n", 914 | "( ) Rare out-of-vocabulary words like the misspelled 'amsterdem' shouldn't be labelled as entities.\n", 915 | "\n", 916 | "That's correct! A much better approach would be to only label GPE (geopolitical entity) or LOCATION and then use a rule-based system to determine whether the entity is a tourist destination in this context. For example, you could resolve the entity types back to a knowledge base or look them up in a travel wiki."
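To make that rule-based step concrete, here is a minimal sketch (not part of the original exercise): the TOURIST_DESTINATIONS set below is a made-up stand-in for a knowledge base or travel-wiki lookup, and the small English model is assumed to be installed, as elsewhere in this course.

import spacy

# Hypothetical lookup standing in for a knowledge base or travel wiki
TOURIST_DESTINATIONS = {"Paris", "Berlin", "Amsterdam"}

nlp = spacy.load("en_core_web_sm")

def is_tourist_destination(ent):
    # Rule-based step: a predicted GPE counts as a destination if it's in the lookup
    return ent.label_ == "GPE" and ent.text in TOURIST_DESTINATIONS

doc = nlp("You should visit Paris once in your life, but the Eiffel Tower is kinda boring")
for ent in doc.ents:
    print(ent.text, ent.label_, is_tourist_destination(ent))

The statistical model stays responsible for the generic GPE label, and the specific, subjective category is decided by a rule you can change without retraining.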
917 | ] 918 | }, 919 | { 920 | "cell_type": "markdown", 921 | "metadata": {}, 922 | "source": [ 923 | "Rewrite the TRAINING_DATA to only use the label GPE (cities, states, countries) instead of TOURIST_DESTINATION.\n", 924 | "\n", 925 | "Don’t forget to add tuples for the GPE entities that weren’t labeled in the old data." 926 | ] 927 | }, 928 | { 929 | "cell_type": "code", 930 | "execution_count": 33, 931 | "metadata": {}, 932 | "outputs": [], 933 | "source": [ 934 | "TRAINING_DATA = [\n", 935 | " (\n", 936 | " \"i went to amsterdem last year and the canals were beautiful\",\n", 937 | " {\"entities\": [(10, 19, \"GPE\")]},\n", 938 | " ),\n", 939 | " (\n", 940 | " \"You should visit Paris once in your life, but the Eiffel Tower is kinda boring\",\n", 941 | " {\"entities\": [(17, 22, \"GPE\")]},\n", 942 | " ),\n", 943 | " (\n", 944 | " \"There's also a Paris in Arkansas, lol\",\n", 945 | " {\"entities\": [(15, 20, \"GPE\"), (24, 32, \"GPE\")]},\n", 946 | " ),\n", 947 | " (\n", 948 | " \"Berlin is perfect for summer holiday: lots of parks, great nightlife, cheap beer!\",\n", 949 | " {\"entities\": [(0, 6, \"GPE\")]},\n", 950 | " ),\n", 951 | "]" 952 | ] 953 | }, 954 | { 955 | "cell_type": "markdown", 956 | "metadata": {}, 957 | "source": [ 958 | "## Training multiple labels\n", 959 | "\n", 960 | "Here’s a small sample of a dataset created to train a new entity type WEBSITE. The original dataset contains a few thousand sentences. In this exercise, you’ll be doing the labeling by hand. In real life, you probably want to automate this and use an annotation tool – for example, Brat, a popular open-source solution, or Prodigy, our own annotation tool that integrates with spaCy." 961 | ] 962 | }, 963 | { 964 | "cell_type": "markdown", 965 | "metadata": {}, 966 | "source": [ 967 | "Complete the entity offsets for the WEBSITE entities in the data. Feel free to use len() if you don’t want to count the characters." 968 | ] 969 | }, 970 | { 971 | "cell_type": "code", 972 | "execution_count": 34, 973 | "metadata": {}, 974 | "outputs": [], 975 | "source": [ 976 | "TRAINING_DATA = [\n", 977 | " (\n", 978 | " \"Reddit partners with Patreon to help creators build communities\",\n", 979 | " {\"entities\": [(0, 6, \"WEBSITE\"), (21, 28, \"WEBSITE\")]},\n", 980 | " ),\n", 981 | " (\"PewDiePie smashes YouTube record\", {\"entities\": [(18, 25, \"WEBSITE\")]}),\n", 982 | " (\n", 983 | " \"Reddit founder Alexis Ohanian gave away two Metallica tickets to fans\",\n", 984 | " {\"entities\": [(0, 6, \"WEBSITE\")]},\n", 985 | " ),\n", 986 | " # And so on...\n", 987 | "]" 988 | ] 989 | }, 990 | { 991 | "cell_type": "markdown", 992 | "metadata": {}, 993 | "source": [ 994 | "A model was trained with the data you just labelled, plus a few thousand similar examples. After training, it’s doing great on WEBSITE, but doesn’t recognize PERSON anymore.
Why could this be happening?\n", 995 | "\n", 996 | "( ) It's very difficult for the model to learn about different categories like PERSON and WEBSITE.\n", 997 | "\n", 998 | "( X ) The training data included no examples of PERSON, so the model learned that this label is incorrect.\n", 999 | "\n", 1000 | "( ) The hyperparameters need to be retuned so that both entity types can be recognized.\n", 1001 | "\n", 1002 | "**If PERSON entities occur in the training data but aren’t labelled, the model will learn that they shouldn’t be predicted.** \n", 1003 | "\n", 1004 | "Similarly, if an existing entity type isn’t present in the training data, the model may “forget” and stop predicting it." 1005 | ] 1006 | }, 1007 | { 1008 | "cell_type": "markdown", 1009 | "metadata": {}, 1010 | "source": [ 1011 | "Update the training data to include annotations for the PERSON entities “PewDiePie” and “Alexis Ohanian”." 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": 35, 1017 | "metadata": {}, 1018 | "outputs": [], 1019 | "source": [ 1020 | "TRAINING_DATA = [\n", 1021 | " (\n", 1022 | " \"Reddit partners with Patreon to help creators build communities\",\n", 1023 | " {\"entities\": [(0, 6, \"WEBSITE\"), (21, 28, \"WEBSITE\")]},\n", 1024 | " ),\n", 1025 | " (\n", 1026 | " \"PewDiePie smashes YouTube record\",\n", 1027 | " {\"entities\": [(0, 9, \"PERSON\"), (18, 25, \"WEBSITE\")]},\n", 1028 | " ),\n", 1029 | " (\n", 1030 | " \"Reddit founder Alexis Ohanian gave away two Metallica tickets to fans\",\n", 1031 | " {\"entities\": [(0, 6, \"WEBSITE\"), (15, 29, \"PERSON\")]},\n", 1032 | " ),\n", 1033 | " # And so on...\n", 1034 | "]" 1035 | ] 1036 | }, 1037 | { 1038 | "cell_type": "markdown", 1039 | "metadata": {}, 1040 | "source": [ 1041 | "## Wrapping up!\n", 1042 | "\n", 1043 | "Congratulations – you've made it to the end of the course!\n", 1044 | "\n" 1045 | ] 1046 | }, 1047 | { 1048 | "cell_type": "markdown", 1049 | "metadata": {}, 1050 | "source": [ 1051 | "**Your new spaCy skills**\n", 1052 | "\n", 1053 | "* Extract linguistic features: part-of-speech tags, dependencies, named entities\n", 1054 | "* Work with pre-trained statistical models\n", 1055 | "* Find words and phrases using Matcher and PhraseMatcher match rules\n", 1056 | "* Best practices for working with data structures Doc, Token, Span, Vocab, Lexeme\n", 1057 | "* Find semantic similarities using word vectors\n", 1058 | "* Write custom pipeline components with extension attributes\n", 1059 | "* Scale up your spaCy pipelines and make them fast\n", 1060 | "* Create training data for spaCy's statistical models\n", 1061 | "* Train and update spaCy's neural network models with new data\n", 1062 | "\n", 1063 | "\n", 1064 | "Here's an overview of all the new skills you learned so far:\n", 1065 | "\n", 1066 | "In the first chapter, you learned how to extract linguistic features like part-of-speech tags, syntactic dependencies and named entities, and how to work with pre-trained statistical models.\n", 1067 | "\n", 1068 | "You also learned to write powerful match patterns to extract words and phrases using spaCy's matcher and phrase matcher.\n", 1069 | "\n", 1070 | "Chapter 2 was all about information extraction, and you learned how to work with the data structures, the Doc, Token and Span, as well as the vocab and lexical entries.\n", 1071 | "\n", 1072 | "You also used spaCy to predict semantic similarities using word vectors.\n", 1073 | "\n", 1074 | "In chapter 3, you got some more insights into spaCy's pipeline, and learned
to write your own custom pipeline components that modify the Doc.\n", 1075 | "\n", 1076 | "You also created your own custom extension attributes for Docs, Tokens and Spans, and learned about processing streams and making your pipeline faster.\n", 1077 | "\n", 1078 | "Finally, in chapter 4, you learned about training and updating spaCy's statistical models, specifically the entity recognizer.\n", 1079 | "\n", 1080 | "You learned some useful tricks for how to create training data, and how to design your label scheme to get the best results." 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "markdown", 1085 | "metadata": {}, 1086 | "source": [ 1087 | "**More things to do with spaCy**\n", 1088 | "\n", 1089 | "Of course, there's a lot more that spaCy can do that we didn't get to cover in this course.\n", 1090 | "\n", 1091 | "While we focused mostly on training the entity recognizer, you can also train and update the other statistical pipeline components like the part-of-speech tagger and dependency parser.\n", 1092 | "\n", 1093 | "Another useful pipeline component is the text classifier, which can learn to predict labels that apply to the whole text. It's not part of the pre-trained models, but you can add it to an existing model and train it on your own data." 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "markdown", 1098 | "metadata": {}, 1099 | "source": [ 1100 | "Training and updating other pipeline components:\n", 1101 | "\n", 1102 | "* Part-of-speech tagger\n", 1103 | "* Dependency parser\n", 1104 | "* Text classifier\n", 1105 | "\n", 1106 | "Visit:\n", 1107 | "\n", 1108 | "https://spacy.io/usage/training" 1109 | ] 1110 | }, 1111 | { 1112 | "cell_type": "code", 1113 | "execution_count": null, 1114 | "metadata": {}, 1115 | "outputs": [], 1116 | "source": [] 1117 | }, 1118 | { 1119 | "cell_type": "code", 1120 | "execution_count": null, 1121 | "metadata": {}, 1122 | "outputs": [], 1123 | "source": [] 1124 | }, 1125 | { 1126 | "cell_type": "code", 1127 | "execution_count": null, 1128 | "metadata": {}, 1129 | "outputs": [], 1130 | "source": [] 1131 | }, 1132 | { 1133 | "cell_type": "code", 1134 | "execution_count": null, 1135 | "metadata": {}, 1136 | "outputs": [], 1137 | "source": [] 1138 | } 1139 | ], 1140 | "metadata": { 1141 | "kernelspec": { 1142 | "display_name": "Python 3", 1143 | "language": "python", 1144 | "name": "python3" 1145 | }, 1146 | "language_info": { 1147 | "codemirror_mode": { 1148 | "name": "ipython", 1149 | "version": 3 1150 | }, 1151 | "file_extension": ".py", 1152 | "mimetype": "text/x-python", 1153 | "name": "python", 1154 | "nbconvert_exporter": "python", 1155 | "pygments_lexer": "ipython3", 1156 | "version": "3.6.9" 1157 | }, 1158 | "toc": { 1159 | "base_numbering": 1, 1160 | "nav_menu": {}, 1161 | "number_sections": true, 1162 | "sideBar": true, 1163 | "skip_h1_title": false, 1164 | "title_cell": "Table of Contents", 1165 | "title_sidebar": "Contents", 1166 | "toc_cell": false, 1167 | "toc_position": {}, 1168 | "toc_section_display": true, 1169 | "toc_window_display": false 1170 | } 1171 | }, 1172 | "nbformat": 4, 1173 | "nbformat_minor": 2 1174 | } 1175 | -------------------------------------------------------------------------------- /Chapter_03.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Processing Pipelines\n", 8 | "\n", 9 | "https://course.spacy.io/chapter3\n", 10 | "\n", 11 | "This chapter will show you to everything 
you need to know about spaCy's processing pipeline. You'll learn what goes on under the hood when you process a text, how to write your own components and add them to the pipeline, and how to use custom attributes to add your own meta data to the documents, spans and tokens" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## 1. Processing pipelines\n", 19 | "\n", 20 | "In this lesson, you'll learn about the pipeline components provided by spaCy, and what happens behind the scenes when you call nlp on a string of text.\n", 21 | "\n", 22 | "You've already written this plenty of times by now: pass a string of text to the nlp object, and receive a Doc object.\n", 23 | "\n", 24 | "But what does the nlp object actually do?\n", 25 | "\n", 26 | "First, the tokenizer is applied to turn the string of text into a Doc object. Next, a series of pipeline components is applied to the Doc in order. In this case, the tagger, then the parser, then the entity recognizer. Finally, the processed Doc is returned, so you can work with it." 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "spaCy ships with the following built-in pipeline components:\n", 41 | "\n", 42 | "* The part-of-speech tagger sets the token dot tag attribute.\n", 43 | "\n", 44 | "* The dependency parser adds the token dot dep and token dot head attributes and is also responsible for detecting sentences and base noun phrases, also known as noun chunks.\n", 45 | "\n", 46 | "* The named entity recognizer adds the detected entities to the doc dot ents property. It also sets entity type attributes on the tokens that indicate if a token is part of an entity or not.\n", 47 | "\n", 48 | "* Finally, the text classifier sets category labels that apply to the whole text, and adds them to the doc dot cats property. Because text categories are always very specific, the text classifier is not included in any of the pre-trained models by default. But you can use it to train your own system.\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "All models you can load into spaCy include several files and a meta JSON.\n", 63 | "\n", 64 | "The meta defines things like the language and pipeline. This tells spaCy which components to instantiate.\n", 65 | "\n", 66 | "The built-in components that make predictions also need binary data. The data is included in the model package and loaded into the component when you load the model." 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | " \n", 74 | "\n", 75 | "### Pipelines Attributes\n", 76 | "\n", 77 | "To see the names of the pipeline components present in the current nlp object, you can use the nlp dot pipe names attribute.\n", 78 | "\n", 79 | "For a list of component name and component function tuples, you can use the nlp dot pipeline attribute.\n", 80 | "\n", 81 | "The component functions are the functions applied to the Doc to process it and set attributes – for example, part-of-speech tags or named entities." 
82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 9, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "['tagger', 'parser', 'ner']\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "# Import the English language class and create the nlp object\n", 99 | "from spacy.lang.en import English\n", 100 | "\n", 101 | "nlp = spacy.load(\"en_core_web_sm\")\n", 102 | "\n", 103 | "# Process a text\n", 104 | "doc = nlp(\"This is a table.\")\n", 105 | "\n", 106 | "print(nlp.pipe_names)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 10, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "[('tagger', ), ('parser', ), ('ner', )]\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "print(nlp.pipeline)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## 2. What happens when you call NLP?" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "What does spaCy do when you call nlp on a string of text?\n", 138 | "\n", 139 | "Tokenize the text and apply each pipeline component in order." 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "## 3. Inspecting the pipeline" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "Let’s inspect the small English model’s pipeline!\n", 154 | "\n", 155 | "* Load the en_core_web_sm model and create the nlp object.\n", 156 | "\n", 157 | "* Print the names of the pipeline components using nlp.pipe_names.\n", 158 | "\n", 159 | "* Print the full pipeline of (name, component) tuples using nlp.pipeline." 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 8, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "['tagger', 'parser', 'ner']\n", 172 | "[('tagger', ), ('parser', ), ('ner', )]\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "import spacy\n", 178 | "\n", 179 | "# Load the en_core_web_sm model\n", 180 | "nlp = spacy.load(\"en_core_web_sm\")\n", 181 | "\n", 182 | "# Print the names of the pipeline components\n", 183 | "print(nlp.pipe_names)\n", 184 | "\n", 185 | "# Print the full pipeline of (name, component) tuples\n", 186 | "print(nlp.pipeline)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "## 4. Custom Pipeline components" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "Now that you know how spaCy's pipeline works, let's take a look at another very powerful feature: custom pipeline components.\n", 201 | "\n", 202 | "Custom pipeline components let you add your own function to the spaCy pipeline that is executed when you call the nlp object on a text – for example, to modify the Doc and add more data to it." 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "After the text is tokenized and a Doc object has been created, pipeline components are applied in order. 
spaCy supports a range of built-in components, but also lets you define your own.\n", 210 | "\n", 211 | "Custom components are executed automatically when you call the nlp object on a text.\n", 212 | "\n", 213 | "They're especially useful for adding your own custom metadata to documents and tokens.\n", 214 | "\n", 215 | "You can also use them to update built-in attributes, like the named entity spans." 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "\n", 223 | "Fundamentally, a **pipeline component** is a function or callable that takes a doc, modifies it and returns it, so it can be processed by the next component in the pipeline.\n", 224 | "\n", 225 | "Components can be added to the pipeline using the nlp dot add pipe method. The method takes at least one argument: the component function.\n", 226 | "\n", 227 | " " 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "To specify where to add the component in the pipeline, you can use the following keyword arguments:\n", 235 | "\n", 236 | "Setting **last** to True will add the component last in the pipeline. This is the default behavior.\n", 237 | "\n", 238 | "Setting **first** to True will add the component first in the pipeline, right after the tokenizer.\n", 239 | "\n", 240 | "The \"before\" and \"after\" arguments let you define the name of an existing component to add the new component before or after. For example, before equals \"ner\" will add it before the named entity recognizer.\n", 241 | "\n", 242 | "The other component to add the new component before or after needs to exist, though – otherwise, spaCy will raise an error.\n", 243 | "\n", 244 | " " 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "### Example: a simple component\n", 252 | "\n", 253 | "Here's an example of a simple pipeline component.\n", 254 | "\n", 255 | "We start off with the small English model.\n", 256 | "\n", 257 | "We then define the component – a function that takes a Doc object and returns it.\n", 258 | "\n", 259 | "Let's do something simple and print the length of the Doc that passes through the pipeline.\n", 260 | "\n", 261 | "Don't forget to return the Doc so it can be processed by the next component in the pipeline! The Doc created by the tokenizer is passed through all components, so it's important that they all return the modified doc.\n", 262 | "\n", 263 | "We can now add the component to the pipeline. Let's add it to the very beginning right after the tokenizer by setting first equals True.\n", 264 | "\n", 265 | "When we print the pipeline component names, the custom component now shows up at the start. This means it will be applied first when we process a Doc." 
266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 13, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "name": "stdout", 275 | "output_type": "stream", 276 | "text": [ 277 | "Pipeline: ['custom_component', 'tagger', 'parser', 'ner']\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "# Create the nlp object\n", 283 | "nlp = spacy.load('en_core_web_sm')\n", 284 | "\n", 285 | "# Define a custom component\n", 286 | "def custom_component(doc):\n", 287 | " # Print the doc's length\n", 288 | " print('Doc length:', len(doc))\n", 289 | " # Return the doc object\n", 290 | " return doc\n", 291 | "\n", 292 | "# Add the component first in the pipeline\n", 293 | "nlp.add_pipe(custom_component, first=True)\n", 294 | "\n", 295 | "# Print the pipeline component names\n", 296 | "print('Pipeline:', nlp.pipe_names)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "Now when we process a text using the nlp object, the custom component will be applied to the Doc and the length of the document will be printed." 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 14, 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "name": "stdout", 313 | "output_type": "stream", 314 | "text": [ 315 | "Doc length: 3\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "# Create the nlp object\n", 321 | "nlp = spacy.load('en_core_web_sm')\n", 322 | "\n", 323 | "# Define a custom component\n", 324 | "def custom_component(doc):\n", 325 | "\n", 326 | " # Print the doc's length\n", 327 | " print('Doc length:', len(doc))\n", 328 | "\n", 329 | " # Return the doc object\n", 330 | " return doc\n", 331 | "\n", 332 | "# Add the component first in the pipeline\n", 333 | "nlp.add_pipe(custom_component, first=True)\n", 334 | "\n", 335 | "# Process a text\n", 336 | "doc = nlp(\"Hello world!\")" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "## 5. Use Cases for Custom Components" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "Which of these problems can be solved by custom pipeline components? Choose all that apply:\n", 351 | "\n", 352 | "1. Updating the pre-trained models and improving their predictions\n", 353 | "\n", 354 | "2. Computing your own values based on tokens and their attributes\n", 355 | "\n", 356 | "3. Adding named entities, for example based on a dictionary\n", 357 | "\n", 358 | "4. Implementing support for an additional language" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "Answer: 2 and 3" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "## 6. Simple Components" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "The example shows a custom component that prints the character length of a document. Can you complete it?\n", 380 | "\n", 381 | "Complete the component function with the doc’s length.\n", 382 | "Add the length_component to the existing pipeline as the first component.\n", 383 | "Try out the new pipeline and process any text with the nlp object – for example “This is a sentence.”." 
384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 1, 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "name": "stdout", 393 | "output_type": "stream", 394 | "text": [ 395 | "['length_component', 'tagger', 'parser', 'ner']\n", 396 | "This document is 5 tokens long.\n" 397 | ] 398 | } 399 | ], 400 | "source": [ 401 | "import spacy\n", 402 | "\n", 403 | "# Define the custom component\n", 404 | "def length_component(doc):\n", 405 | " # Get the doc's length\n", 406 | " doc_length = len(doc)\n", 407 | " print(\"This document is {} tokens long.\".format(doc_length))\n", 408 | " # Return the doc\n", 409 | " return doc\n", 410 | "\n", 411 | "\n", 412 | "# Load the small English model\n", 413 | "nlp = spacy.load(\"en_core_web_sm\")\n", 414 | "\n", 415 | "# Add the component first in the pipeline and print the pipe names\n", 416 | "nlp.add_pipe(length_component, first=True)\n", 417 | "print(nlp.pipe_names)\n", 418 | "\n", 419 | "# Process a text\n", 420 | "doc = nlp(\"This is a sentence.\")" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "## 7. Complex Components" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "In this exercise, you’ll be writing a custom component that uses the PhraseMatcher to find animal names in the document and adds the matched spans to the doc.ents. A PhraseMatcher with the animal patterns has already been created as the variable matcher.\n", 435 | "\n", 436 | "* Define the custom component and apply the matcher to the doc.\n", 437 | "* Create a Span for each match, assign the label ID for 'ANIMAL' and overwrite the doc.ents with the new spans.\n", 438 | "* Add the new component to the pipeline after the 'ner' component.\n", 439 | "* Process the text and print the entity text and entity label for the entities in doc.ents." 
440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 3, 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "animal_patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]\n", 452 | "['tagger', 'parser', 'ner', 'animal_component']\n", 453 | "[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]\n" 454 | ] 455 | } 456 | ], 457 | "source": [ 458 | "import spacy\n", 459 | "from spacy.matcher import PhraseMatcher\n", 460 | "from spacy.tokens import Span\n", 461 | "\n", 462 | "nlp = spacy.load(\"en_core_web_sm\")\n", 463 | "animals = [\"Golden Retriever\", \"cat\", \"turtle\", \"Rattus norvegicus\"]\n", 464 | "animal_patterns = list(nlp.pipe(animals))\n", 465 | "print(\"animal_patterns:\", animal_patterns)\n", 466 | "matcher = PhraseMatcher(nlp.vocab)\n", 467 | "matcher.add(\"ANIMAL\", None, *animal_patterns)\n", 468 | "\n", 469 | "# Define the custom component\n", 470 | "def animal_component(doc):\n", 471 | " # Apply the matcher to the doc\n", 472 | " matches = matcher(doc)\n", 473 | " # Create a Span for each match and assign the label 'ANIMAL'\n", 474 | " spans = [Span(doc, start, end, label=\"ANIMAL\") for match_id, start, end in matches]\n", 475 | " # Overwrite the doc.ents with the matched spans\n", 476 | " doc.ents = spans\n", 477 | " return doc\n", 478 | "\n", 479 | "\n", 480 | "# Add the component to the pipeline after the 'ner' component\n", 481 | "nlp.add_pipe(animal_component, after=\"ner\")\n", 482 | "print(nlp.pipe_names)\n", 483 | "\n", 484 | "# Process the text and print the text and label for the doc.ents\n", 485 | "doc = nlp(\"I have a cat and a Golden Retriever\")\n", 486 | "print([(ent.text, ent.label_) for ent in doc.ents])" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "## 8. Extension Attributes" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "metadata": {}, 499 | "source": [ 500 | "In this lesson, you'll learn how to add custom attributes to the Doc, Token and Span objects to store custom data." 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "Custom attributes let you add any meta data to Docs, Tokens and Spans. The data can be added once, or it can be computed dynamically.\n", 508 | "\n", 509 | "Custom attributes are available via the dot-underscore property. This makes it clear that they were added by the user, and not built into spaCy, like token dot text.\n", 510 | "\n", 511 | "Attributes need to be registered on the global Doc, Token and Span classes you can import from spacy dot tokens. You've already worked with those in the previous chapters. To register a custom attribute on the Doc, Token or Span, you can use the set extension method.\n", 512 | "\n", 513 | "The first argument is the attribute name. Keyword arguments let you define how the value should be computed. In this case, it has a default value and can be overwritten." 
514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": {}, 519 | "source": [ 520 | "### Setting custom attributes\n", 521 | "\n", 522 | "* Add custom metadata to documents, tokens and spans\n", 523 | "* Accessible via the ._ property" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [ 532 | "doc._.title = 'My document'\n", 533 | "\n", 534 | "token._.is_color = True\n", 535 | "\n", 536 | "span._.has_color = False" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "Registered on the global Doc, Token or Span using the set_extension method" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 6, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "# Import global classes\n", 553 | "from spacy.tokens import Doc, Token, Span\n", 554 | "\n", 555 | "# Set extensions on the Doc, Token and Span\n", 556 | "Doc.set_extension('title', default=None)\n", 557 | "Token.set_extension('is_color', default=False)\n", 558 | "Span.set_extension('has_color', default=False)" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "There are three types of extensions: \n", 566 | " * attribute extensions, \n", 567 | " * property extensions and \n", 568 | " * method extensions." 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "### Attribute extensions\n", 576 | "\n", 577 | "Attribute extensions set a default value that can be overwritten.\n", 578 | "\n", 579 | "For example, a custom \"is color\" attribute on the token that defaults to False.\n", 580 | "\n", 581 | "On individual tokens, its value can be changed by overwriting it – in this case, True for the token \"blue\"." 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 9, 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "from spacy.tokens import Token\n", 591 | "\n", 592 | "# Set extension on the Token with default value\n", 593 | "Token.set_extension('is_color', default=False, force=True)\n", 594 | "\n", 595 | "doc = nlp(\"The sky is blue.\")\n", 596 | "\n", 597 | "# Overwrite extension attribute value\n", 598 | "doc[3]._.is_color = True" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": {}, 604 | "source": [ 605 | "### Property extensions\n", 606 | "\n", 607 | "* Define a getter and an optional setter function\n", 608 | "* Getter only called when you retrieve the attribute value\n", 609 | "\n", 610 | "Property extensions work like properties in Python: they can define a getter function and an optional setter.\n", 611 | "\n", 612 | "The getter function is only called when you retrieve the attribute. This lets you compute the value dynamically, and even take other custom attributes into account.\n", 613 | "\n", 614 | "Getter functions take one argument: the object, in this case, the token. 
In this example, the function returns whether the token text is in our list of colors.\n", 615 | "\n", 616 | "We can then provide the function via the getter keyword argument when we register the extension.\n", 617 | "\n", 618 | "The token \"blue\" now returns True for \"is color\".\n", 619 | "\n" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 11, 625 | "metadata": {}, 626 | "outputs": [ 627 | { 628 | "name": "stdout", 629 | "output_type": "stream", 630 | "text": [ 631 | "True - blue\n" 632 | ] 633 | } 634 | ], 635 | "source": [ 636 | "from spacy.tokens import Token\n", 637 | "\n", 638 | "# Define getter function\n", 639 | "def get_is_color(token):\n", 640 | " colors = ['red', 'yellow', 'blue']\n", 641 | " return token.text in colors\n", 642 | "\n", 643 | "# Set extension on the Token with getter\n", 644 | "Token.set_extension('is_color', getter=get_is_color, force=True)\n", 645 | "\n", 646 | "doc = nlp(\"The sky is blue.\")\n", 647 | "print(doc[3]._.is_color, '-', doc[3].text)" 648 | ] 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "metadata": {}, 653 | "source": [ 654 | "* Span extensions should almost always use a getter\n", 655 | "\n", 656 | "If you want to set extension attributes on a Span, you almost always want to use a property extension with a getter. Otherwise, you'd have to update every possible span ever by hand to set all the values.\n", 657 | "\n", 658 | "In this example, the \"get has color\" function takes the span and returns whether the text of any of the tokens is in the list of colors.\n", 659 | "\n", 660 | "After we've processed the doc, we can check different slices of the doc and the custom \"has color\" property returns whether the span contains a color token or not." 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": 13, 666 | "metadata": {}, 667 | "outputs": [ 668 | { 669 | "name": "stdout", 670 | "output_type": "stream", 671 | "text": [ 672 | "True - sky is blue\n", 673 | "False - The sky\n" 674 | ] 675 | } 676 | ], 677 | "source": [ 678 | "from spacy.tokens import Span\n", 679 | "\n", 680 | "# Define getter function\n", 681 | "def get_has_color(span):\n", 682 | " colors = ['red', 'yellow', 'blue']\n", 683 | " return any(token.text in colors for token in span)\n", 684 | "\n", 685 | "# Set extension on the Span with getter\n", 686 | "Span.set_extension('has_color', getter=get_has_color, force=True)\n", 687 | "\n", 688 | "doc = nlp(\"The sky is blue.\")\n", 689 | "print(doc[1:4]._.has_color, '-', doc[1:4].text)\n", 690 | "print(doc[0:2]._.has_color, '-', doc[0:2].text)" 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": {}, 696 | "source": [ 697 | "### Method extensions\n", 698 | "\n", 699 | "* Assign a function that becomes available as an object method\n", 700 | "* Lets you pass arguments to the extension function\n", 701 | "\n", 702 | "Method extensions make the extension attribute a callable method.\n", 703 | "\n", 704 | "You can then pass one or more arguments to it, and compute attribute values dynamically – for example, based on a certain argument or setting.\n", 705 | "\n", 706 | "In this example, the method function checks whether the doc contains a token with a given text. The first argument of the method is always the object itself – in this case, the Doc. It's passed in automatically when the method is called. All other function arguments will be arguments on the method extension. 
In this case, \"token text\".\n", 707 | "\n", 708 | "Here, the custom \"has token\" method returns True for the word \"blue\" and False for the word \"cloud\"." 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": 15, 714 | "metadata": {}, 715 | "outputs": [ 716 | { 717 | "name": "stdout", 718 | "output_type": "stream", 719 | "text": [ 720 | "True - blue\n", 721 | "False - cloud\n" 722 | ] 723 | } 724 | ], 725 | "source": [ 726 | "from spacy.tokens import Doc\n", 727 | "\n", 728 | "# Define method with arguments\n", 729 | "def has_token(doc, token_text):\n", 730 | " in_doc = token_text in [token.text for token in doc]\n", 731 | " return in_doc\n", 732 | "\n", 733 | "# Set extension on the Doc with method\n", 734 | "Doc.set_extension('has_token', method=has_token)\n", 735 | "\n", 736 | "doc = nlp(\"The sky is blue.\")\n", 737 | "print(doc._.has_token('blue'), '- blue')\n", 738 | "print(doc._.has_token('cloud'), '- cloud')" 739 | ] 740 | }, 741 | { 742 | "cell_type": "markdown", 743 | "metadata": {}, 744 | "source": [ 745 | "## 9. Setting extension attributes\n", 746 | "\n", 747 | "Let’s practice setting some extension attributes.\n", 748 | "\n", 749 | "Step 1\n", 750 | "\n", 751 | "* Use Token.set_extension to register is_country (default False).\n", 752 | "* Update it for \"Spain\" and print it for all tokens." 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 16, 758 | "metadata": {}, 759 | "outputs": [ 760 | { 761 | "name": "stdout", 762 | "output_type": "stream", 763 | "text": [ 764 | "[('I', False), ('live', False), ('in', False), ('Spain', True), ('.', False)]\n" 765 | ] 766 | } 767 | ], 768 | "source": [ 769 | "from spacy.lang.en import English\n", 770 | "from spacy.tokens import Token\n", 771 | "\n", 772 | "nlp = English()\n", 773 | "\n", 774 | "# Register the Token extension attribute 'is_country' with the default value False\n", 775 | "Token.set_extension(\"is_country\", default=False)\n", 776 | "\n", 777 | "# Process the text and set the is_country attribute to True for the token \"Spain\"\n", 778 | "doc = nlp(\"I live in Spain.\")\n", 779 | "doc[3]._.is_country = True\n", 780 | "\n", 781 | "# Print the token text and the is_country attribute for all tokens\n", 782 | "print([(token.text, token._.is_country) for token in doc])" 783 | ] 784 | }, 785 | { 786 | "cell_type": "markdown", 787 | "metadata": {}, 788 | "source": [ 789 | "Step 2\n", 790 | "* Use Token.set_extension to register 'reversed' (getter function get_reversed).\n", 791 | "* Print its value for each token." 
792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": 17, 797 | "metadata": {}, 798 | "outputs": [ 799 | { 800 | "name": "stdout", 801 | "output_type": "stream", 802 | "text": [ 803 | "reversed: llA\n", 804 | "reversed: snoitazilareneg\n", 805 | "reversed: era\n", 806 | "reversed: eslaf\n", 807 | "reversed: ,\n", 808 | "reversed: gnidulcni\n", 809 | "reversed: siht\n", 810 | "reversed: eno\n", 811 | "reversed: .\n" 812 | ] 813 | } 814 | ], 815 | "source": [ 816 | "from spacy.lang.en import English\n", 817 | "from spacy.tokens import Token\n", 818 | "\n", 819 | "nlp = English()\n", 820 | "\n", 821 | "# Define the getter function that takes a token and returns its reversed text\n", 822 | "def get_reversed(token):\n", 823 | " return token.text[::-1]\n", 824 | "\n", 825 | "\n", 826 | "# Register the Token property extension 'reversed' with the getter get_reversed\n", 827 | "Token.set_extension(\"reversed\", getter=get_reversed)\n", 828 | "\n", 829 | "# Process the text and print the reversed attribute for each token\n", 830 | "doc = nlp(\"All generalizations are false, including this one.\")\n", 831 | "for token in doc:\n", 832 | " print(\"reversed:\", token._.reversed)" 833 | ] 834 | }, 835 | { 836 | "cell_type": "markdown", 837 | "metadata": {}, 838 | "source": [ 839 | "## 10. Setting extension attributes\n", 840 | "\n", 841 | "Let’s try setting some more complex attributes using getters and method extensions.\n", 842 | "\n", 843 | "Part 1\n", 844 | "* Complete the has_number function .\n", 845 | "* Use Doc.set_extension to register has_number (getter get_has_number) and print its value.\n" 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "execution_count": null, 851 | "metadata": {}, 852 | "outputs": [], 853 | "source": [ 854 | "Let’s try setting some more complex attributes using getters and method extensions.\n", 855 | "\n", 856 | "Part 1\n", 857 | "Complete the has_number function .\n", 858 | "Use Doc.set_extension to register has_number (getter get_has_number) and print its value." 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": 18, 864 | "metadata": {}, 865 | "outputs": [ 866 | { 867 | "name": "stdout", 868 | "output_type": "stream", 869 | "text": [ 870 | "has_number: True\n" 871 | ] 872 | } 873 | ], 874 | "source": [ 875 | "from spacy.lang.en import English\n", 876 | "from spacy.tokens import Doc\n", 877 | "\n", 878 | "nlp = English()\n", 879 | "\n", 880 | "# Define the getter function\n", 881 | "def get_has_number(doc):\n", 882 | " # Return if any of the tokens in the doc return True for token.like_num\n", 883 | " return any(token.like_num for token in doc)\n", 884 | "\n", 885 | "# Register the Doc property extension 'has_number' with the getter get_has_number\n", 886 | "Doc.set_extension(\"has_number\", getter=get_has_number)\n", 887 | "\n", 888 | "# Process the text and check the custom has_number attribute\n", 889 | "doc = nlp(\"The museum closed for five years in 2012.\")\n", 890 | "print(\"has_number:\", doc._.has_number)" 891 | ] 892 | }, 893 | { 894 | "cell_type": "markdown", 895 | "metadata": {}, 896 | "source": [ 897 | "Part 2\n", 898 | "* Use Span.set_extension to register 'to_html' (method to_html).\n", 899 | "* Call it on doc[0:2] with the tag 'strong'." 
900 | ] 901 | }, 902 | { 903 | "cell_type": "code", 904 | "execution_count": 19, 905 | "metadata": {}, 906 | "outputs": [ 907 | { 908 | "name": "stdout", 909 | "output_type": "stream", 910 | "text": [ 911 | "Hello world\n" 912 | ] 913 | } 914 | ], 915 | "source": [ 916 | "from spacy.lang.en import English\n", 917 | "from spacy.tokens import Span\n", 918 | "\n", 919 | "nlp = English()\n", 920 | "\n", 921 | "# Define the method\n", 922 | "def to_html(span, tag):\n", 923 | " # Wrap the span text in a HTML tag and return it\n", 924 | " return \"<{tag}>{text}\".format(tag=tag, text=span.text)\n", 925 | "\n", 926 | "\n", 927 | "# Register the Span property extension 'to_html' with the method to_html\n", 928 | "Span.set_extension(\"to_html\", method=to_html)\n", 929 | "\n", 930 | "# Process the text and call the to_html method on the span with the tag name 'strong'\n", 931 | "doc = nlp(\"Hello world, this is a sentence.\")\n", 932 | "span = doc[0:2]\n", 933 | "print(span._.to_html(\"strong\"))" 934 | ] 935 | }, 936 | { 937 | "cell_type": "markdown", 938 | "metadata": {}, 939 | "source": [ 940 | "## 11. Entities and extensions\n", 941 | "\n", 942 | "In this exercise, you’ll combine custom extension attributes with the model’s predictions and create an attribute getter that returns a Wikipedia search URL if the span is a person, organization, or location.\n", 943 | "\n", 944 | "* Complete the get_wikipedia_url getter so it only returns the URL if the span’s label is in the list of labels.\n", 945 | "* Set the Span extension 'wikipedia_url' using the getter get_wikipedia_url.\n", 946 | "* Iterate over the entities in the doc and output their Wikipedia URL.\n" 947 | ] 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": 21, 952 | "metadata": {}, 953 | "outputs": [ 954 | { 955 | "name": "stdout", 956 | "output_type": "stream", 957 | "text": [ 958 | "over fifty years None\n", 959 | "first None\n", 960 | "David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie\n" 961 | ] 962 | } 963 | ], 964 | "source": [ 965 | "import spacy\n", 966 | "from spacy.tokens import Span\n", 967 | "\n", 968 | "nlp = spacy.load(\"en_core_web_sm\")\n", 969 | "\n", 970 | "\n", 971 | "def get_wikipedia_url(span):\n", 972 | " # Get a Wikipedia URL if the span has one of the labels\n", 973 | " if span.label_ in (\"PERSON\", \"ORG\", \"GPE\", \"LOCATION\"):\n", 974 | " entity_text = span.text.replace(\" \", \"_\")\n", 975 | " return \"https://en.wikipedia.org/w/index.php?search=\" + entity_text\n", 976 | "\n", 977 | "\n", 978 | "# Set the Span extension wikipedia_url using get getter get_wikipedia_url\n", 979 | "Span.set_extension(\"wikipedia_url\", getter=get_wikipedia_url)\n", 980 | "\n", 981 | "doc = nlp(\n", 982 | " \"In over fifty years from his very first recordings right through to his \"\n", 983 | " \"last album, David Bowie was at the vanguard of contemporary culture.\"\n", 984 | ")\n", 985 | "for ent in doc.ents:\n", 986 | " # Print the text and Wikipedia URL of the entity\n", 987 | " print(ent.text, ent._.wikipedia_url)" 988 | ] 989 | }, 990 | { 991 | "cell_type": "markdown", 992 | "metadata": {}, 993 | "source": [ 994 | "## 12. Components with extensions\n", 995 | "\n", 996 | "Extension attributes are especially powerful if they’re combined with custom pipeline components. 
In this exercise, you’ll write a pipeline component that finds country names and a custom extension attribute that returns a country’s capital, if available.\n", 997 | "\n", 998 | "A phrase matcher with all countries is available as the variable matcher. A dictionary of countries mapped to their capital cities is available as the variable CAPITALS.\n", 999 | "\n", 1000 | "* Complete the countries_component and create a Span with the label 'GPE' (geopolitical entity) for all matches.\n", 1001 | "* Add the component to the pipeline.\n", 1002 | "* Register the Span extension attribute 'capital' with the getter get_capital.\n", 1003 | "* Process the text and print the entity text, entity label and entity capital for each entity span in doc.ents." 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "code", 1008 | "execution_count": 22, 1009 | "metadata": {}, 1010 | "outputs": [ 1011 | { 1012 | "name": "stdout", 1013 | "output_type": "stream", 1014 | "text": [ 1015 | "['countries_component']\n", 1016 | "[('Czech Republic', 'GPE', 'Prague'), ('Slovakia', 'GPE', 'Bratislava')]\n" 1017 | ] 1018 | } 1019 | ], 1020 | "source": [ 1021 | "import json\n", 1022 | "from spacy.lang.en import English\n", 1023 | "from spacy.tokens import Span\n", 1024 | "from spacy.matcher import PhraseMatcher\n", 1025 | "\n", 1026 | "with open(\"exercises/countries.json\") as f:\n", 1027 | " COUNTRIES = json.loads(f.read())\n", 1028 | "\n", 1029 | "with open(\"exercises/capitals.json\") as f:\n", 1030 | " CAPITALS = json.loads(f.read())\n", 1031 | "\n", 1032 | "nlp = English()\n", 1033 | "matcher = PhraseMatcher(nlp.vocab)\n", 1034 | "matcher.add(\"COUNTRY\", None, *list(nlp.pipe(COUNTRIES)))\n", 1035 | "\n", 1036 | "\n", 1037 | "def countries_component(doc):\n", 1038 | " # Create an entity Span with the label 'GPE' for all matches\n", 1039 | " matches = matcher(doc)\n", 1040 | " doc.ents = [Span(doc, start, end, label=\"GPE\") for match_id, start, end in matches]\n", 1041 | " return doc\n", 1042 | "\n", 1043 | "\n", 1044 | "# Add the component to the pipeline\n", 1045 | "nlp.add_pipe(countries_component)\n", 1046 | "print(nlp.pipe_names)\n", 1047 | "\n", 1048 | "# Getter that looks up the span text in the dictionary of country capitals\n", 1049 | "get_capital = lambda span: CAPITALS.get(span.text)\n", 1050 | "\n", 1051 | "# Register the Span extension attribute 'capital' with the getter get_capital\n", 1052 | "Span.set_extension(\"capital\", getter=get_capital)\n", 1053 | "\n", 1054 | "# Process the text and print the entity text, label and capital attributes\n", 1055 | "doc = nlp(\"Czech Republic may help Slovakia protect its airspace\")\n", 1056 | "print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])" 1057 | ] 1058 | }, 1059 | { 1060 | "cell_type": "markdown", 1061 | "metadata": {}, 1062 | "source": [ 1063 | "## 13. 
Scaling and performance\n", 1064 | "\n", 1065 | "In this lesson, I'll show you a few tips and tricks to make your spaCy pipelines run as fast as possible, and process large volumes of text efficiently.\n", 1066 | "\n", 1067 | "### Processing large volumes of text\n", 1068 | "\n", 1069 | "* Use nlp.pipe method\n", 1070 | "* Processes texts as a stream, yields Doc objects\n", 1071 | "* Much faster than calling nlp on each text\n", 1072 | "\n", 1073 | "BAD:\n", 1074 | "\n", 1075 | "docs = [nlp(text) for text in LOTS_OF_TEXTS]\n", 1076 | "\n", 1077 | "GOOD:\n", 1078 | "\n", 1079 | "docs = list(nlp.pipe(LOTS_OF_TEXTS))\n", 1080 | "\n", 1081 | "If you need to process a lot of texts and create a lot of Doc objects in a row, the nlp dot pipe method can speed this up significantly.\n", 1082 | "\n", 1083 | "It processes the texts as a stream and yields Doc objects.\n", 1084 | "\n", 1085 | "It is much faster than just calling nlp on each text, because it batches up the texts.\n", 1086 | "\n", 1087 | "nlp dot pipe is a generator that yields Doc objects, so in order to get a list of Docs, remember to call the list method around it.\n" 1088 | ] 1089 | }, 1090 | { 1091 | "cell_type": "markdown", 1092 | "metadata": {}, 1093 | "source": [ 1094 | "### Passing in context\n", 1095 | "\n", 1096 | "* Setting as_tuples=True on nlp.pipe lets you pass in (text, context) tuples\n", 1097 | "* Yields (doc, context) tuples\n", 1098 | "* Useful for associating metadata with the doc\n", 1099 | "\n", 1100 | "nlp dot pipe also supports passing in tuples of text / context if you set \"as tuples\" to True.\n", 1101 | "\n", 1102 | "The method will then yield doc / context tuples.\n", 1103 | "\n", 1104 | "This is useful for passing in additional metadata, like an ID associated with the text, or a page number." 1105 | ] 1106 | }, 1107 | { 1108 | "cell_type": "code", 1109 | "execution_count": 24, 1110 | "metadata": {}, 1111 | "outputs": [ 1112 | { 1113 | "name": "stdout", 1114 | "output_type": "stream", 1115 | "text": [ 1116 | "This is a text 15\n", 1117 | "And another text 16\n" 1118 | ] 1119 | } 1120 | ], 1121 | "source": [ 1122 | "data = [\n", 1123 | " ('This is a text', {'id': 1, 'page_number': 15}),\n", 1124 | " ('And another text', {'id': 2, 'page_number': 16}),\n", 1125 | "]\n", 1126 | "\n", 1127 | "for doc, context in nlp.pipe(data, as_tuples=True):\n", 1128 | " print(doc.text, context['page_number'])" 1129 | ] 1130 | }, 1131 | { 1132 | "cell_type": "markdown", 1133 | "metadata": {}, 1134 | "source": [ 1135 | "You can even add the context meta data to custom attributes.\n", 1136 | "\n", 1137 | "In this example, we're registering two extensions, \"id\" and \"page number\", which default to None.\n", 1138 | "\n", 1139 | "After processing the text and passing through the context, we can overwrite the doc extensions with our context metadata." 
1140 | ] 1141 | }, 1142 | { 1143 | "cell_type": "code", 1144 | "execution_count": 25, 1145 | "metadata": {}, 1146 | "outputs": [], 1147 | "source": [ 1148 | "from spacy.tokens import Doc\n", 1149 | "\n", 1150 | "Doc.set_extension('id', default=None)\n", 1151 | "Doc.set_extension('page_number', default=None)\n", 1152 | "\n", 1153 | "data = [\n", 1154 | " ('This is a text', {'id': 1, 'page_number': 15}),\n", 1155 | " ('And another text', {'id': 2, 'page_number': 16}),\n", 1156 | "]\n", 1157 | "\n", 1158 | "for doc, context in nlp.pipe(data, as_tuples=True):\n", 1159 | " doc._.id = context['id']\n", 1160 | " doc._.page_number = context['page_number']" 1161 | ] 1162 | }, 1163 | { 1164 | "cell_type": "markdown", 1165 | "metadata": {}, 1166 | "source": [ 1167 | "### Using only the tokenizer\n", 1168 | "\n", 1169 | "Another common scenario: Sometimes you already have a model loaded to do other processing, but you only need the tokenizer for one particular text.\n", 1170 | "\n", 1171 | "Running the whole pipeline is unnecessarily slow, because you'll be getting a bunch of predictions from the model that you don't need." 1172 | ] 1173 | }, 1174 | { 1175 | "cell_type": "markdown", 1176 | "metadata": {}, 1177 | "source": [ 1178 | " " 1179 | ] 1180 | }, 1181 | { 1182 | "cell_type": "markdown", 1183 | "metadata": {}, 1184 | "source": [ 1185 | "If you only need a tokenized Doc object, you can use the nlp dot make doc method instead, which takes a text and returns a Doc.\n", 1186 | "\n", 1187 | "This is also how spaCy does it behind the scenes: nlp dot make doc turns the text into a Doc before the pipeline components are called.\n", 1188 | "\n", 1189 | "Use nlp.make_doc to turn a text in to a Doc object\n", 1190 | "\n", 1191 | "BAD:\n", 1192 | "\n", 1193 | "doc = nlp(\"Hello world\")\n", 1194 | "\n", 1195 | "GOOD:\n", 1196 | "\n", 1197 | "doc = nlp.make_doc(\"Hello world!\")" 1198 | ] 1199 | }, 1200 | { 1201 | "cell_type": "markdown", 1202 | "metadata": {}, 1203 | "source": [ 1204 | "### Disabling pipeline components\n", 1205 | "\n", 1206 | "spaCy also allows you to temporarily disable pipeline components using the nlp dot disable pipes context manager.\n", 1207 | "\n", 1208 | "It takes a variable number of arguments, the string names of the pipeline components to disable. For example, if you only want to use the entity recognizer to process a document, you can temporarily disable the tagger and parser.\n", 1209 | "\n", 1210 | "After the with block, the disabled pipeline components are automatically restored.\n", 1211 | "\n", 1212 | "In the with block, spaCy will only run the remaining components.\n", 1213 | "\n", 1214 | "* Use nlp.disable_pipes to temporarily disable one or more pipes\n", 1215 | "* Restores them after the with block\n", 1216 | "* Only runs the remaining components" 1217 | ] 1218 | }, 1219 | { 1220 | "cell_type": "code", 1221 | "execution_count": 44, 1222 | "metadata": {}, 1223 | "outputs": [ 1224 | { 1225 | "name": "stdout", 1226 | "output_type": "stream", 1227 | "text": [ 1228 | "(American, College Park, Georgia)\n" 1229 | ] 1230 | } 1231 | ], 1232 | "source": [ 1233 | "# Disable tagger and parser\n", 1234 | "with nlp.disable_pipes('tagger', 'parser'):\n", 1235 | " # Process the text and print the entities\n", 1236 | " doc = nlp(text)\n", 1237 | " print(doc.ents)" 1238 | ] 1239 | }, 1240 | { 1241 | "cell_type": "markdown", 1242 | "metadata": {}, 1243 | "source": [ 1244 | "## 14. 
Processing streams\n", 1245 | "\n", 1246 | "In this exercise, you’ll be using nlp.pipe for more efficient text processing. The nlp object has already been created for you. A list of tweets about a popular American fast food chain are available as the variable TEXTS.\n", 1247 | "\n", 1248 | "Part 1\n", 1249 | "\n", 1250 | "* Rewrite the example to use nlp.pipe. Instead of iterating over the texts and processing them, iterate over the doc objects yielded by nlp.pipe." 1251 | ] 1252 | }, 1253 | { 1254 | "cell_type": "code", 1255 | "execution_count": 30, 1256 | "metadata": {}, 1257 | "outputs": [ 1258 | { 1259 | "name": "stdout", 1260 | "output_type": "stream", 1261 | "text": [ 1262 | "['favorite']\n", 1263 | "['sick']\n", 1264 | "[]\n", 1265 | "['happy']\n", 1266 | "['delicious', 'fast']\n", 1267 | "[]\n", 1268 | "['terrible', 'gettin', 'payin']\n" 1269 | ] 1270 | } 1271 | ], 1272 | "source": [ 1273 | "import json\n", 1274 | "import spacy\n", 1275 | "\n", 1276 | "nlp = spacy.load(\"en_core_web_sm\")\n", 1277 | "\n", 1278 | "with open(\"exercises/tweets.json\") as f:\n", 1279 | " TEXTS = json.loads(f.read())\n", 1280 | "\n", 1281 | "# Process the texts and print the adjectives\n", 1282 | "for text in TEXTS:\n", 1283 | " doc = nlp(text)\n", 1284 | " print([token.text for token in doc if token.pos_ == \"ADJ\"])" 1285 | ] 1286 | }, 1287 | { 1288 | "cell_type": "code", 1289 | "execution_count": 35, 1290 | "metadata": {}, 1291 | "outputs": [ 1292 | { 1293 | "name": "stdout", 1294 | "output_type": "stream", 1295 | "text": [ 1296 | "['favorite']\n", 1297 | "['sick']\n", 1298 | "[]\n", 1299 | "['happy']\n", 1300 | "['delicious', 'fast']\n", 1301 | "[]\n", 1302 | "['terrible', 'gettin', 'payin']\n" 1303 | ] 1304 | } 1305 | ], 1306 | "source": [ 1307 | "import json\n", 1308 | "import spacy\n", 1309 | "\n", 1310 | "nlp = spacy.load(\"en_core_web_sm\")\n", 1311 | "\n", 1312 | "with open(\"exercises/tweets.json\") as f:\n", 1313 | " TEXTS = json.loads(f.read())\n", 1314 | "\n", 1315 | "# Process the texts and print the adjectives\n", 1316 | "for doc in nlp.pipe(TEXTS):\n", 1317 | " print([token.text for token in doc if token.pos_ == \"ADJ\"])" 1318 | ] 1319 | }, 1320 | { 1321 | "cell_type": "markdown", 1322 | "metadata": {}, 1323 | "source": [ 1324 | "Rewrite the example to use nlp.pipe. Don’t forget to call list() around the result to turn it into a list." 
1325 | ] 1326 | }, 1327 | { 1328 | "cell_type": "code", 1329 | "execution_count": 31, 1330 | "metadata": {}, 1331 | "outputs": [ 1332 | { 1333 | "name": "stdout", 1334 | "output_type": "stream", 1335 | "text": [ 1336 | "(McDonalds,) (@McDonalds,) (McDonalds,) (McDonalds, Spain) (The Arch Deluxe,) (WANT, McRib) (This morning,)\n" 1337 | ] 1338 | } 1339 | ], 1340 | "source": [ 1341 | "import json\n", 1342 | "import spacy\n", 1343 | "\n", 1344 | "nlp = spacy.load(\"en_core_web_sm\")\n", 1345 | "\n", 1346 | "with open(\"exercises/tweets.json\") as f:\n", 1347 | " TEXTS = json.loads(f.read())\n", 1348 | "\n", 1349 | "# Process the texts and print the entities\n", 1350 | "docs = [nlp(text) for text in TEXTS]\n", 1351 | "entities = [doc.ents for doc in docs]\n", 1352 | "print(*entities)" 1353 | ] 1354 | }, 1355 | { 1356 | "cell_type": "code", 1357 | "execution_count": 34, 1358 | "metadata": {}, 1359 | "outputs": [ 1360 | { 1361 | "name": "stdout", 1362 | "output_type": "stream", 1363 | "text": [ 1364 | "(McDonalds,) (@McDonalds,) (McDonalds,) (McDonalds, Spain) (The Arch Deluxe,) (WANT, McRib) (This morning,)\n" 1365 | ] 1366 | } 1367 | ], 1368 | "source": [ 1369 | "import json\n", 1370 | "import spacy\n", 1371 | "\n", 1372 | "nlp = spacy.load(\"en_core_web_sm\")\n", 1373 | "\n", 1374 | "with open(\"exercises/tweets.json\") as f:\n", 1375 | " TEXTS = json.loads(f.read())\n", 1376 | "\n", 1377 | "# Process the texts and print the entities\n", 1378 | "docs = list(nlp.pipe(TEXTS))\n", 1379 | "entities = [doc.ents for doc in docs]\n", 1380 | "print(*entities)" 1381 | ] 1382 | }, 1383 | { 1384 | "cell_type": "markdown", 1385 | "metadata": {}, 1386 | "source": [ 1387 | "Rewrite the example to use nlp.pipe. Don’t forget to call list() around the result to turn it into a list." 1388 | ] 1389 | }, 1390 | { 1391 | "cell_type": "code", 1392 | "execution_count": 37, 1393 | "metadata": {}, 1394 | "outputs": [ 1395 | { 1396 | "data": { 1397 | "text/plain": [ 1398 | "[David Bowie, Angela Merkel, Lady Gaga]" 1399 | ] 1400 | }, 1401 | "execution_count": 37, 1402 | "metadata": {}, 1403 | "output_type": "execute_result" 1404 | } 1405 | ], 1406 | "source": [ 1407 | "from spacy.lang.en import English\n", 1408 | "\n", 1409 | "nlp = English()\n", 1410 | "\n", 1411 | "people = [\"David Bowie\", \"Angela Merkel\", \"Lady Gaga\"]\n", 1412 | "\n", 1413 | "# Create a list of patterns for the PhraseMatcher\n", 1414 | "patterns = [nlp(person) for person in people]\n", 1415 | "patterns" 1416 | ] 1417 | }, 1418 | { 1419 | "cell_type": "code", 1420 | "execution_count": 39, 1421 | "metadata": {}, 1422 | "outputs": [ 1423 | { 1424 | "data": { 1425 | "text/plain": [ 1426 | "[David Bowie, Angela Merkel, Lady Gaga]" 1427 | ] 1428 | }, 1429 | "execution_count": 39, 1430 | "metadata": {}, 1431 | "output_type": "execute_result" 1432 | } 1433 | ], 1434 | "source": [ 1435 | "from spacy.lang.en import English\n", 1436 | "\n", 1437 | "nlp = English()\n", 1438 | "\n", 1439 | "people = [\"David Bowie\", \"Angela Merkel\", \"Lady Gaga\"]\n", 1440 | "\n", 1441 | "# Create a list of patterns for the PhraseMatcher\n", 1442 | "patterns = list(nlp.pipe(people))\n", 1443 | "patterns" 1444 | ] 1445 | }, 1446 | { 1447 | "cell_type": "markdown", 1448 | "metadata": {}, 1449 | "source": [ 1450 | "## 15 Processing data with context\n", 1451 | "\n", 1452 | "In this exercise, you’ll be using custom attributes to add author and book meta information to quotes.\n", 1453 | "\n", 1454 | "A list of [text, context] examples is available as the variable DATA. 
The texts are quotes from famous books, and the contexts dictionaries with the keys 'author' and 'book'.\n", 1455 | "\n", 1456 | "Use the set_extension method to register the custom attributes 'author' and 'book' on the Doc, which default to None.\n", 1457 | "Process the [text, context] pairs in DATA using nlp.pipe with as_tuples=True.\n", 1458 | "Overwrite the doc._.book and doc._.author with the respective info passed in as the context." 1459 | ] 1460 | }, 1461 | { 1462 | "cell_type": "code", 1463 | "execution_count": 40, 1464 | "metadata": {}, 1465 | "outputs": [ 1466 | { 1467 | "name": "stdout", 1468 | "output_type": "stream", 1469 | "text": [ 1470 | "One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. \n", 1471 | " — 'Metamorphosis' by Franz Kafka \n", 1472 | "\n", 1473 | "I know not all that may be coming, but be it what it will, I'll go to it laughing. \n", 1474 | " — 'Moby-Dick or, The Whale' by Herman Melville \n", 1475 | "\n", 1476 | "It was the best of times, it was the worst of times. \n", 1477 | " — 'A Tale of Two Cities' by Charles Dickens \n", 1478 | "\n", 1479 | "The only people for me are the mad ones, the ones who are mad to live, mad to talk, mad to be saved, desirous of everything at the same time, the ones who never yawn or say a commonplace thing, but burn, burn, burn like fabulous yellow roman candles exploding like spiders across the stars. \n", 1480 | " — 'On the Road' by Jack Kerouac \n", 1481 | "\n", 1482 | "It was a bright cold day in April, and the clocks were striking thirteen. \n", 1483 | " — '1984' by George Orwell \n", 1484 | "\n", 1485 | "Nowadays people know the price of everything and the value of nothing. \n", 1486 | " — 'The Picture Of Dorian Gray' by Oscar Wilde \n", 1487 | "\n" 1488 | ] 1489 | } 1490 | ], 1491 | "source": [ 1492 | "import json\n", 1493 | "from spacy.lang.en import English\n", 1494 | "from spacy.tokens import Doc\n", 1495 | "\n", 1496 | "with open(\"exercises/bookquotes.json\") as f:\n", 1497 | " DATA = json.loads(f.read())\n", 1498 | "\n", 1499 | "nlp = English()\n", 1500 | "\n", 1501 | "# Register the Doc extension 'author' (default None)\n", 1502 | "Doc.set_extension(\"author\", default=None)\n", 1503 | "\n", 1504 | "# Register the Doc extension 'book' (default None)\n", 1505 | "Doc.set_extension(\"book\", default=None)\n", 1506 | "\n", 1507 | "for doc, context in nlp.pipe(DATA, as_tuples=True):\n", 1508 | " # Set the doc._.book and doc._.author attributes from the context\n", 1509 | " doc._.book = context[\"book\"]\n", 1510 | " doc._.author = context[\"author\"]\n", 1511 | "\n", 1512 | " # Print the text and custom attribute data\n", 1513 | " print(doc.text, \"\\n\", \"— '{}' by {}\".format(doc._.book, doc._.author), \"\\n\")" 1514 | ] 1515 | }, 1516 | { 1517 | "cell_type": "markdown", 1518 | "metadata": {}, 1519 | "source": [ 1520 | "## 16 Selective processing\n", 1521 | "\n", 1522 | "In this exercise, you’ll use the nlp.make_doc and nlp.disable_pipes methods to only run selected components when processing a text.\n", 1523 | "\n", 1524 | "Part 1\n", 1525 | "\n", 1526 | "Rewrite the code to only tokenize the text using nlp.make_doc." 
1527 | ] 1528 | }, 1529 | { 1530 | "cell_type": "code", 1531 | "execution_count": 41, 1532 | "metadata": {}, 1533 | "outputs": [ 1534 | { 1535 | "name": "stdout", 1536 | "output_type": "stream", 1537 | "text": [ 1538 | "['Chick', '-', 'fil', '-', 'A', 'is', 'an', 'American', 'fast', 'food', 'restaurant', 'chain', 'headquartered', 'in', 'the', 'city', 'of', 'College', 'Park', ',', 'Georgia', ',', 'specializing', 'in', 'chicken', 'sandwiches', '.']\n" 1539 | ] 1540 | } 1541 | ], 1542 | "source": [ 1543 | "import spacy\n", 1544 | "\n", 1545 | "nlp = spacy.load(\"en_core_web_sm\")\n", 1546 | "text = (\n", 1547 | " \"Chick-fil-A is an American fast food restaurant chain headquartered in \"\n", 1548 | " \"the city of College Park, Georgia, specializing in chicken sandwiches.\"\n", 1549 | ")\n", 1550 | "\n", 1551 | "# Only tokenize the text\n", 1552 | "doc = nlp(text)\n", 1553 | "print([token.text for token in doc])" 1554 | ] 1555 | }, 1556 | { 1557 | "cell_type": "code", 1558 | "execution_count": 42, 1559 | "metadata": {}, 1560 | "outputs": [ 1561 | { 1562 | "name": "stdout", 1563 | "output_type": "stream", 1564 | "text": [ 1565 | "['Chick', '-', 'fil', '-', 'A', 'is', 'an', 'American', 'fast', 'food', 'restaurant', 'chain', 'headquartered', 'in', 'the', 'city', 'of', 'College', 'Park', ',', 'Georgia', ',', 'specializing', 'in', 'chicken', 'sandwiches', '.']\n" 1566 | ] 1567 | } 1568 | ], 1569 | "source": [ 1570 | "import spacy\n", 1571 | "\n", 1572 | "nlp = spacy.load(\"en_core_web_sm\")\n", 1573 | "text = (\n", 1574 | " \"Chick-fil-A is an American fast food restaurant chain headquartered in \"\n", 1575 | " \"the city of College Park, Georgia, specializing in chicken sandwiches.\"\n", 1576 | ")\n", 1577 | "\n", 1578 | "# Only tokenize the text\n", 1579 | "doc = nlp.make_doc(text)\n", 1580 | "print([token.text for token in doc])" 1581 | ] 1582 | }, 1583 | { 1584 | "cell_type": "markdown", 1585 | "metadata": {}, 1586 | "source": [ 1587 | "Part 2\n", 1588 | "\n", 1589 | "Disable the tagger and parser using the nlp.disable_pipes method.\n", 1590 | "\n", 1591 | "Process the text and print all entities in the doc." 
1592 | ] 1593 | }, 1594 | { 1595 | "cell_type": "code", 1596 | "execution_count": 43, 1597 | "metadata": {}, 1598 | "outputs": [ 1599 | { 1600 | "name": "stdout", 1601 | "output_type": "stream", 1602 | "text": [ 1603 | "(American, College Park, Georgia)\n" 1604 | ] 1605 | } 1606 | ], 1607 | "source": [ 1608 | "import spacy\n", 1609 | "\n", 1610 | "nlp = spacy.load(\"en_core_web_sm\")\n", 1611 | "text = (\n", 1612 | " \"Chick-fil-A is an American fast food restaurant chain headquartered in \"\n", 1613 | " \"the city of College Park, Georgia, specializing in chicken sandwiches.\"\n", 1614 | ")\n", 1615 | "\n", 1616 | "# Disable the tagger and parser\n", 1617 | "with nlp.disable_pipes(\"tagger\", \"parser\"):\n", 1618 | " # Process the text\n", 1619 | " doc = nlp(text)\n", 1620 | " # Print the entities in the doc\n", 1621 | " print(doc.ents)" 1622 | ] 1623 | }, 1624 | { 1625 | "cell_type": "code", 1626 | "execution_count": null, 1627 | "metadata": {}, 1628 | "outputs": [], 1629 | "source": [] 1630 | }, 1631 | { 1632 | "cell_type": "code", 1633 | "execution_count": null, 1634 | "metadata": {}, 1635 | "outputs": [], 1636 | "source": [] 1637 | } 1638 | ], 1639 | "metadata": { 1640 | "kernelspec": { 1641 | "display_name": "Python 3", 1642 | "language": "python", 1643 | "name": "python3" 1644 | }, 1645 | "language_info": { 1646 | "codemirror_mode": { 1647 | "name": "ipython", 1648 | "version": 3 1649 | }, 1650 | "file_extension": ".py", 1651 | "mimetype": "text/x-python", 1652 | "name": "python", 1653 | "nbconvert_exporter": "python", 1654 | "pygments_lexer": "ipython3", 1655 | "version": "3.6.9" 1656 | }, 1657 | "toc": { 1658 | "base_numbering": 1, 1659 | "nav_menu": {}, 1660 | "number_sections": false, 1661 | "sideBar": false, 1662 | "skip_h1_title": false, 1663 | "title_cell": "Table of Contens", 1664 | "title_sidebar": "Contents", 1665 | "toc_cell": false, 1666 | "toc_position": { 1667 | "height": "calc(100% - 180px)", 1668 | "left": "10px", 1669 | "top": "150px", 1670 | "width": "332.807px" 1671 | }, 1672 | "toc_section_display": false, 1673 | "toc_window_display": false 1674 | } 1675 | }, 1676 | "nbformat": 4, 1677 | "nbformat_minor": 2 1678 | } 1679 | -------------------------------------------------------------------------------- /Chapter_01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Finding words, phrases, names and concepts\n", 8 | "\n", 9 | "https://course.spacy.io/chapter1" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Introduction to Spacy\n", 17 | "\n", 18 | "At the center of spaCy is the object containing the processing pipeline. We usually call this variable \"nlp\".\n", 19 | "\n", 20 | "For example, to create an English nlp object, you can import the English language class from spacy dot lang dot en and instantiate it. You can use the nlp object like a function to analyze text.\n", 21 | "\n", 22 | "It contains all the different components in the pipeline.\n", 23 | "\n", 24 | "It also includes language-specific rules used for tokenizing the text into words and punctuation. spaCy supports a variety of languages that are available in spacy dot lang." 
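As a minimal sketch of the English case described above (the next cells instantiate the Portuguese class instead, which works the same way):

    # Import the English language class and create the nlp object
    from spacy.lang.en import English

    nlp = English()

    # Use the nlp object like a function to analyze text
    doc = nlp("Hello world!")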
25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# Import the English language class\n", 34 | "from spacy.lang.pt import Portuguese\n", 35 | "\n", 36 | "# Create the nlp object\n", 37 | "nlp = Portuguese()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "" 49 | ] 50 | }, 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "nlp" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "When you process a text with the nlp object, spaCy creates a Doc object – short for \"document\". The Doc lets you access information about the text in a structured way, and no information is lost.\n", 65 | "\n", 66 | "The Doc behaves like a normal Python sequence by the way and lets you iterate over its tokens, or get a token by its index. But more on that later!" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "Olá\n", 79 | "mundo\n", 80 | "!\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "# Created by processing a string of text with the nlp object\n", 86 | "doc = nlp(\"Olá mundo!\")\n", 87 | "\n", 88 | "# Iterate over tokens in a Doc\n", 89 | "for token in doc:\n", 90 | " print(token.text)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | " \n", 98 | "\n", 99 | "Token objects represent the tokens in a document – for example, a word or a punctuation character.\n", 100 | "\n", 101 | "To get a token at a specific position, you can index into the Doc.\n", 102 | "\n", 103 | "Token objects also provide various attributes that let you access more information about the tokens. For example, the dot text attribute returns the verbatim token text." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 60, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "mundo\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "doc = nlp(\"Oi mundo!\")\n", 121 | "\n", 122 | "# Index into the Doc to get a single Token\n", 123 | "token = doc[1]\n", 124 | "\n", 125 | "# Get the token text via the .text attribute\n", 126 | "print(token.text)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | " \n", 134 | "\n", 135 | "A Span object is a slice of the document consisting of one or more tokens. It's only a view of the Doc and doesn't contain any data itself.\n", 136 | "\n", 137 | "To create a Span, you can use Python's slice notation. For example, 1 colon 3 will create a slice starting from the token at position 1, up to – but not including! – the token at position 3." 
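A quick sketch of the 1 colon 3 slice described above, assuming the same nlp object and the text "Oi mundo!" (the next cell uses a slightly wider slice):

    doc = nlp("Oi mundo!")

    # Tokens at positions 1 and 2 – the end index 3 is not included
    span = doc[1:3]
    print(span.text)  # mundo!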
138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 6, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "mundo!\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "doc = nlp(\"Oi mundo!\")\n", 155 | "\n", 156 | "# A slice from the Doc is a Span object\n", 157 | "span = doc[1:4]\n", 158 | "\n", 159 | "# Get the span text via the .text attribute\n", 160 | "print(span.text)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "Here you can see some of the available token attributes:\n", 168 | "\n", 169 | "\"i\" is the index of the token within the parent document.\n", 170 | "\n", 171 | "\"text\" returns the token text.\n", 172 | "\n", 173 | "\"is alpha\", \"is punct\" and \"like num\" return boolean values indicating whether the token consists of alphanumeric characters, whether it's punctuation or whether it resembles a number. For example, a token \"10\" – one, zero – or the word \"ten\" – T, E, N.\n", 174 | "\n", 175 | "These attributes are also called lexical attributes: they refer to the entry in the vocabulary and don't depend on the token's context." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 7, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "Index: [0, 1, 2, 3, 4, 5]\n", 188 | "Text: ['O', 'ingresso', 'custa', '$', '50', '.']\n", 189 | "is_alpha: [True, True, True, False, False, False]\n", 190 | "is_punct: [False, False, False, False, False, True]\n", 191 | "like_num: [False, False, False, False, True, False]\n" 192 | ] 193 | } 194 | ], 195 | "source": [ 196 | "doc = nlp(\"O ingresso custa $50.\")\n", 197 | "\n", 198 | "print('Index: ', [token.i for token in doc])\n", 199 | "print('Text: ', [token.text for token in doc])\n", 200 | "\n", 201 | "print('is_alpha:', [token.is_alpha for token in doc])\n", 202 | "print('is_punct:', [token.is_punct for token in doc])\n", 203 | "print('like_num:', [token.like_num for token in doc])" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "## Getting Started\n", 211 | "\n", 212 | "Let’s get started and try out spaCy! In this exercise, you’ll be able to try out some of the 45+ available languages.\n", 213 | "\n", 214 | "Part 1: English\n", 215 | "\n", 216 | "Import the English class from spacy.lang.en and create the nlp object.\n", 217 | "\n", 218 | "Create a doc and print its text." 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 8, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "name": "stdout", 228 | "output_type": "stream", 229 | "text": [ 230 | "Essa é uma frase.\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "# Import the English language class\n", 236 | "from spacy.lang.pt import Portuguese\n", 237 | "\n", 238 | "# Create the nlp object\n", 239 | "nlp = Portuguese()\n", 240 | "\n", 241 | "# Process a text\n", 242 | "doc = nlp(\"Essa é uma frase.\")\n", 243 | "\n", 244 | "# Print the document text\n", 245 | "print(doc.text)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "## Documents, Spans and Tokens\n", 253 | "\n", 254 | "When you call nlp on a string, spaCy first tokenizes the text and creates a document object. 
In this exercise, you’ll learn more about the Doc, as well as its views Token and Span.\n", 255 | "\n", 256 | "Step 1\n", 257 | "\n", 258 | "Import the English language class and create the nlp object.\n", 259 | "\n", 260 | "Process the text and instantiate a Doc object in the variable doc.\n", 261 | "\n", 262 | "Select the first token of the Doc and print its text.\n", 263 | "\n", 264 | "Create a slice of the Doc for the tokens “tree kangaroos” and “tree kangaroos and narwhals”." 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 9, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "I\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "# Import the English language class and create the nlp object\n", 282 | "from spacy.lang.en import English\n", 283 | "\n", 284 | "nlp = English()\n", 285 | "\n", 286 | "# Process the text\n", 287 | "doc = nlp(\"I like tree kangaroos and narwhals.\")\n", 288 | "\n", 289 | "# Select the first token\n", 290 | "first_token = doc[0]\n", 291 | "\n", 292 | "# Print the first token's text\n", 293 | "print(first_token.text)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "Import the English language class and create the nlp object.\n", 301 | "\n", 302 | "Process the text and instantiate a Doc object in the variable doc.\n", 303 | "\n", 304 | "Create a slice of the Doc for the tokens “tree kangaroos” and “tree kangaroos and narwhals”." 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 10, 310 | "metadata": {}, 311 | "outputs": [ 312 | { 313 | "name": "stdout", 314 | "output_type": "stream", 315 | "text": [ 316 | "tree kangaroos\n", 317 | "tree kangaroos and narwhals\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "# Import the English language class and create the nlp object\n", 323 | "from spacy.lang.en import English\n", 324 | "\n", 325 | "nlp = English()\n", 326 | "\n", 327 | "# Process the text\n", 328 | "doc = nlp(\"I like tree kangaroos and narwhals.\")\n", 329 | "\n", 330 | "# A slice of the Doc for \"tree kangaroos\"\n", 331 | "tree_kangaroos = doc[2:4]\n", 332 | "print(tree_kangaroos.text)\n", 333 | "\n", 334 | "# A slice of the Doc for \"tree kangaroos and narwhals\" (without the \".\")\n", 335 | "tree_kangaroos_and_narwhals = doc[2:6]\n", 336 | "print(tree_kangaroos_and_narwhals.text)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "## Lexical Attributes\n", 344 | "\n", 345 | "In this example, you’ll use spaCy’s Doc and Token objects, and lexical attributes to find percentages in a text. You’ll be looking for two subsequent tokens: a number and a percent sign.\n", 346 | "\n", 347 | "Use the like_num token attribute to check whether a token in the doc resembles a number.\n", 348 | "\n", 349 | "Get the token following the current token in the document. The index of the next token in the doc is token.i + 1.\n", 350 | "\n", 351 | "Check whether the next token’s text attribute is a percent sign ”%“." 
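One detail the instructions leave implicit: token.i + 1 can point past the last token when a number ends the text, so a bounds check is a safe addition. A hedged sketch (the solution cell below follows the original exercise and omits the check):

    for token in doc:
        # Check if the token resembles a number
        if token.like_num and token.i + 1 < len(doc):
            next_token = doc[token.i + 1]
            # Check if the next token's text equals '%'
            if next_token.text == "%":
                print("Percentage found:", token.text)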
352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 11, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "Percentage found: 60\n", 364 | "Percentage found: 4\n", 365 | "like_num: [False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False]\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "from spacy.lang.en import English\n", 371 | "\n", 372 | "nlp = English()\n", 373 | "\n", 374 | "# Process the text\n", 375 | "doc = nlp(\n", 376 | " \"In 1990, more than 60% of people in East Asia were in extreme poverty. \"\n", 377 | " \"Now less than 4% are.\"\n", 378 | ")\n", 379 | "\n", 380 | "# Iterate over the tokens in the doc\n", 381 | "for token in doc:\n", 382 | " # Check if the token resembles a number\n", 383 | " if token.like_num:\n", 384 | " # Get the next token in the document\n", 385 | " next_token = doc[token.i + 1]\n", 386 | " # Check if the next token's text equals '%'\n", 387 | " if next_token.text == \"%\":\n", 388 | " print(\"Percentage found:\", token.text)\n", 389 | " \n", 390 | " \n", 391 | "print('like_num:', [token.like_num for token in doc]) " 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "## Statistical Models\n", 399 | "\n", 400 | "Some of the most interesting things you can analyze are context-specific: for example, whether a word is a verb or whether a span of text is a person name.\n", 401 | "\n", 402 | "### What are Statistical models ?\n", 403 | "\n", 404 | "Statistical models enable spaCy to make predictions in context. This usually includes part-of speech tags, syntactic dependencies and named entities.\n", 405 | "\n", 406 | "Models are trained on large datasets of labeled example texts.\n", 407 | "\n", 408 | "They can be updated with more examples to fine-tune their predictions – for example, to perform better on your specific data.\n", 409 | "\n", 410 | "### Model Packages\n", 411 | "\n", 412 | "spaCy provides a number of pre-trained model packages you can download using the \"spacy download\" command. For example, the \"en_core_web_sm\" package is a small English model that supports all core capabilities and is trained on web text.\n", 413 | "\n", 414 | "The spacy dot load method loads a model package by name and returns an nlp object.\n", 415 | "\n", 416 | "The package provides the binary weights that enable spaCy to make predictions.\n", 417 | "\n", 418 | "It also includes the vocabulary, and meta information to tell spaCy which language class to use and how to configure the processing pipeline." 
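As a small sketch of what the loaded package exposes (not part of the original course material – the attributes below are standard spaCy, but the printed values depend on the installed model):

    import spacy

    nlp = spacy.load("en_core_web_sm")

    # Meta information shipped with the package
    print(nlp.meta["lang"], nlp.meta["name"])

    # The pipeline components the package configures
    print(nlp.pipe_names)  # e.g. ['tagger', 'parser', 'ner']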
419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": {}, 424 | "source": [ 425 | "" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "Visit:\n", 433 | "https://github.com/explosion/spacy-models/releases/\n", 434 | " \n", 435 | "Remember, to download: \n", 436 | "\n", 437 | " python -m spacy download pt_core_news_sm\n", 438 | " \n", 439 | " python -m spacy download en_core_news_sm\n", 440 | " \n", 441 | " python -m spacy download en_core_web_sm" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 12, 447 | "metadata": { 448 | "scrolled": true 449 | }, 450 | "outputs": [ 451 | { 452 | "name": "stdout", 453 | "output_type": "stream", 454 | "text": [ 455 | "She PRON\n", 456 | "ate VERB\n", 457 | "the DET\n", 458 | "pizza NOUN\n" 459 | ] 460 | } 461 | ], 462 | "source": [ 463 | "import spacy\n", 464 | "\n", 465 | "# Load the small English model\n", 466 | "nlp = spacy.load('en_core_web_sm')\n", 467 | "\n", 468 | "# Process a text\n", 469 | "doc = nlp(\"She ate the pizza\")\n", 470 | "\n", 471 | "# Iterate over the tokens\n", 472 | "for token in doc:\n", 473 | " # Print the text and the predicted part-of-speech tag\n", 474 | " print(token.text, token.pos_)" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 13, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "name": "stdout", 484 | "output_type": "stream", 485 | "text": [ 486 | "Ela PRON\n", 487 | "comeu VERB\n", 488 | "uma DET\n", 489 | "pizza NOUN\n", 490 | "enorme ADJ\n" 491 | ] 492 | } 493 | ], 494 | "source": [ 495 | "import spacy\n", 496 | "\n", 497 | "# Load the small Portuguese model\n", 498 | "nlp = spacy.load('pt_core_news_sm')\n", 499 | "\n", 500 | "# Process a text\n", 501 | "doc = nlp(\"Ela comeu uma pizza enorme\")\n", 502 | "\n", 503 | "# Iterate over the tokens\n", 504 | "for token in doc:\n", 505 | " # Print the text and the predicted part-of-speech tag\n", 506 | " print(token.text, token.pos_)" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "### Predicting part of speech tags\n", 514 | "\n", 515 | "For each token in the Doc, we can print the text and the \"pos underscore\" attribute, the predicted part-of-speech tag.\n", 516 | "\n", 517 | "In spaCy, attributes that return strings usually end with an underscore – attributes without the underscore return an ID.\n", 518 | "\n", 519 | "Here, the model correctly predicted \"ate\" as a verb and \"pizza\" as a noun." 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "Let's take a look at the model's predictions. In this example, we're using spaCy to predict part-of-speech tags, the word types in context.\n", 527 | "\n", 528 | "First, we load the small English model and receive an nlp object.\n", 529 | "\n", 530 | "Next, we're processing the text \"She ate the pizza\".\n", 531 | "\n", 532 | "For each token in the Doc, we can print the text and the \"pos underscore\" attribute, the predicted part-of-speech tag.\n", 533 | "\n", 534 | "In spaCy, attributes that return strings usually end with an underscore – attributes without the underscore return an ID.\n", 535 | "\n", 536 | "Here, the model correctly predicted \"ate\" as a verb and \"pizza\" as a noun." 
537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 14, 542 | "metadata": {}, 543 | "outputs": [ 544 | { 545 | "name": "stdout", 546 | "output_type": "stream", 547 | "text": [ 548 | "She PRON\n", 549 | "ate VERB\n", 550 | "the DET\n", 551 | "pizza NOUN\n" 552 | ] 553 | } 554 | ], 555 | "source": [ 556 | "import spacy\n", 557 | "\n", 558 | "nlp= spacy.load(\"en_core_web_sm\")\n", 559 | "\n", 560 | "doc= nlp(\"She ate the pizza\")\n", 561 | "\n", 562 | "for token in doc:\n", 563 | " print(token.text, token.pos_)" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": 4, 569 | "metadata": {}, 570 | "outputs": [ 571 | { 572 | "name": "stdout", 573 | "output_type": "stream", 574 | "text": [ 575 | "Posso VERB\n", 576 | "comprar VERB\n", 577 | "ingresso NOUN\n", 578 | "com ADP\n", 579 | "cartao NOUN\n", 580 | "de ADP\n", 581 | "débito NOUN\n" 582 | ] 583 | } 584 | ], 585 | "source": [ 586 | "import spacy\n", 587 | "\n", 588 | "# Load the small English model\n", 589 | "nlp = spacy.load('pt_core_news_sm')\n", 590 | "\n", 591 | "# Process a text\n", 592 | "doc = nlp(\"Posso comprar ingresso com cartao de débito\")\n", 593 | "\n", 594 | "# Iterate over the tokens\n", 595 | "for token in doc:\n", 596 | " # Print the text and the predicted part-of-speech tag\n", 597 | " print(token.text, token.pos_)" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 16, 603 | "metadata": {}, 604 | "outputs": [ 605 | { 606 | "name": "stdout", 607 | "output_type": "stream", 608 | "text": [ 609 | "Ela PRON\n", 610 | "comeu VERB\n", 611 | "o DET\n", 612 | "macarrão NOUN\n", 613 | "com ADP\n", 614 | "molho NOUN\n", 615 | "de ADP\n", 616 | "tomate NOUN\n" 617 | ] 618 | } 619 | ], 620 | "source": [ 621 | "# Process a text\n", 622 | "doc = nlp(\"Ela comeu o macarrão com molho de tomate\")\n", 623 | "\n", 624 | "# Iterate over the tokens\n", 625 | "for token in doc:\n", 626 | " # Print the text and the predicted part-of-speech tag\n", 627 | " print(token.text, token.pos_)" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 5, 633 | "metadata": {}, 634 | "outputs": [ 635 | { 636 | "name": "stdout", 637 | "output_type": "stream", 638 | "text": [ 639 | "Quero VERB\n", 640 | "comprar VERB\n", 641 | "um DET\n", 642 | "ingresso NOUN\n", 643 | "para ADP\n", 644 | "o DET\n", 645 | "jogo NOUN\n" 646 | ] 647 | } 648 | ], 649 | "source": [ 650 | "# Process a text\n", 651 | "doc = nlp(\"Quero comprar um ingresso para o jogo\")\n", 652 | "\n", 653 | "# Iterate over the tokens\n", 654 | "for token in doc:\n", 655 | " # Print the text and the predicted part-of-speech tag\n", 656 | " print(token.text, token.pos_)" 657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": {}, 662 | "source": [ 663 | "### Predicting Syntactic Dependencies\n", 664 | "\n", 665 | "In addition to the part-of-speech tags, we can also predict how the words are related. For example, whether a word is the subject of the sentence or an object.\n", 666 | "\n", 667 | "The \"dep underscore\" attribute returns the predicted dependency label.\n", 668 | "\n", 669 | "The head attribute returns the syntactic head token. You can also think of it as the parent token this word is attached to." 
670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": 6, 675 | "metadata": {}, 676 | "outputs": [ 677 | { 678 | "name": "stdout", 679 | "output_type": "stream", 680 | "text": [ 681 | "Quero VERB ROOT Quero\n", 682 | "comprar VERB xcomp Quero\n", 683 | "um DET det ingresso\n", 684 | "ingresso NOUN obj comprar\n", 685 | "para ADP case jogo\n", 686 | "o DET det jogo\n", 687 | "jogo NOUN nmod ingresso\n" 688 | ] 689 | } 690 | ], 691 | "source": [ 692 | "for token in doc:\n", 693 | " print(token.text, token.pos_, token.dep_, token.head.text)" 694 | ] 695 | }, 696 | { 697 | "cell_type": "markdown", 698 | "metadata": {}, 699 | "source": [ 700 | "### Dependency Label Scheme" 701 | ] 702 | }, 703 | { 704 | "cell_type": "markdown", 705 | "metadata": {}, 706 | "source": [ 707 | "" 708 | ] 709 | }, 710 | { 711 | "cell_type": "markdown", 712 | "metadata": {}, 713 | "source": [ 714 | "To describe syntactic dependencies, spaCy uses a standardized label scheme. Here's an example of some common labels:\n", 715 | "\n", 716 | "The pronoun \"She\" is a nominal subject attached to the verb – in this case, to \"ate\".\n", 717 | "\n", 718 | "The noun \"pizza\" is a direct object attached to the verb \"ate\". It is eaten by the subject, \"she\".\n", 719 | "\n", 720 | "The determiner \"the\", also known as an article, is attached to the noun \"pizza\"." 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "### Predicting Name Entities\n", 728 | "\n", 729 | "Named entities are \"real world objects\" that are assigned a name – for example, a person, an organization or a country.\n", 730 | "\n", 731 | "The doc dot ents property lets you access the named entities predicted by the model.\n", 732 | "\n", 733 | "It returns an iterator of Span objects, so we can print the entity text and the entity label using the \"label underscore\" attribute.\n", 734 | "\n", 735 | "In this case, the model is correctly predicting \"Apple\" as an organization, \"U.K.\" as a geopolitical entity and \"$1 billion\" as money." 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": 19, 741 | "metadata": {}, 742 | "outputs": [ 743 | { 744 | "name": "stdout", 745 | "output_type": "stream", 746 | "text": [ 747 | "Apple ORG\n", 748 | "U.K. GPE\n", 749 | "$1 billion MONEY\n" 750 | ] 751 | } 752 | ], 753 | "source": [ 754 | "import spacy\n", 755 | "\n", 756 | "nlp= spacy.load(\"en_core_web_sm\")\n", 757 | "\n", 758 | "doc= nlp(u\"Apple is looking at buying U.K. 
startup for $1 billion\")\n", 759 | "\n", 760 | "for ent in doc.ents:\n", 761 | " print(ent.text, ent.label_)" 762 | ] 763 | }, 764 | { 765 | "cell_type": "code", 766 | "execution_count": 20, 767 | "metadata": {}, 768 | "outputs": [ 769 | { 770 | "name": "stdout", 771 | "output_type": "stream", 772 | "text": [ 773 | "Apple ORG\n", 774 | "Inglaterra LOC\n", 775 | "R$ PER\n" 776 | ] 777 | } 778 | ], 779 | "source": [ 780 | "# Load the small Portuguese model\n", 781 | "nlp = spacy.load('pt_core_news_sm')\n", 782 | "\n", 783 | "# Process a text\n", 784 | "doc = nlp(u\"A empresa Apple está buscando comprar uma empresa na Inglaterra por R$1 bilhão de reais\")\n", 785 | "\n", 786 | "# Iterate over the predicted entities\n", 787 | "for ent in doc.ents:\n", 788 | " # Print the entity text and its label\n", 789 | " print(ent.text, ent.label_)" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": 7, 795 | "metadata": {}, 796 | "outputs": [ 797 | { 798 | "name": "stdout", 799 | "output_type": "stream", 800 | "text": [ 801 | "Sócio Torcedor PER\n", 802 | "Flamengo ORG\n" 803 | ] 804 | } 805 | ], 806 | "source": [ 807 | "# Load the small Portuguese model\n", 808 | "nlp = spacy.load('pt_core_news_sm')\n", 809 | "\n", 810 | "# Process a text\n", 811 | "doc = nlp(u\"Como faço para contratar um plano de Sócio Torcedor do Flamengo\")\n", 812 | "\n", 813 | "# Iterate over the predicted entities\n", 814 | "for ent in doc.ents:\n", 815 | " # Print the entity text and its label\n", 816 | " print(ent.text, ent.label_)" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": 9, 822 | "metadata": {}, 823 | "outputs": [ 824 | { 825 | "name": "stdout", 826 | "output_type": "stream", 827 | "text": [ 828 | "Rio de Janeiro LOC\n" 829 | ] 830 | } 831 | ], 832 | "source": [ 833 | "# Load the small Portuguese model\n", 834 | "nlp = spacy.load('pt_core_news_sm')\n", 835 | "\n", 836 | "# Process a text\n", 837 | "doc = nlp(u\"Quais são os planos para quem mora fora do Rio de Janeiro\")\n", 838 | "\n", 839 | "# Iterate over the predicted entities\n", 840 | "for ent in doc.ents:\n", 841 | " # Print the entity text and its label\n", 842 | " print(ent.text, ent.label_)" 843 | ] 844 | }, 845 | { 846 | "cell_type": "markdown", 847 | "metadata": {}, 848 | "source": [ 849 | "### Tip: the explain method\n", 850 | "\n", 851 | "A quick tip: To get definitions for the most common tags and labels, you can use the spacy dot explain helper function.\n", 852 | "\n", 853 | "For example, \"GPE\" for geopolitical entity isn't exactly intuitive – but spacy dot explain can tell you that it refers to countries, cities and states.\n", 854 | "\n", 855 | "The same works for part-of-speech tags and dependency labels" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": 8, 861 | "metadata": {}, 862 | "outputs": [ 863 | { 864 | "data": { 865 | "text/plain": [ 866 | "'Companies, agencies, institutions, etc.'" 867 | ] 868 | }, 869 | "execution_count": 8, 870 | "metadata": {}, 871 | "output_type": "execute_result" 872 | } 873 | ], 874 | "source": [ 875 | "spacy.explain('ORG')" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": 21, 881 | "metadata": {}, 882 | "outputs": [ 883 | { 884 | "data": { 885 | "text/plain": [ 886 | "'Named person or family.'" 887 | ] 888 | }, 889 | "execution_count": 21, 890 | "metadata": {}, 891 | "output_type": "execute_result" 892 | } 893 | ], 894 | "source": [ 895 | "spacy.explain('PER')" 896 | ] 897 | }, 898 | { 899 | "cell_type": 
"code", 900 | "execution_count": 22, 901 | "metadata": {}, 902 | "outputs": [ 903 | { 904 | "data": { 905 | "text/plain": [ 906 | "'Countries, cities, states'" 907 | ] 908 | }, 909 | "execution_count": 22, 910 | "metadata": {}, 911 | "output_type": "execute_result" 912 | } 913 | ], 914 | "source": [ 915 | "spacy.explain('GPE')" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": 23, 921 | "metadata": {}, 922 | "outputs": [ 923 | { 924 | "data": { 925 | "text/plain": [ 926 | "'noun, proper singular'" 927 | ] 928 | }, 929 | "execution_count": 23, 930 | "metadata": {}, 931 | "output_type": "execute_result" 932 | } 933 | ], 934 | "source": [ 935 | "spacy.explain('NNP')" 936 | ] 937 | }, 938 | { 939 | "cell_type": "code", 940 | "execution_count": 24, 941 | "metadata": {}, 942 | "outputs": [ 943 | { 944 | "data": { 945 | "text/plain": [ 946 | "'direct object'" 947 | ] 948 | }, 949 | "execution_count": 24, 950 | "metadata": {}, 951 | "output_type": "execute_result" 952 | } 953 | ], 954 | "source": [ 955 | "spacy.explain('dobj')" 956 | ] 957 | }, 958 | { 959 | "cell_type": "code", 960 | "execution_count": 25, 961 | "metadata": {}, 962 | "outputs": [ 963 | { 964 | "data": { 965 | "text/plain": [ 966 | "'Miscellaneous entities, e.g. events, nationalities, products or works of art'" 967 | ] 968 | }, 969 | "execution_count": 25, 970 | "metadata": {}, 971 | "output_type": "execute_result" 972 | } 973 | ], 974 | "source": [ 975 | "spacy.explain('MISC')" 976 | ] 977 | }, 978 | { 979 | "cell_type": "code", 980 | "execution_count": 26, 981 | "metadata": {}, 982 | "outputs": [ 983 | { 984 | "data": { 985 | "text/plain": [ 986 | "'Non-GPE locations, mountain ranges, bodies of water'" 987 | ] 988 | }, 989 | "execution_count": 26, 990 | "metadata": {}, 991 | "output_type": "execute_result" 992 | } 993 | ], 994 | "source": [ 995 | "spacy.explain('LOC')" 996 | ] 997 | }, 998 | { 999 | "cell_type": "markdown", 1000 | "metadata": { 1001 | "heading_collapsed": true 1002 | }, 1003 | "source": [ 1004 | "## Model Packages" 1005 | ] 1006 | }, 1007 | { 1008 | "cell_type": "markdown", 1009 | "metadata": { 1010 | "heading_collapsed": true 1011 | }, 1012 | "source": [ 1013 | "## Loading Models\n", 1014 | "\n", 1015 | "The models we’re using in this course are already pre-installed. For more details on spaCy’s statistical models and how to install them on your machine, see the documentation.\n", 1016 | "\n", 1017 | "https://spacy.io/usage/models\n", 1018 | "\n", 1019 | "Use spacy.load to load the small English model 'en_core_web_sm'.\n", 1020 | "Process the text and print the document text.\n" 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "code", 1025 | "execution_count": 27, 1026 | "metadata": { 1027 | "hidden": true 1028 | }, 1029 | "outputs": [ 1030 | { 1031 | "name": "stdout", 1032 | "output_type": "stream", 1033 | "text": [ 1034 | "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value\n" 1035 | ] 1036 | } 1037 | ], 1038 | "source": [ 1039 | "import spacy\n", 1040 | "\n", 1041 | "# Load the 'en_core_web_sm' model\n", 1042 | "nlp = spacy.load(\"en_core_web_sm\")\n", 1043 | "\n", 1044 | "text = \"It’s official: Apple is the first U.S. 
public company to reach a $1 trillion market value\"\n", 1045 | "\n", 1046 | "# Process the text\n", 1047 | "doc = nlp(text)\n", 1048 | "\n", 1049 | "# Print the document text\n", 1050 | "print(doc.text)" 1051 | ] 1052 | }, 1053 | { 1054 | "cell_type": "markdown", 1055 | "metadata": {}, 1056 | "source": [ 1057 | "## Predicting Linguistic Annotations\n", 1058 | "\n", 1059 | "You’ll now get to try one of spaCy’s pre-trained model packages and see its predictions in action. Feel free to try it out on your own text! To find out what a tag or label means, you can call spacy.explain in the. For example: spacy.explain('PROPN') or spacy.explain('GPE').\n", 1060 | "\n", 1061 | "### Part 1\n", 1062 | "\n", 1063 | "Process the text with the nlp object and create a doc.\n", 1064 | "\n", 1065 | "For each token, print the token text, the token’s .pos_ (part-of-speech tag) and the token’s .dep_ (dependency label)." 1066 | ] 1067 | }, 1068 | { 1069 | "cell_type": "code", 1070 | "execution_count": 28, 1071 | "metadata": {}, 1072 | "outputs": [ 1073 | { 1074 | "name": "stdout", 1075 | "output_type": "stream", 1076 | "text": [ 1077 | "It PRON nsubj \n", 1078 | "’s PROPN ROOT \n", 1079 | "official NOUN acomp \n", 1080 | ": PUNCT punct \n", 1081 | "Apple PROPN nsubj \n", 1082 | "is VERB ROOT \n", 1083 | "the DET det \n", 1084 | "first ADJ amod \n", 1085 | "U.S. PROPN nmod \n", 1086 | "public ADJ amod \n", 1087 | "company NOUN attr \n", 1088 | "to PART aux \n", 1089 | "reach VERB relcl \n", 1090 | "a DET det \n", 1091 | "$ SYM quantmod \n", 1092 | "1 NUM compound \n", 1093 | "trillion NUM nummod \n", 1094 | "market NOUN compound \n", 1095 | "value NOUN dobj \n" 1096 | ] 1097 | } 1098 | ], 1099 | "source": [ 1100 | "import spacy\n", 1101 | "\n", 1102 | "nlp = spacy.load(\"en_core_web_sm\")\n", 1103 | "\n", 1104 | "text = \"It’s official: Apple is the first U.S. 
public company to reach a $1 trillion market value\"\n", 1105 | "\n", 1106 | "# Process the text\n", 1107 | "doc = nlp(text)\n", 1108 | "\n", 1109 | "for token in doc:\n", 1110 | " # Get the token text, part-of-speech tag and dependency label\n", 1111 | " token_text = token.text\n", 1112 | " token_pos = token.pos_\n", 1113 | " token_dep = token.dep_\n", 1114 | " # This is for formatting only\n", 1115 | " print(\"{:<12}{:<10}{:<10}\".format(token_text, token_pos, token_dep))" 1116 | ] 1117 | }, 1118 | { 1119 | "cell_type": "code", 1120 | "execution_count": 29, 1121 | "metadata": {}, 1122 | "outputs": [ 1123 | { 1124 | "data": { 1125 | "text/plain": [ 1126 | "'direct object'" 1127 | ] 1128 | }, 1129 | "execution_count": 29, 1130 | "metadata": {}, 1131 | "output_type": "execute_result" 1132 | } 1133 | ], 1134 | "source": [ 1135 | "spacy.explain('dobj')" 1136 | ] 1137 | }, 1138 | { 1139 | "cell_type": "code", 1140 | "execution_count": 30, 1141 | "metadata": {}, 1142 | "outputs": [ 1143 | { 1144 | "data": { 1145 | "text/plain": [ 1146 | "'attribute'" 1147 | ] 1148 | }, 1149 | "execution_count": 30, 1150 | "metadata": {}, 1151 | "output_type": "execute_result" 1152 | } 1153 | ], 1154 | "source": [ 1155 | "spacy.explain('attr')" 1156 | ] 1157 | }, 1158 | { 1159 | "cell_type": "code", 1160 | "execution_count": 31, 1161 | "metadata": {}, 1162 | "outputs": [ 1163 | { 1164 | "data": { 1165 | "text/plain": [ 1166 | "'nominal subject'" 1167 | ] 1168 | }, 1169 | "execution_count": 31, 1170 | "metadata": {}, 1171 | "output_type": "execute_result" 1172 | } 1173 | ], 1174 | "source": [ 1175 | "spacy.explain('nsubj')" 1176 | ] 1177 | }, 1178 | { 1179 | "cell_type": "code", 1180 | "execution_count": 32, 1181 | "metadata": {}, 1182 | "outputs": [ 1183 | { 1184 | "name": "stdout", 1185 | "output_type": "stream", 1186 | "text": [ 1187 | "É VERB cop \n", 1188 | "oficial ADJ ROOT \n", 1189 | ": PUNCT punct \n", 1190 | "A DET det \n", 1191 | "Apple PROPN nsubj \n", 1192 | "é VERB cop \n", 1193 | "a DET det \n", 1194 | "primeira ADJ amod \n", 1195 | "empresa NOUN ROOT \n", 1196 | "americana ADJ amod \n", 1197 | "a ADP mark \n", 1198 | "atingir VERB acl \n", 1199 | "o DET det \n", 1200 | "valor NOUN obj \n", 1201 | "de ADP case \n", 1202 | "mercado NOUN nmod \n", 1203 | "de ADP case \n", 1204 | "1 NUM nummod \n", 1205 | "bilhão NOUN nmod \n", 1206 | "de ADP case \n", 1207 | "reais NOUN nmod \n" 1208 | ] 1209 | } 1210 | ], 1211 | "source": [ 1212 | "import spacy\n", 1213 | "\n", 1214 | "nlp = spacy.load(\"pt_core_news_sm\")\n", 1215 | "\n", 1216 | "text = \"É oficial: A Apple é a primeira empresa americana a atingir o valor de mercado de 1 bilhão de reais\"\n", 1217 | "\n", 1218 | "# Process the text\n", 1219 | "doc = nlp(text)\n", 1220 | "\n", 1221 | "for token in doc:\n", 1222 | " # Get the token text, part-of-speech tag and dependency label\n", 1223 | " token_text = token.text\n", 1224 | " token_pos = token.pos_\n", 1225 | " token_dep = token.dep_\n", 1226 | " # This is for formatting only\n", 1227 | " print(\"{:<12}{:<10}{:<10}\".format(token_text, token_pos, token_dep))" 1228 | ] 1229 | }, 1230 | { 1231 | "cell_type": "code", 1232 | "execution_count": 33, 1233 | "metadata": {}, 1234 | "outputs": [ 1235 | { 1236 | "data": { 1237 | "text/plain": [ 1238 | "'copula'" 1239 | ] 1240 | }, 1241 | "execution_count": 33, 1242 | "metadata": {}, 1243 | "output_type": "execute_result" 1244 | } 1245 | ], 1246 | "source": [ 1247 | "spacy.explain('cop')" 1248 | ] 1249 | }, 1250 | { 1251 | "cell_type": "code", 1252 | 
"execution_count": 34, 1253 | "metadata": {}, 1254 | "outputs": [], 1255 | "source": [ 1256 | "spacy.explain('ROOT')" 1257 | ] 1258 | }, 1259 | { 1260 | "cell_type": "code", 1261 | "execution_count": 35, 1262 | "metadata": {}, 1263 | "outputs": [], 1264 | "source": [ 1265 | "spacy.explain('acl')" 1266 | ] 1267 | }, 1268 | { 1269 | "cell_type": "code", 1270 | "execution_count": 36, 1271 | "metadata": {}, 1272 | "outputs": [], 1273 | "source": [ 1274 | "spacy.explain('nummod')" 1275 | ] 1276 | }, 1277 | { 1278 | "cell_type": "code", 1279 | "execution_count": 37, 1280 | "metadata": {}, 1281 | "outputs": [ 1282 | { 1283 | "data": { 1284 | "text/plain": [ 1285 | "'modifier of nominal'" 1286 | ] 1287 | }, 1288 | "execution_count": 37, 1289 | "metadata": {}, 1290 | "output_type": "execute_result" 1291 | } 1292 | ], 1293 | "source": [ 1294 | "spacy.explain('nmod')" 1295 | ] 1296 | }, 1297 | { 1298 | "cell_type": "code", 1299 | "execution_count": 10, 1300 | "metadata": {}, 1301 | "outputs": [ 1302 | { 1303 | "name": "stdout", 1304 | "output_type": "stream", 1305 | "text": [ 1306 | "Meu DET det \n", 1307 | "cartão NOUN nsubj \n", 1308 | "ingresso ADJ acl \n", 1309 | "ainda ADV advmod \n", 1310 | "não ADV advmod \n", 1311 | "chegou VERB ROOT \n", 1312 | ". PUNCT punct \n", 1313 | "Como SCONJ case \n", 1314 | "faço NOUN ROOT \n", 1315 | "para ADP mark \n", 1316 | "entrar VERB acl \n", 1317 | "no ADP nummod \n", 1318 | "jogo NOUN obj \n" 1319 | ] 1320 | } 1321 | ], 1322 | "source": [ 1323 | "import spacy\n", 1324 | "\n", 1325 | "nlp = spacy.load(\"pt_core_news_sm\")\n", 1326 | "\n", 1327 | "text = u\"Meu cartão ingresso ainda não chegou. Como faço para entrar no jogo\"\n", 1328 | "\n", 1329 | "# Process the text\n", 1330 | "doc = nlp(text)\n", 1331 | "\n", 1332 | "for token in doc:\n", 1333 | " # Get the token text, part-of-speech tag and dependency label\n", 1334 | " token_text = token.text\n", 1335 | " token_pos = token.pos_\n", 1336 | " token_dep = token.dep_\n", 1337 | " # This is for formatting only\n", 1338 | " print(\"{:<12}{:<10}{:<10}\".format(token_text, token_pos, token_dep))" 1339 | ] 1340 | }, 1341 | { 1342 | "cell_type": "code", 1343 | "execution_count": 11, 1344 | "metadata": {}, 1345 | "outputs": [], 1346 | "source": [ 1347 | "spacy.explain('acl')" 1348 | ] 1349 | }, 1350 | { 1351 | "cell_type": "code", 1352 | "execution_count": null, 1353 | "metadata": {}, 1354 | "outputs": [], 1355 | "source": [] 1356 | }, 1357 | { 1358 | "cell_type": "markdown", 1359 | "metadata": {}, 1360 | "source": [ 1361 | "### Part 2\n", 1362 | "\n", 1363 | "Process the text and create a doc object.\n", 1364 | "\n", 1365 | "Iterate over the doc.ents and print the entity text and label_ attribute." 1366 | ] 1367 | }, 1368 | { 1369 | "cell_type": "code", 1370 | "execution_count": 38, 1371 | "metadata": {}, 1372 | "outputs": [ 1373 | { 1374 | "name": "stdout", 1375 | "output_type": "stream", 1376 | "text": [ 1377 | "Apple ORG\n", 1378 | "first ORDINAL\n", 1379 | "U.S. GPE\n", 1380 | "$1 trillion MONEY\n" 1381 | ] 1382 | } 1383 | ], 1384 | "source": [ 1385 | "import spacy\n", 1386 | "\n", 1387 | "nlp = spacy.load(\"en_core_web_sm\")\n", 1388 | "\n", 1389 | "text = \"It’s official: Apple is the first U.S. 
public company to reach a $1 trillion market value\"\n", 1390 | "\n", 1391 | "# Process the text\n", 1392 | "doc = nlp(text)\n", 1393 | "\n", 1394 | "# Iterate over the predicted entities\n", 1395 | "for ent in doc.ents:\n", 1396 | " # Print the entity text and its label\n", 1397 | " print(ent.text, ent.label_)" 1398 | ] 1399 | }, 1400 | { 1401 | "cell_type": "code", 1402 | "execution_count": 39, 1403 | "metadata": {}, 1404 | "outputs": [ 1405 | { 1406 | "name": "stdout", 1407 | "output_type": "stream", 1408 | "text": [ 1409 | "Apple ORG\n" 1410 | ] 1411 | } 1412 | ], 1413 | "source": [ 1414 | "import spacy\n", 1415 | "\n", 1416 | "nlp = spacy.load(\"pt_core_news_sm\")\n", 1417 | "\n", 1418 | "text = \"É oficial: A Apple é a primeira empresa americana a atingir o valor de mercado de 1 bilhão de reais\"\n", 1419 | "\n", 1420 | "# Process the text\n", 1421 | "doc = nlp(text)\n", 1422 | "\n", 1423 | "# Iterate over the predicted entities\n", 1424 | "for ent in doc.ents:\n", 1425 | " # Print the entity text and its label\n", 1426 | " print(ent.text, ent.label_)" 1427 | ] 1428 | }, 1429 | { 1430 | "cell_type": "code", 1431 | "execution_count": 40, 1432 | "metadata": {}, 1433 | "outputs": [], 1434 | "source": [ 1435 | "import spacy\n", 1436 | "\n", 1437 | "nlp = spacy.load(\"pt_core_news_sm\")\n", 1438 | "\n", 1439 | "text = \"Os carros de passeio estão custando 120% mais caros\"\n", 1440 | "\n", 1441 | "# Process the text\n", 1442 | "doc = nlp(text)\n", 1443 | "\n", 1444 | "# Iterate over the predicted entities\n", 1445 | "for ent in doc.ents:\n", 1446 | " # Print the entity text and its label\n", 1447 | " print(ent.text, ent.label_)" 1448 | ] 1449 | }, 1450 | { 1451 | "cell_type": "code", 1452 | "execution_count": 41, 1453 | "metadata": {}, 1454 | "outputs": [ 1455 | { 1456 | "name": "stdout", 1457 | "output_type": "stream", 1458 | "text": [ 1459 | "Inglaterra LOC\n" 1460 | ] 1461 | } 1462 | ], 1463 | "source": [ 1464 | "import spacy\n", 1465 | "\n", 1466 | "nlp = spacy.load(\"pt_core_news_sm\")\n", 1467 | "\n", 1468 | "text = \"O rei da Inglaterra morreu de acidente de carro\"\n", 1469 | "\n", 1470 | "# Process the text\n", 1471 | "doc = nlp(text)\n", 1472 | "\n", 1473 | "# Iterate over the predicted entities\n", 1474 | "for ent in doc.ents:\n", 1475 | " # Print the entity text and its label\n", 1476 | " print(ent.text, ent.label_)" 1477 | ] 1478 | }, 1479 | { 1480 | "cell_type": "code", 1481 | "execution_count": 42, 1482 | "metadata": {}, 1483 | "outputs": [ 1484 | { 1485 | "name": "stdout", 1486 | "output_type": "stream", 1487 | "text": [ 1488 | "England GPE\n" 1489 | ] 1490 | } 1491 | ], 1492 | "source": [ 1493 | "import spacy\n", 1494 | "\n", 1495 | "nlp = spacy.load(\"en_core_web_sm\")\n", 1496 | "\n", 1497 | "text = \"The king of England died on a car accident\"\n", 1498 | "\n", 1499 | "# Process the text\n", 1500 | "doc = nlp(text)\n", 1501 | "\n", 1502 | "# Iterate over the predicted entities\n", 1503 | "for ent in doc.ents:\n", 1504 | " # Print the entity text and its label\n", 1505 | " print(ent.text, ent.label_)" 1506 | ] 1507 | }, 1508 | { 1509 | "cell_type": "code", 1510 | "execution_count": 43, 1511 | "metadata": {}, 1512 | "outputs": [ 1513 | { 1514 | "name": "stdout", 1515 | "output_type": "stream", 1516 | "text": [ 1517 | "120 dollars MONEY\n" 1518 | ] 1519 | } 1520 | ], 1521 | "source": [ 1522 | "import spacy\n", 1523 | "\n", 1524 | "nlp = spacy.load(\"en_core_web_sm\")\n", 1525 | "\n", 1526 | "text = \"I want to buy a tennis shoes and pay 120 dollars\"\n", 1527 | "\n", 1528 
| "# Process the text\n", 1529 | "doc = nlp(text)\n", 1530 | "\n", 1531 | "# Iterate over the predicted entities\n", 1532 | "for ent in doc.ents:\n", 1533 | " # Print the entity text and its label\n", 1534 | " print(ent.text, ent.label_)" 1535 | ] 1536 | }, 1537 | { 1538 | "cell_type": "markdown", 1539 | "metadata": {}, 1540 | "source": [ 1541 | "## Predicting name entities in context\n", 1542 | "\n", 1543 | "Models are statistical and not always right. Whether their predictions are correct depends on the training data and the text you’re processing. Let’s take a look at an example.\n", 1544 | "\n", 1545 | "Process the text with the nlp object.\n", 1546 | "\n", 1547 | "Iterate over the entities and print the entity text and label.\n", 1548 | "\n", 1549 | "Looks like the model didn’t predict “iPhone X”. Create a span for those tokens manually." 1550 | ] 1551 | }, 1552 | { 1553 | "cell_type": "code", 1554 | "execution_count": 44, 1555 | "metadata": {}, 1556 | "outputs": [ 1557 | { 1558 | "name": "stdout", 1559 | "output_type": "stream", 1560 | "text": [ 1561 | "Apple ORG\n", 1562 | "Missing entity: iPhone X\n" 1563 | ] 1564 | } 1565 | ], 1566 | "source": [ 1567 | "import spacy\n", 1568 | "\n", 1569 | "nlp = spacy.load(\"en_core_web_sm\")\n", 1570 | "\n", 1571 | "text = \"New iPhone X release date leaked as Apple reveals pre-orders by mistake\"\n", 1572 | "\n", 1573 | "# Process the text\n", 1574 | "doc = nlp(text)\n", 1575 | "\n", 1576 | "# Iterate over the entities\n", 1577 | "for token in doc.ents:\n", 1578 | " # Print the entity text and label\n", 1579 | " print(token.text, token.label_)\n", 1580 | "\n", 1581 | "# Get the span for \"iPhone X\"\n", 1582 | "iphone_x = doc[1:3]\n", 1583 | "\n", 1584 | "# Print the span text\n", 1585 | "print(\"Missing entity:\", iphone_x.text)" 1586 | ] 1587 | }, 1588 | { 1589 | "cell_type": "markdown", 1590 | "metadata": {}, 1591 | "source": [ 1592 | "## Rule based matching\n", 1593 | "\n", 1594 | "Now we'll take a look at spaCy's matcher, which lets you write rules to find words and phrases in text.\n", 1595 | "\n", 1596 | "Compared to regular expressions, the matcher works with Doc and Token objects instead of only strings.\n", 1597 | "\n", 1598 | "It's also more flexible: you can search for texts but also other lexical attributes.\n", 1599 | "\n", 1600 | "You can even write rules that use the model's predictions.\n", 1601 | "\n", 1602 | "For example, find the word \"duck\" only if it's a verb, not a noun." 1603 | ] 1604 | }, 1605 | { 1606 | "cell_type": "markdown", 1607 | "metadata": {}, 1608 | "source": [ 1609 | "Why not just regular expressions ?\n", 1610 | "\n", 1611 | "
\n", 1612 | "- Match on Doc objects, not just strings\n", 1613 | "\n", 1614 | "- Match on tokens and token attributes\n", 1615 | "\n", 1616 | "- Use the model's predictions\n", 1617 | "\n", 1618 | "- Example: \"duck\" (verb) vs. \"duck\" (noun)\n
" 1619 | ] 1620 | }, 1621 | { 1622 | "cell_type": "markdown", 1623 | "metadata": {}, 1624 | "source": [ 1625 | "Match patterns are lists of dictionaries. Each dictionary describes one token. The keys are the names of token attributes, mapped to their expected values.\n", 1626 | "\n", 1627 | "In this example, we're looking for two tokens with the text \"iPhone\" and \"X\".\n", 1628 | "\n", 1629 | "We can also match on other token attributes. Here, we're looking for two tokens whose lowercase forms equal \"iphone\" and \"x\".\n", 1630 | "\n", 1631 | "We can even write patterns using attributes predicted by the model. Here, we're matching a token with the lemma \"buy\", plus a noun. The lemma is the base form, so this pattern would match phrases like \"buying milk\" or \"bought flowers\"." 1632 | ] 1633 | }, 1634 | { 1635 | "cell_type": "markdown", 1636 | "metadata": {}, 1637 | "source": [ 1638 | " Match patterns\n", 1639 | "\n", 1640 | "
\n", 1641 | "- Lists of dictionaries, one per token\n", 1642 | "\n", 1643 | "- Match exact token texts:\n", 1644 | "\n", 1645 | "[{'ORTH': 'iPhone'}, {'ORTH': 'X'}]\n", 1646 | "\n", 1647 | "- Match lexical attributes:\n", 1648 | "\n", 1649 | "[{'LOWER': 'iphone'}, {'LOWER': 'x'}]\n", 1650 | "\n", 1651 | "- Match any token attributes:\n", 1652 | "\n", 1653 | "[{'LEMMA': 'buy'}, {'POS': 'NOUN'}]\n", 1654 | "\n", 1655 | "\n
" 1656 | ] 1657 | }, 1658 | { 1659 | "cell_type": "markdown", 1660 | "metadata": {}, 1661 | "source": [ 1662 | "### Using the matcher\n", 1663 | "\n", 1664 | "To use a pattern, we first import the matcher from spacy dot matcher.\n", 1665 | "\n", 1666 | "We also load a model and create the nlp object.\n", 1667 | "\n", 1668 | "The matcher is initialized with the shared vocabulary, nlp dot vocab. You'll learn more about this later – for now, just remember to always pass it in.\n", 1669 | "\n", 1670 | "The matcher dot add method lets you add a pattern. The first argument is a unique ID to identify which pattern was matched. The second argument is an optional callback. We don't need one here, so we set it to None. The third argument is the pattern.\n", 1671 | "\n", 1672 | "To match the pattern on a text, we can call the matcher on any doc.\n", 1673 | "\n", 1674 | "This will return the matches." 1675 | ] 1676 | }, 1677 | { 1678 | "cell_type": "code", 1679 | "execution_count": 45, 1680 | "metadata": {}, 1681 | "outputs": [], 1682 | "source": [ 1683 | "import spacy\n", 1684 | "\n", 1685 | "# Import the Matcher\n", 1686 | "from spacy.matcher import Matcher\n", 1687 | "\n", 1688 | "# Load a model and create the nlp object\n", 1689 | "nlp = spacy.load('en_core_web_sm')\n", 1690 | "\n", 1691 | "# Initialize the matcher with the shared vocab\n", 1692 | "matcher = Matcher(nlp.vocab)\n", 1693 | "\n", 1694 | "# Add the pattern to the matcher\n", 1695 | "pattern = [{'ORTH': 'iPhone'}, {'ORTH': 'X'}]\n", 1696 | "matcher.add('IPHONE_PATTERN', None, pattern)\n", 1697 | "\n", 1698 | "# Process some text\n", 1699 | "doc = nlp(\"New iPhone X release date leaked\")\n", 1700 | "\n", 1701 | "# Call the matcher on the doc\n", 1702 | "matches = matcher(doc)" 1703 | ] 1704 | }, 1705 | { 1706 | "cell_type": "code", 1707 | "execution_count": 46, 1708 | "metadata": {}, 1709 | "outputs": [ 1710 | { 1711 | "data": { 1712 | "text/plain": [ 1713 | "[(9528407286733565721, 1, 3)]" 1714 | ] 1715 | }, 1716 | "execution_count": 46, 1717 | "metadata": {}, 1718 | "output_type": "execute_result" 1719 | } 1720 | ], 1721 | "source": [ 1722 | "matches" 1723 | ] 1724 | }, 1725 | { 1726 | "cell_type": "markdown", 1727 | "metadata": {}, 1728 | "source": [ 1729 | "When you call the matcher on a doc, it returns a list of tuples.\n", 1730 | "\n", 1731 | "Each tuple consists of three values: the match ID, the start index and the end index of the matched span.\n", 1732 | "\n", 1733 | "This means we can iterate over the matches and create a Span object: a slice of the doc at the start and end index." 
1734 | ] 1735 | }, 1736 | { 1737 | "cell_type": "code", 1738 | "execution_count": 47, 1739 | "metadata": {}, 1740 | "outputs": [ 1741 | { 1742 | "name": "stdout", 1743 | "output_type": "stream", 1744 | "text": [ 1745 | "iPhone X\n" 1746 | ] 1747 | } 1748 | ], 1749 | "source": [ 1750 | "# Call the matcher on the doc\n", 1751 | "doc = nlp(\"New iPhone X release date leaked\")\n", 1752 | "matches = matcher(doc)\n", 1753 | "\n", 1754 | "# Iterate over the matches\n", 1755 | "for match_id, start, end in matches:\n", 1756 | " # Get the matched span\n", 1757 | " matched_span = doc[start:end]\n", 1758 | " print(matched_span.text)" 1759 | ] 1760 | }, 1761 | { 1762 | "cell_type": "markdown", 1763 | "metadata": {}, 1764 | "source": [ 1765 | "match_id: hash value of the pattern name\n", 1766 | "\n", 1767 | "start: start index of matched span\n", 1768 | "\n", 1769 | "end: end index of matched span" 1770 | ] 1771 | }, 1772 | { 1773 | "cell_type": "markdown", 1774 | "metadata": {}, 1775 | "source": [ 1776 | "### Matching lexical attributes\n", 1777 | "\n", 1778 | "Here's an example of a more complex pattern using lexical attributes.\n", 1779 | "\n", 1780 | "We're looking for five tokens:\n", 1781 | "\n", 1782 | "A token consisting of only digits.\n", 1783 | "\n", 1784 | "Three case-insensitive tokens for \"fifa\", \"world\" and \"cup\".\n", 1785 | "\n", 1786 | "And a token that consists of punctuation.\n", 1787 | "\n", 1788 | "The pattern matches the tokens \"2018 FIFA World Cup:\"." 1789 | ] 1790 | }, 1791 | { 1792 | "cell_type": "code", 1793 | "execution_count": 48, 1794 | "metadata": {}, 1795 | "outputs": [ 1796 | { 1797 | "name": "stdout", 1798 | "output_type": "stream", 1799 | "text": [ 1800 | "2018 FIFA World Cup:\n" 1801 | ] 1802 | } 1803 | ], 1804 | "source": [ 1805 | "pattern = [\n", 1806 | " {'IS_DIGIT': True},\n", 1807 | " {'LOWER': 'fifa'},\n", 1808 | " {'LOWER': 'world'},\n", 1809 | " {'LOWER': 'cup'},\n", 1810 | " {'IS_PUNCT': True}\n", 1811 | "]\n", 1812 | "\n", 1813 | "doc = nlp(\"2018 FIFA World Cup: France won!\")\n", 1814 | "\n", 1815 | "# Initialize the matcher with the shared vocab\n", 1816 | "matcher = Matcher(nlp.vocab)\n", 1817 | "\n", 1818 | "# Add the pattern to the matcher\n", 1819 | "matcher.add('FIFA', None, pattern)\n", 1820 | "\n", 1821 | "# Process some text\n", 1822 | "\n", 1823 | "# Call the matcher on the doc\n", 1824 | "matches = matcher(doc)\n", 1825 | "\n", 1826 | "# Iterate over the matches\n", 1827 | "for match_id, start, end in matches:\n", 1828 | " # Get the matched span\n", 1829 | " matched_span = doc[start:end]\n", 1830 | " print(matched_span.text)" 1831 | ] 1832 | }, 1833 | { 1834 | "cell_type": "markdown", 1835 | "metadata": {}, 1836 | "source": [ 1837 | "In this example, we're looking for two tokens:\n", 1838 | "\n", 1839 | "A verb with the lemma \"love\", followed by a noun.\n", 1840 | "\n", 1841 | "This pattern will match \"loved dogs\" and \"love cats\"." 
1842 | ] 1843 | }, 1844 | { 1845 | "cell_type": "code", 1846 | "execution_count": 49, 1847 | "metadata": {}, 1848 | "outputs": [ 1849 | { 1850 | "name": "stdout", 1851 | "output_type": "stream", 1852 | "text": [ 1853 | "loved dogs\n", 1854 | "love cats\n" 1855 | ] 1856 | } 1857 | ], 1858 | "source": [ 1859 | "pattern = [\n", 1860 | " {'LEMMA': 'love', 'POS': 'VERB'},\n", 1861 | " {'POS': 'NOUN'}\n", 1862 | "]\n", 1863 | "\n", 1864 | "doc = nlp(\"I loved dogs but now I love cats more.\")\n", 1865 | "\n", 1866 | "# Initialize the matcher with the shared vocab\n", 1867 | "matcher = Matcher(nlp.vocab)\n", 1868 | "\n", 1869 | "# Add the pattern to the matcher\n", 1870 | "matcher.add('PETS', None, pattern)\n", 1871 | "\n", 1872 | "# Process some text\n", 1873 | "\n", 1874 | "# Call the matcher on the doc\n", 1875 | "matches = matcher(doc)\n", 1876 | "\n", 1877 | "# Iterate over the matches\n", 1878 | "for match_id, start, end in matches:\n", 1879 | " # Get the matched span\n", 1880 | " matched_span = doc[start:end]\n", 1881 | " print(matched_span.text)" 1882 | ] 1883 | }, 1884 | { 1885 | "cell_type": "code", 1886 | "execution_count": 50, 1887 | "metadata": {}, 1888 | "outputs": [ 1889 | { 1890 | "name": "stdout", 1891 | "output_type": "stream", 1892 | "text": [ 1893 | "loved dogs\n", 1894 | "love cats\n" 1895 | ] 1896 | } 1897 | ], 1898 | "source": [ 1899 | "pattern = [\n", 1900 | " {'LEMMA': 'love', 'POS': 'VERB'},\n", 1901 | " {'POS': 'NOUN'}\n", 1902 | "]\n", 1903 | "\n", 1904 | "doc = nlp(\"I loved dogs. Now I love cats more.\")\n", 1905 | "\n", 1906 | "# Initialize the matcher with the shared vocab\n", 1907 | "matcher = Matcher(nlp.vocab)\n", 1908 | "\n", 1909 | "# Add the pattern to the matcher\n", 1910 | "matcher.add('PETS', None, pattern)\n", 1911 | "\n", 1912 | "# Process some text\n", 1913 | "\n", 1914 | "# Call the matcher on the doc\n", 1915 | "matches = matcher(doc)\n", 1916 | "\n", 1917 | "# Iterate over the matches\n", 1918 | "for match_id, start, end in matches:\n", 1919 | " # Get the matched span\n", 1920 | " matched_span = doc[start:end]\n", 1921 | " print(matched_span.text)" 1922 | ] 1923 | }, 1924 | { 1925 | "cell_type": "markdown", 1926 | "metadata": {}, 1927 | "source": [ 1928 | "### Using Operations and quantifiers\n", 1929 | "\n", 1930 | "Operators and quantifiers let you define how often a token should be matched. They can be added using the \"OP\" key.\n", 1931 | "\n", 1932 | "Here, the \"?\" operator makes the determiner token optional, so it will match a token with the lemma \"buy\", an optional article and a noun." 1933 | ] 1934 | }, 1935 | { 1936 | "cell_type": "code", 1937 | "execution_count": 51, 1938 | "metadata": {}, 1939 | "outputs": [ 1940 | { 1941 | "name": "stdout", 1942 | "output_type": "stream", 1943 | "text": [ 1944 | "bought a smartphone\n", 1945 | "buying apps\n" 1946 | ] 1947 | } 1948 | ], 1949 | "source": [ 1950 | "pattern = [\n", 1951 | " {'LEMMA': 'buy'},\n", 1952 | " {'POS': 'DET', 'OP': '?'}, # optional: match 0 or 1 times\n", 1953 | " {'POS': 'NOUN'}\n", 1954 | "]\n", 1955 | "\n", 1956 | "doc = nlp(\"I bought a smartphone. 
Now I'm buying apps.\")\n", 1957 | "\n", 1958 | "# Initialize the matcher with the shared vocab\n", 1959 | "matcher = Matcher(nlp.vocab)\n", 1960 | "\n", 1961 | "# Add the pattern to the matcher\n", 1962 | "matcher.add('CONSUME', None, pattern)\n", 1963 | "\n", 1964 | "# Process some text\n", 1965 | "\n", 1966 | "# Call the matcher on the doc\n", 1967 | "matches = matcher(doc)\n", 1968 | "\n", 1969 | "# Iterate over the matches\n", 1970 | "for match_id, start, end in matches:\n", 1971 | " # Get the matched span\n", 1972 | " matched_span = doc[start:end]\n", 1973 | " print(matched_span.text)" 1974 | ] 1975 | }, 1976 | { 1977 | "cell_type": "markdown", 1978 | "metadata": {}, 1979 | "source": [ 1980 | "\"OP\" can have one of four values:\n", 1981 | "\n", 1982 | "An \"!\" negates the token, so it's matched 0 times.\n", 1983 | "\n", 1984 | "A \"?\" makes the token optional, and matches it 0 or 1 times.\n", 1985 | "\n", 1986 | "A \"+\" matches a token 1 or more times.\n", 1987 | "\n", 1988 | "And finally, an \"*\" matches 0 or more times.\n", 1989 | "\n", 1990 | "Operators can make your patterns a lot more powerful, but they also add more complexity – so use them wisely." 1991 | ] 1992 | }, 1993 | { 1994 | "cell_type": "markdown", 1995 | "metadata": {}, 1996 | "source": [ 1997 | "{'OP': '!'}\tNegation: match 0 times\n", 1998 | "\n", 1999 | "{'OP': '?'}\tOptional: match 0 or 1 times\n", 2000 | "\n", 2001 | "{'OP': '+'}\tMatch 1 or more times\n", 2002 | "\n", 2003 | "{'OP': '*'}\tMatch 0 or more times" 2004 | ] 2005 | }, 2006 | { 2007 | "cell_type": "markdown", 2008 | "metadata": {}, 2009 | "source": [ 2010 | "## Using the matcher \n", 2011 | "\n", 2012 | "Let’s try spaCy’s rule-based Matcher. You’ll be using the example from the previous exercise and write a pattern that can match the phrase “iPhone X” in the text.\n", 2013 | "\n", 2014 | "Import the Matcher from spacy.matcher.\n", 2015 | "\n", 2016 | "Initialize it with the nlp object’s shared vocab.\n", 2017 | "\n", 2018 | "Create a pattern that matches the 'TEXT' values of two tokens: \"iPhone\" and \"X\".\n", 2019 | " \n", 2020 | "Use the matcher.add method to add the pattern to the matcher.\n", 2021 | "\n", 2022 | "Call the matcher on the doc and store the result in the variable matches.\n", 2023 | "\n", 2024 | "Iterate over the matches and get the matched span from the start to the end index.\n" 2025 | ] 2026 | }, 2027 | { 2028 | "cell_type": "code", 2029 | "execution_count": 52, 2030 | "metadata": {}, 2031 | "outputs": [ 2032 | { 2033 | "name": "stdout", 2034 | "output_type": "stream", 2035 | "text": [ 2036 | "New iPhone X release date leaked as Apple reveals pre-orders by mistake\n", 2037 | "Matches: ['iPhone X']\n" 2038 | ] 2039 | } 2040 | ], 2041 | "source": [ 2042 | "import spacy\n", 2043 | "\n", 2044 | "nlp = spacy.load(\"en_core_web_sm\")\n", 2045 | "doc = nlp(\"New iPhone X release date leaked as Apple reveals pre-orders by mistake\")\n", 2046 | "print(doc.text)\n", 2047 | "\n", 2048 | "# Import the Matcher\n", 2049 | "from spacy.matcher import Matcher\n", 2050 | "\n", 2051 | "# Initialize the Matcher with the shared vocabulary\n", 2052 | "matcher = Matcher(nlp.vocab)\n", 2053 | "\n", 2054 | "# Create a pattern matching two tokens: \"iPhone\" and \"X\"\n", 2055 | "pattern = [{'ORTH': 'iPhone'}, {'ORTH': 'X'}]\n", 2056 | "\n", 2057 | "# Add the pattern to the matcher\n", 2058 | "matcher.add('IPHONE_PATTERN', None, pattern)\n", 2059 | "\n", 2060 | "# Use the matcher on the doc\n", 2061 | "matches = matcher(doc)\n", 2062 | "\n", 2063 | 
"print(\"Matches:\", [doc[start:end].text for match_id, start, end in matches])" 2064 | ] 2065 | }, 2066 | { 2067 | "cell_type": "markdown", 2068 | "metadata": {}, 2069 | "source": [ 2070 | "## Writing Match Patterns\n", 2071 | "\n", 2072 | "In this exercise, you’ll practice writing more complex match patterns using different token attributes and operators.\n", 2073 | "\n", 2074 | "### Part 1\n", 2075 | "\n", 2076 | "Write one pattern that only matches mentions of the full iOS versions: “iOS 7”, “iOS 11” and “iOS 10”." 2077 | ] 2078 | }, 2079 | { 2080 | "cell_type": "code", 2081 | "execution_count": 53, 2082 | "metadata": {}, 2083 | "outputs": [ 2084 | { 2085 | "name": "stdout", 2086 | "output_type": "stream", 2087 | "text": [ 2088 | "Total matches found: 3\n", 2089 | "Match found: iOS 7\n", 2090 | "Match found: iOS 11\n", 2091 | "Match found: iOS 10\n" 2092 | ] 2093 | } 2094 | ], 2095 | "source": [ 2096 | "import spacy\n", 2097 | "from spacy.matcher import Matcher\n", 2098 | "\n", 2099 | "nlp = spacy.load(\"en_core_web_sm\")\n", 2100 | "matcher = Matcher(nlp.vocab)\n", 2101 | "\n", 2102 | "doc = nlp(\n", 2103 | " \"After making the iOS update you won't notice a radical system-wide \"\n", 2104 | " \"redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of \"\n", 2105 | " \"iOS 11's furniture remains the same as in iOS 10. But you will discover \"\n", 2106 | " \"some tweaks once you delve a little deeper.\"\n", 2107 | ")\n", 2108 | "\n", 2109 | "# Write a pattern for full iOS versions (\"iOS 7\", \"iOS 11\", \"iOS 10\")\n", 2110 | "pattern = [{\"TEXT\": 'iOS'}, {\"IS_DIGIT\": True}]\n", 2111 | "\n", 2112 | "# Add the pattern to the matcher and apply the matcher to the doc\n", 2113 | "matcher.add(\"IOS_VERSION_PATTERN\", None, pattern)\n", 2114 | "matches = matcher(doc)\n", 2115 | "print(\"Total matches found:\", len(matches))\n", 2116 | "\n", 2117 | "# Iterate over the matches and print the span text\n", 2118 | "for match_id, start, end in matches:\n", 2119 | " print(\"Match found:\", doc[start:end].text)" 2120 | ] 2121 | }, 2122 | { 2123 | "cell_type": "markdown", 2124 | "metadata": {}, 2125 | "source": [ 2126 | "### Part 2\n", 2127 | "\n", 2128 | "Write one pattern that only matches forms of “download” (tokens with the lemma “download”), followed by a token with the part-of-speech tag 'PROPN' (proper noun)." 2129 | ] 2130 | }, 2131 | { 2132 | "cell_type": "code", 2133 | "execution_count": 54, 2134 | "metadata": {}, 2135 | "outputs": [ 2136 | { 2137 | "name": "stdout", 2138 | "output_type": "stream", 2139 | "text": [ 2140 | "Total matches found: 3\n", 2141 | "Match found: downloaded Fortnite\n", 2142 | "Match found: downloading Minecraft\n", 2143 | "Match found: download Winzip\n" 2144 | ] 2145 | } 2146 | ], 2147 | "source": [ 2148 | "import spacy\n", 2149 | "from spacy.matcher import Matcher\n", 2150 | "\n", 2151 | "nlp = spacy.load(\"en_core_web_sm\")\n", 2152 | "matcher = Matcher(nlp.vocab)\n", 2153 | "\n", 2154 | "doc = nlp(\n", 2155 | " \"i downloaded Fortnite on my laptop and can't open the game at all. Help? \"\n", 2156 | " \"so when I was downloading Minecraft, I got the Windows version where it \"\n", 2157 | " \"is the '.zip' folder and I used the default program to unpack it... 
do \"\n", 2158 | " \"I also need to download Winzip?\"\n", 2159 | ")\n", 2160 | "\n", 2161 | "# Write a pattern that matches a form of \"download\" plus proper noun\n", 2162 | "pattern = [{\"LEMMA\": 'download'}, {\"POS\": 'PROPN'}]\n", 2163 | "\n", 2164 | "# Add the pattern to the matcher and apply the matcher to the doc\n", 2165 | "matcher.add(\"DOWNLOAD_THINGS_PATTERN\", None, pattern)\n", 2166 | "matches = matcher(doc)\n", 2167 | "print(\"Total matches found:\", len(matches))\n", 2168 | "\n", 2169 | "# Iterate over the matches and print the span text\n", 2170 | "for match_id, start, end in matches:\n", 2171 | " print(\"Match found:\", doc[start:end].text)" 2172 | ] 2173 | }, 2174 | { 2175 | "cell_type": "code", 2176 | "execution_count": 55, 2177 | "metadata": {}, 2178 | "outputs": [ 2179 | { 2180 | "name": "stdout", 2181 | "output_type": "stream", 2182 | "text": [ 2183 | "Total matches found: 0\n" 2184 | ] 2185 | } 2186 | ], 2187 | "source": [ 2188 | "import spacy\n", 2189 | "from spacy.matcher import Matcher\n", 2190 | "\n", 2191 | "nlp = spacy.load(\"pt_core_news_sm\")\n", 2192 | "matcher = Matcher(nlp.vocab)\n", 2193 | "\n", 2194 | "doc = nlp(\n", 2195 | " \"Eu baixei varios aplicativos na minha vida \"\n", 2196 | " \"mas quando eu baixo o fortinite sempre dá problemas \"\n", 2197 | " \"eu realmente nao sei mais o que fazer \"\n", 2198 | " \"Tem algum problema para baixar o winzip?\"\n", 2199 | ")\n", 2200 | "\n", 2201 | "# Write a pattern that matches a form of \"download\" plus proper noun\n", 2202 | "pattern = [{\"LEMMA\": 'baixar'}, {\"POS\": 'PROPN'}]\n", 2203 | "\n", 2204 | "# Add the pattern to the matcher and apply the matcher to the doc\n", 2205 | "matcher.add(\"DOWNLOAD_THINGS_PATTERN\", None, pattern)\n", 2206 | "matches = matcher(doc)\n", 2207 | "print(\"Total matches found:\", len(matches))\n", 2208 | "\n", 2209 | "# Iterate over the matches and print the span text\n", 2210 | "for match_id, start, end in matches:\n", 2211 | " print(\"Match found:\", doc[start:end].text)" 2212 | ] 2213 | }, 2214 | { 2215 | "cell_type": "markdown", 2216 | "metadata": {}, 2217 | "source": [ 2218 | "Part 3\n", 2219 | "\n", 2220 | "Write one pattern that matches adjectives ('ADJ') followed by one or two 'NOUN's (one noun and one optional noun)." 
2221 | ] 2222 | }, 2223 | { 2224 | "cell_type": "code", 2225 | "execution_count": 56, 2226 | "metadata": {}, 2227 | "outputs": [ 2228 | { 2229 | "name": "stdout", 2230 | "output_type": "stream", 2231 | "text": [ 2232 | "Total matches found: 4\n", 2233 | "Match found: beautiful design\n", 2234 | "Match found: smart search\n", 2235 | "Match found: automatic labels\n", 2236 | "Match found: optional voice responses\n" 2237 | ] 2238 | } 2239 | ], 2240 | "source": [ 2241 | "import spacy\n", 2242 | "from spacy.matcher import Matcher\n", 2243 | "\n", 2244 | "nlp = spacy.load(\"en_core_web_sm\")\n", 2245 | "matcher = Matcher(nlp.vocab)\n", 2246 | "\n", 2247 | "doc = nlp(\n", 2248 | " \"Features of the app include a beautiful design, smart search, automatic \"\n", 2249 | " \"labels and optional voice responses.\"\n", 2250 | ")\n", 2251 | "\n", 2252 | "# Write a pattern for adjective plus one or two nouns\n", 2253 | "pattern = [{\"POS\": \"ADJ\"}, {\"POS\": \"NOUN\"}, {\"POS\": \"NOUN\", \"OP\": \"?\"}]\n", 2254 | "\n", 2255 | "# Add the pattern to the matcher and apply the matcher to the doc\n", 2256 | "matcher.add(\"ADJ_NOUN_PATTERN\", None, pattern)\n", 2257 | "matches = matcher(doc)\n", 2258 | "print(\"Total matches found:\", len(matches))\n", 2259 | "\n", 2260 | "# Iterate over the matches and print the span text\n", 2261 | "for match_id, start, end in matches:\n", 2262 | " print(\"Match found:\", doc[start:end].text)" 2263 | ] 2264 | } 2265 | ], 2266 | "metadata": { 2267 | "kernelspec": { 2268 | "display_name": "Python 3", 2269 | "language": "python", 2270 | "name": "python3" 2271 | }, 2272 | "language_info": { 2273 | "codemirror_mode": { 2274 | "name": "ipython", 2275 | "version": 3 2276 | }, 2277 | "file_extension": ".py", 2278 | "mimetype": "text/x-python", 2279 | "name": "python", 2280 | "nbconvert_exporter": "python", 2281 | "pygments_lexer": "ipython3", 2282 | "version": "3.6.6" 2283 | }, 2284 | "toc": { 2285 | "base_numbering": 1, 2286 | "nav_menu": {}, 2287 | "number_sections": false, 2288 | "sideBar": false, 2289 | "skip_h1_title": false, 2290 | "title_cell": "Table of Contens", 2291 | "title_sidebar": "Contents", 2292 | "toc_cell": false, 2293 | "toc_position": { 2294 | "height": "calc(100% - 180px)", 2295 | "left": "10px", 2296 | "top": "150px", 2297 | "width": "332.807px" 2298 | }, 2299 | "toc_section_display": false, 2300 | "toc_window_display": false 2301 | } 2302 | }, 2303 | "nbformat": 4, 2304 | "nbformat_minor": 2 2305 | } 2306 | --------------------------------------------------------------------------------