├── NLP_notebook.ipynb ├── NLP_notebook_spacy.ipynb ├── README.md └── Spacy_Demo.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # how-to-solve-NLP 2 | 3 | Look at the [wiki](https://github.com/lapolonio/how-to-solve-NLP/wiki) 4 | -------------------------------------------------------------------------------- /Spacy_Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Spacy Demo.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "view-in-github", 20 | "colab_type": "text" 21 | }, 22 | "source": [ 23 | "[View in Colaboratory](https://colab.research.google.com/github/lapolonio/how-to-solve-NLP/blob/master/Spacy_Demo.ipynb)" 24 | ] 25 | }, 26 | { 27 | "metadata": { 28 | "id": "6wug493vfiZi", 29 | "colab_type": "code", 30 | "colab": { 31 | "base_uri": "https://localhost:8080/", 32 | "height": 1482 33 | }, 34 | "outputId": "0e054e3b-76ec-4d0a-badf-b5551aa1f3f8" 35 | }, 36 | "cell_type": "code", 37 | "source": [ 38 | "!pip install spacy\n", 39 | "!python -m spacy download en_core_web_sm\n", 40 | "# !python -m spacy download en\n", 41 | "# !python -m spacy download custom_ner_model" 42 | ], 43 | "execution_count": 2, 44 | "outputs": [ 45 | { 46 | "output_type": "stream", 47 | "text": [ 48 | "Collecting spacy\n", 49 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/3c/31/e60f88751e48851b002f78a35221d12300783d5a43d4ef12fbf10cca96c3/spacy-2.0.11.tar.gz (17.6MB)\n", 50 | "\u001b[K 100% |████████████████████████████████| 17.6MB 2.1MB/s \n", 51 | "\u001b[?25hRequirement already satisfied: numpy>=1.7 in /usr/local/lib/python3.6/dist-packages (from spacy) (1.14.3)\n", 52 | "Collecting murmurhash<0.29,>=0.28 (from spacy)\n", 53 | " Downloading https://files.pythonhosted.org/packages/5e/31/c8c1ecafa44db30579c8c457ac7a0f819e8b1dbc3e58308394fff5ff9ba7/murmurhash-0.28.0.tar.gz\n", 54 | "Collecting cymem<1.32,>=1.30 (from spacy)\n", 55 | " Downloading https://files.pythonhosted.org/packages/f8/9e/273fbea507de99166c11cd0cb3fde1ac01b5bc724d9a407a2f927ede91a1/cymem-1.31.2.tar.gz\n", 56 | "Collecting preshed<2.0.0,>=1.0.0 (from spacy)\n", 57 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/1b/ac/7c17b1fd54b60972785b646d37da2826311cca70842c011c4ff84fbe95e0/preshed-1.0.0.tar.gz (89kB)\n", 58 | "\u001b[K 100% |████████████████████████████████| 92kB 23.6MB/s \n", 59 | "\u001b[?25hCollecting thinc<6.11.0,>=6.10.1 (from spacy)\n", 60 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/55/fd/e9f36081e6f53699943381858848f3b4d759e0dd03c43b98807dde34c252/thinc-6.10.2.tar.gz (1.2MB)\n", 61 | "\u001b[K 100% |████████████████████████████████| 1.2MB 18.8MB/s \n", 62 | "\u001b[?25hCollecting plac<1.0.0,>=0.9.6 (from spacy)\n", 63 | " Downloading https://files.pythonhosted.org/packages/9e/9b/62c60d2f5bc135d2aa1d8c8a86aaf84edb719a59c7f11a4316259e61a298/plac-0.9.6-py2.py3-none-any.whl\n", 64 | "Collecting pathlib (from spacy)\n", 65 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ac/aa/9b065a76b9af472437a0059f77e8f962fe350438b927cb80184c32f075eb/pathlib-1.0.1.tar.gz (49kB)\n", 66 | "\u001b[K 100% |████████████████████████████████| 51kB 20.7MB/s \n", 67 | "\u001b[?25hCollecting ujson>=1.35 (from spacy)\n", 68 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/16/c4/79f3409bc710559015464e5f49b9879430d8f87498ecdc335899732e5377/ujson-1.35.tar.gz (192kB)\n", 69 | "\u001b[K 100% |████████████████████████████████| 194kB 25.0MB/s \n", 70 | "\u001b[?25hCollecting dill<0.3,>=0.2 (from spacy)\n", 71 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/91/a0/19d4d31dee064fc553ae01263b5c55e7fb93daff03a69debbedee647c5a0/dill-0.2.7.1.tar.gz (64kB)\n", 72 | "\u001b[K 100% |████████████████████████████████| 71kB 22.8MB/s \n", 73 | "\u001b[?25hCollecting regex==2017.4.5 (from spacy)\n", 74 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/36/62/c0c0d762ffd4ffaf39f372eb8561b8d491a11ace5a7884610424a8b40f95/regex-2017.04.05.tar.gz (601kB)\n", 75 | "\u001b[K 100% |████████████████████████████████| 604kB 25.1MB/s \n", 76 | "\u001b[?25hCollecting wrapt (from thinc<6.11.0,>=6.10.1->spacy)\n", 77 | " Downloading https://files.pythonhosted.org/packages/a0/47/66897906448185fcb77fc3c2b1bc20ed0ecca81a0f2f88eda3fc5a34fc3d/wrapt-1.10.11.tar.gz\n", 78 | "Collecting tqdm<5.0.0,>=4.10.0 (from thinc<6.11.0,>=6.10.1->spacy)\n", 79 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/93/24/6ab1df969db228aed36a648a8959d1027099ce45fad67532b9673d533318/tqdm-4.23.4-py2.py3-none-any.whl (42kB)\n", 80 | "\u001b[K 100% |████████████████████████████████| 51kB 11.9MB/s \n", 81 | "\u001b[?25hCollecting cytoolz<0.9,>=0.8 (from thinc<6.11.0,>=6.10.1->spacy)\n", 82 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/0f/e6/ccc124714dcc1bd511e64ddafb4d5d20ada2533b92e3173a4cf09e0d0831/cytoolz-0.8.2.tar.gz (386kB)\n", 83 | "\u001b[K 100% |████████████████████████████████| 389kB 25.8MB/s \n", 84 | "\u001b[?25hRequirement already satisfied: six<2.0.0,>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from thinc<6.11.0,>=6.10.1->spacy) (1.11.0)\n", 85 | "Requirement already satisfied: termcolor in /usr/local/lib/python3.6/dist-packages (from thinc<6.11.0,>=6.10.1->spacy) (1.1.0)\n", 86 | "Collecting msgpack-python (from thinc<6.11.0,>=6.10.1->spacy)\n", 87 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/8a/20/6eca772d1a5830336f84aca1d8198e5a3f4715cd1c7fc36d3cc7f7185091/msgpack-python-0.5.6.tar.gz (138kB)\n", 88 | "\u001b[K 100% |████████████████████████████████| 143kB 26.8MB/s \n", 89 | "\u001b[?25hCollecting msgpack-numpy==0.4.1 (from thinc<6.11.0,>=6.10.1->spacy)\n", 90 | " Downloading https://files.pythonhosted.org/packages/2e/43/393e30e2768b0357541ac95891f96b80ccc4d517e0dd2fa3042fc8926538/msgpack_numpy-0.4.1-py2.py3-none-any.whl\n", 91 | "Requirement already satisfied: toolz>=0.8.0 in /usr/local/lib/python3.6/dist-packages (from cytoolz<0.9,>=0.8->thinc<6.11.0,>=6.10.1->spacy) (0.9.0)\n", 92 | "Building wheels for collected packages: spacy, murmurhash, cymem, preshed, thinc, pathlib, ujson, dill, regex, wrapt, cytoolz, msgpack-python\n", 93 | " Running setup.py bdist_wheel for spacy ... \u001b[?25l-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/" 94 | ], 95 | "name": "stdout" 96 | }, 97 | { 98 | "output_type": "stream", 99 | "text": [ 100 | "\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \bdone\n", 101 | "\u001b[?25h Stored in directory: /content/.cache/pip/wheels/fb/00/28/75c85d5135e7d9a100639137d1847d41e914ed16c962d467e4\n", 102 | " Running setup.py bdist_wheel for murmurhash ... \u001b[?25l-\b \b\\\b \b|\b \b/\b \bdone\n", 103 | "\u001b[?25h Stored in directory: /content/.cache/pip/wheels/b8/94/a4/f69f8664cdc1098603df44771b7fec5fd1b3d8364cdd83f512\n", 104 | " Running setup.py bdist_wheel for cymem ... \u001b[?25l-\b \b\\\b \bdone\n", 105 | "\u001b[?25h Stored in directory: /content/.cache/pip/wheels/55/8d/4a/f6328252aa2aaec0b1cb906fd96a1566d77f0f67701071ad13\n", 106 | " Running setup.py bdist_wheel for preshed ... \u001b[?25l-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \bdone\n", 107 | "\u001b[?25h Stored in directory: /content/.cache/pip/wheels/8f/85/06/2d132fb649a6bbcab22487e4147880a55b0dd0f4b18fdfd6b5\n", 108 | " Running setup.py bdist_wheel for thinc ... \u001b[?25l-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \bdone\n", 109 | "\u001b[?25h Stored in directory: /content/.cache/pip/wheels/d8/5c/3e/9acf5d9974fb1c9e7b467563ea5429c9325f67306e93147961\n", 110 | " Running setup.py bdist_wheel for pathlib ... \u001b[?25l-\b \bdone\n", 111 | "\u001b[?25h Stored in directory: /content/.cache/pip/wheels/f9/b2/4a/68efdfe5093638a9918bd1bb734af625526e849487200aa171\n", 112 | " Running setup.py bdist_wheel for ujson ... \u001b[?25l-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \bdone\n", 113 | "\u001b[?25h Stored in directory: /content/.cache/pip/wheels/28/77/e4/0311145b9c2e2f01470e744855131f9e34d6919687550f87d1\n", 114 | " Running setup.py bdist_wheel for dill ... \u001b[?25l-\b \bdone\n", 115 | "\u001b[?25h Stored in directory: /content/.cache/pip/wheels/99/c4/ed/1b64d2d5809e60d5a3685530432f6159d6a9959739facb61f2\n", 116 | " Running setup.py bdist_wheel for regex ... \u001b[?25l-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \bdone\n", 117 | "\u001b[?25h Stored in directory: /content/.cache/pip/wheels/75/07/38/3c16b529d50cb4e0cd3dbc7b75cece8a09c132692c74450b01\n", 118 | " Running setup.py bdist_wheel for wrapt ... \u001b[?25l-\b \b\\\b \bdone\n", 119 | "\u001b[?25h Stored in directory: /content/.cache/pip/wheels/48/5d/04/22361a593e70d23b1f7746d932802efe1f0e523376a74f321e\n", 120 | " Running setup.py bdist_wheel for cytoolz ... \u001b[?25l-\b \b\\\b \b|\b \b/\b \b-" 121 | ], 122 | "name": "stdout" 123 | }, 124 | { 125 | "output_type": "stream", 126 | "text": [ 127 | "\b \b\\\b \b|\b \b/\b \bdone\n", 128 | "\u001b[?25h Stored in directory: /content/.cache/pip/wheels/f8/b1/86/c92e4d36b690208fff8471711b85eaa6bc6d19860a86199a09\n", 129 | " Running setup.py bdist_wheel for msgpack-python ... \u001b[?25l-\b \b\\\b \b|\b \b/\b \bdone\n", 130 | "\u001b[?25h Stored in directory: /content/.cache/pip/wheels/d5/de/86/7fa56fda12511be47ea0808f3502bc879df4e63ab168ec0406\n", 131 | "Successfully built spacy murmurhash cymem preshed thinc pathlib ujson dill regex wrapt cytoolz msgpack-python\n", 132 | "Installing collected packages: murmurhash, cymem, preshed, wrapt, tqdm, cytoolz, plac, dill, pathlib, msgpack-python, msgpack-numpy, thinc, ujson, regex, spacy\n", 133 | "Successfully installed cymem-1.31.2 cytoolz-0.8.2 dill-0.2.7.1 msgpack-numpy-0.4.1 msgpack-python-0.5.6 murmurhash-0.28.0 pathlib-1.0.1 plac-0.9.6 preshed-1.0.0 regex-2017.4.5 spacy-2.0.11 thinc-6.10.2 tqdm-4.23.4 ujson-1.35 wrapt-1.10.11\n", 134 | "Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz\n", 135 | "\u001b[?25l Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)\n", 136 | "\u001b[K 100% |████████████████████████████████| 37.4MB 64.8MB/s \n", 137 | "\u001b[?25hInstalling collected packages: en-core-web-sm\n", 138 | " Running setup.py install for en-core-web-sm ... \u001b[?25l-\b \b\\\b \b|\b \bdone\n", 139 | "\u001b[?25hSuccessfully installed en-core-web-sm-2.0.0\n", 140 | "\n", 141 | "\u001b[93m Linking successful\u001b[0m\n", 142 | " /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->\n", 143 | " /usr/local/lib/python3.6/dist-packages/spacy/data/en_core_web_sm\n", 144 | "\n", 145 | " You can now load the model via spacy.load('en_core_web_sm')\n", 146 | "\n" 147 | ], 148 | "name": "stdout" 149 | } 150 | ] 151 | }, 152 | { 153 | "metadata": { 154 | "id": "GSPoSml7YaMn", 155 | "colab_type": "code", 156 | "colab": {} 157 | }, 158 | "cell_type": "code", 159 | "source": [ 160 | "import spacy\n", 161 | "\n", 162 | "nlp = spacy.load('en_core_web_sm')\n", 163 | "doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')\n", 164 | "\n", 165 | "for token in doc:\n", 166 | " print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,\n", 167 | " token.shape_, token.is_alpha, token.is_stop)" 168 | ], 169 | "execution_count": 0, 170 | "outputs": [] 171 | }, 172 | { 173 | "metadata": { 174 | "id": "4D1BhA0cYtj8", 175 | "colab_type": "code", 176 | "colab": { 177 | "base_uri": "https://localhost:8080/", 178 | "height": 87 179 | }, 180 | "outputId": "63212f3f-a786-4e5e-be62-4ad17219d1be" 181 | }, 182 | "cell_type": "code", 183 | "source": [ 184 | "import spacy\n", 185 | "from spacy import displacy\n", 186 | "\n", 187 | "text = \"\"\"But Google is starting from behind. The company made a late push\n", 188 | "into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa\n", 189 | "software, which runs on its Echo and Dot devices, have clear leads in\n", 190 | "consumer adoption.\"\"\"\n", 191 | "\n", 192 | "nlp = spacy.load('en_core_web_sm')\n", 193 | "doc = nlp(text)\n", 194 | "displacy.render(doc, style='ent', jupyter=True)" 195 | ], 196 | "execution_count": 5, 197 | "outputs": [ 198 | { 199 | "output_type": "display_data", 200 | "data": { 201 | "text/html": [ 202 | "