├── .gitignore ├── LICENSE ├── README.md ├── docs ├── Graph NLU- Natural Language Understanding with Python and Neo4j.pdf └── IPA_Memory_Dan_Kondratyuk_2017.04.30.pdf ├── notebooks ├── babi_dialogue_ridge.ipynb ├── dynamic_memory_1.ipynb ├── dynamic_memory_2.ipynb ├── dynamic_memory_3.ipynb ├── dynamic_memory_4.ipynb ├── resources │ ├── qa1_single-supporting-fact_test.txt │ ├── qa1_single-supporting-fact_train.txt │ ├── qa2_two-supporting-facts_test.txt │ ├── qa2_two-supporting-facts_train.txt │ ├── qa3_three-supporting-facts_test.txt │ ├── qa3_three-supporting-facts_train.txt │ ├── qa6_yes-no-questions_test.txt │ ├── qa6_yes-no-questions_train.txt │ ├── restaurants_props.pkl │ └── utts_refs.pkl └── screenshots │ ├── dialog-system.png │ ├── global-and-local-list.png │ ├── local-list.png │ ├── mary-john-example.png │ ├── prezzo.png │ ├── qa2-multiple-list.png │ ├── simple-relation.png │ ├── state-graph-1.png │ ├── state-graph-2.png │ └── v4-mary.png └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Vim template 3 | # swap 4 | [._]*.s[a-w][a-z] 5 | [._]s[a-w][a-z] 6 | # session 7 | Session.vim 8 | # temporary 9 | .netrwhist 10 | *~ 11 | # auto-generated tag files 12 | tags 13 | 14 | ### Java template 15 | *.class 16 | 17 | # Mobile Tools for Java (J2ME) 18 | .mtj.tmp/ 19 | 20 | # Package Files # 21 | *.jar 22 | *.war 23 | *.ear 24 | 25 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 26 | hs_err_pid* 27 | 28 | ### Maven template 29 | target/ 30 | pom.xml.tag 31 | pom.xml.releaseBackup 32 | pom.xml.versionsBackup 33 | pom.xml.next 34 | release.properties 35 | dependency-reduced-pom.xml 36 | buildNumber.properties 37 | .mvn/timing.properties 38 | 39 | ### VisualStudioCode template 40 | .vscode 41 | 42 | ### Gradle template 43 | .gradle 44 | build/ 45 | 46 | # Ignore Gradle GUI config 47 | 
gradle-app.setting 48 | 49 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 50 | !gradle-wrapper.jar 51 | 52 | # Cache of project 53 | .gradletasknamecache 54 | 55 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 56 | # gradle/wrapper/gradle-wrapper.properties 57 | 58 | ### JetBrains template 59 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 60 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 61 | 62 | # User-specific stuff: 63 | .idea/workspace.xml 64 | .idea/tasks.xml 65 | .idea/dictionaries 66 | .idea/vcs.xml 67 | .idea/jsLibraryMappings.xml 68 | 69 | # Sensitive or high-churn files: 70 | .idea/dataSources.ids 71 | .idea/dataSources.xml 72 | .idea/dataSources.local.xml 73 | .idea/sqlDataSources.xml 74 | .idea/dynamic.xml 75 | .idea/uiDesigner.xml 76 | 77 | # Gradle: 78 | .idea/gradle.xml 79 | .idea/libraries 80 | 81 | # Mongo Explorer plugin: 82 | .idea/mongoSettings.xml 83 | 84 | ## File-based project format: 85 | *.iws 86 | 87 | ## Plugin-specific files: 88 | 89 | # IntelliJ 90 | /out/ 91 | .idea 92 | 93 | # mpeltonen/sbt-idea plugin 94 | .idea_modules/ 95 | 96 | # JIRA plugin 97 | atlassian-ide-plugin.xml 98 | 99 | # Crashlytics plugin (for Android Studio and IntelliJ) 100 | com_crashlytics_export_strings.xml 101 | crashlytics.properties 102 | crashlytics-build.properties 103 | fabric.properties 104 | 105 | ### Windows template 106 | # Windows image file caches 107 | Thumbs.db 108 | ehthumbs.db 109 | 110 | # Folder config file 111 | Desktop.ini 112 | 113 | # Recycle Bin used on file shares 114 | $RECYCLE.BIN/ 115 | 116 | # Windows Installer files 117 | *.cab 118 | *.msi 119 | *.msm 120 | *.msp 121 | 122 | # Windows shortcuts 123 | *.lnk 124 | 125 | ### SublimeText template 126 | # cache files for sublime text 127 | *.tmlanguage.cache 128 | *.tmPreferences.cache 129 | *.stTheme.cache 130 | 131 | # workspace files are 
user-specific 132 | *.sublime-workspace 133 | 134 | # project files should be checked into the repository, unless a significant 135 | # proportion of contributors will probably not be using SublimeText 136 | # *.sublime-project 137 | 138 | # sftp configuration file 139 | sftp-config.json 140 | 141 | # Package control specific files 142 | Package Control.last-run 143 | Package Control.ca-list 144 | Package Control.ca-bundle 145 | Package Control.system-ca-bundle 146 | Package Control.cache/ 147 | Package Control.ca-certs/ 148 | bh_unicode_properties.cache 149 | 150 | # Sublime-github package stores a github token in this file 151 | # https://packagecontrol.io/packages/sublime-github 152 | GitHub.sublime-settings 153 | 154 | ### Linux template 155 | # temporary files which can be created if a process still has a handle open of a deleted file 156 | .fuse_hidden* 157 | 158 | # KDE directory preferences 159 | .directory 160 | 161 | # Linux trash folder which might appear on any partition or disk 162 | .Trash-* 163 | 164 | ### Eclipse template 165 | 166 | .metadata 167 | bin/ 168 | tmp/ 169 | *.tmp 170 | *.bak 171 | *.swp 172 | *~.nib 173 | local.properties 174 | .settings/ 175 | .loadpath 176 | .recommenders 177 | 178 | # Eclipse Core 179 | .project 180 | 181 | # External tool builders 182 | .externalToolBuilders/ 183 | 184 | # Locally stored "Eclipse launch configurations" 185 | *.launch 186 | 187 | # PyDev specific (Python IDE for Eclipse) 188 | *.pydevproject 189 | 190 | # CDT-specific (C/C++ Development Tooling) 191 | .cproject 192 | 193 | # JDT-specific (Eclipse Java Development Tools) 194 | .classpath 195 | 196 | # Java annotation processor (APT) 197 | .factorypath 198 | 199 | # PDT-specific (PHP Development Tools) 200 | .buildpath 201 | 202 | # sbteclipse plugin 203 | .target 204 | 205 | # Tern plugin 206 | .tern-project 207 | 208 | # TeXlipse plugin 209 | .texlipse 210 | 211 | # STS (Spring Tool Suite) 212 | .springBeans 213 | 214 | # Code Recommenders 215 | 
.recommenders/ 216 | 217 | ### Project Specific 218 | 219 | .ipynb_checkpoints -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Dan Kondratyuk 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Graph NLU 2 | 3 | [![Preview Image](/notebooks/screenshots/qa2-multiple-list.png)](notebooks/dynamic_memory_1.ipynb) 4 | 5 | ## Motivation :bar_chart: 6 | 7 | Graph NLU uses graph databases as a means to represent natural language relationships flexibly and dynamically. 
8 | 9 | The primary motivation for this project is to develop a way to understand natural language dialog in an interactive setting by remembering previous dialog states. [Virtual assistants](https://en.wikipedia.org/wiki/Virtual_assistant_(artificial_intelligence)) like Siri, Google Assistant, and Alexa have the common problem that they behave like amnesiacs, i.e., they do not remember much about previous interactions. 10 | 11 | One proposal to get around the memory problem is by representing the previous dialog states using a persistent graph. Because graphs offer a powerful and interpretable way of encoding high-level representations of entities and their associated relationships, an attractive proposition is to leverage them in processing natural language. Graph databases (e.g., [Neo4j](https://neo4j.com/)) offer a rich suite of tools to quickly construct such graphs and persist them over the long term. 12 | 13 | This project is in its research phase, hence all code in this repository is exploratory. The supplied Jupyter (iPython) notebooks do the following: 14 | 15 | 1. Examine several dialog domains 16 | 1. Explain some of the design considerations for using graphs to process natural language 17 | 1. Define models for solving a dialog domain 18 | 1. Evaluate these models for accuracy and usefulness 19 | 20 | Explanations behind each code snippet are given where possible. [Read the research paper (PDF)](docs/IPA_Memory_Dan_Kondratyuk_2017.04.30.pdf) discussing a more detailed approach to the personal assistant memory problem. 
21 | 22 | ## Getting Started :traffic_light: 23 | 24 | - [Video talk overview (YouTube)](https://www.youtube.com/watch?v=mTCqQ2e08Q8) 25 | - [Video talk slides (PDF)](docs/Graph%20NLU-%20Natural%20Language%20Understanding%20with%20Python%20and%20Neo4j.pdf) 26 | 27 | Get an introduction to this project by viewing the supplied Jupyter notebooks in GitHub under the `notebooks` directory: 28 | 29 | - [dynamic_memory_1](notebooks/dynamic_memory_1.ipynb) - Evaluates the bAbI QA tasks using Neo4j queries 30 | 31 | ## Running the Code :snake: 32 | 33 | The Python code uses the Neo4j graph database to store and query natural language relationships. In addition, several processing steps will require popular Python data processing tools like `pandas`, `numpy`, `sklearn`, and `nltk`. 34 | 35 | ### Prerequisites 36 | 37 | 1. Make sure these are on your system: 38 | 39 | - [Python](https://www.python.org/downloads/) (3.5+) 40 | - [Neo4j](https://neo4j.com/download/community-edition/) (3.1+) 41 | 42 | 2. Install the python packages in `requirements.txt` if you don't have them already. 43 | 44 | ```bash 45 | pip install -r ./requirements.txt 46 | ``` 47 | 48 | ### Running Jupyter Notebooks 49 | 50 | 3. Clone the repository. 51 | 52 | ```bash 53 | git clone https://github.com/Hyperparticle/graph-nlu.git 54 | cd ./graph-nlu/notebooks 55 | ``` 56 | 57 | 4. Run the iPython notebooks with Jupyter. 58 | 59 | ```bash 60 | jupyter notebook 61 | ``` 62 | 63 | 5. Get an introduction to the project with [dynamic_memory_1](notebooks/dynamic_memory_1.ipynb). 64 | 65 | ## Contributing :mega: 66 | 67 | Interested in the project? We'd love to hear your ideas! Open a [GitHub issue](https://github.com/Hyperparticle/graph-nlu/issues) with your comments. 68 | 69 | ## About :clipboard: 70 | 71 | Created by [Dan Kondratyuk](https://hyperparticle.com/about/), a member of [Speech, Language & Interactive Machines (SLIM)](http://coen.boisestate.edu/slim/) at Boise State University. 
72 | -------------------------------------------------------------------------------- /docs/Graph NLU- Natural Language Understanding with Python and Neo4j.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hyperparticle/graph-nlu/2aa7ef3ce67e4dadd5d1b89b9d7bf40d3d53d9fc/docs/Graph NLU- Natural Language Understanding with Python and Neo4j.pdf -------------------------------------------------------------------------------- /docs/IPA_Memory_Dan_Kondratyuk_2017.04.30.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hyperparticle/graph-nlu/2aa7ef3ce67e4dadd5d1b89b9d7bf40d3d53d9fc/docs/IPA_Memory_Dan_Kondratyuk_2017.04.30.pdf -------------------------------------------------------------------------------- /notebooks/dynamic_memory_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Memory Representation in Dialogue Systems\n", 11 | "\n", 12 | "The following notebook is the result of an NLP project that explores the question, \"How could interaction be stored in memory, and how can that information be leveraged for further use?\" \n", 13 | "\n", 14 | "[Dialog systems](https://en.wikipedia.org/wiki/Dialog_system) can be quite useful, but have difficulty keeping track of concepts and entities dynamically. Commercial implementations among the likes of Siri, Google Assistant, and Alexa are great for performing simple tasks, but fall short when remembering ad-hoc relationships that regularly present themselves in conversation. 
For more information on dialogue systems, graph databases, and ontologies as they relate to this project, see the white paper entitled [IPA_Memory](/files/docs/IPA_Memory_Dan_Kondratyuk_2017.04.30.pdf) under the `docs` directory of this repository.\n", 15 | "\n", 16 | "To enhance the capabilities of dialogue systems, this notebook will provide a simple software implementation of a model that is intended to be dynamic, incremental, flexible, and interpretable. By forming high-level concepts that evolve over time, this model will evaluate the dialogue system's ability to understand user input. This notebook will show how such a system can update its internal state based on natural language facts, and retrieve results based on natural language questions. See the white paper for more details on the rationale behind these design decisions.\n", 17 | "\n", 18 | "The code below is written in Python, and uses a [Neo4j Graph Database](https://neo4j.com/product/) to provide non-volatile storage and efficient querying capabilities.\n", 19 | "\n", 20 | "The test corpus is supplied by the [bAbI Tasks Data 1-20 (v1.2)](https://research.fb.com/downloads/babi/). It contains sequences of English sentences to provide the system knowledge of a simple domain involving characters moving to different rooms and interacting with objects. 
Questions are inserted periodically to evaluate that the system is keeping track of these relationships accurately.\n", 21 | "\n", 22 | "## Prerequisites to Running this Notebook\n", 23 | "- [Python](https://www.python.org/downloads/) (3.5+)\n", 24 | "- Python packages (install via pip): `pandas`, `numpy`, `nltk`, `scikit-learn`, `neo4j-driver`\n", 25 | "- [Neo4j](https://neo4j.com/download/) (3.1+)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "deletable": true, 32 | "editable": true 33 | }, 34 | "source": [ 35 | "# Part 1: bAbI QA 1\n", 36 | "\n", 37 | "## Process the Text\n", 38 | "\n", 39 | "### Import DataFrames\n", 40 | "First we will use `pandas` to import `qa1_single-supporting-fact_train.txt` from our corpus into a DataFrame called `data`. Every line in this document represents one sentence, which will be split using `nltk`'s word tokenizer." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 1, 46 | "metadata": { 47 | "collapsed": true, 48 | "deletable": true, 49 | "editable": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "# Import the necessary packages\n", 54 | "import pandas as pd\n", 55 | "import numpy as np\n", 56 | "import nltk\n", 57 | "from sklearn.metrics import accuracy_score" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": { 64 | "collapsed": false, 65 | "deletable": true, 66 | "editable": true 67 | }, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml\n" 74 | ] 75 | }, 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "True" 80 | ] 81 | }, 82 | "execution_count": 2, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "# Download NLTK packages\n", 89 | "# An OS window should pop up for you to download the appropriate packages\n", 90 | "# Select all-nltk and click on the 
download button. Once download has finished exit the window and continue.\n", 91 | "nltk.download()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 3, 97 | "metadata": { 98 | "collapsed": false, 99 | "deletable": true, 100 | "editable": true 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "# Read the bAbI data as CSV\n", 105 | "filename = 'resources/qa1_single-supporting-fact_train.txt'\n", 106 | "data_qa1 = pd.read_csv(filename, delimiter='\\t', names=['sentence', 'answer', 'factid'])\n", 107 | "data_qa1 = data_qa1.fillna('')" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": { 113 | "deletable": true, 114 | "editable": true 115 | }, 116 | "source": [ 117 | "The cell below shows what the input data looks like. Every `sentence` in this frame can either be a factual statement, or a question about the preceeding statements. Each statement describes four characters moving between six different rooms. The questions periodically ask the room in which a person is currently in, and the objective is to answer them all correctly, matching the corresponding `answer` column (it is blank if the sentence is a statement). The `factid` column indicates the index of the supporting facts for each answer, but we won't be needing it.\n", 118 | "\n", 119 | "Due to the nature of the model, training will not be necessary to answer each question. Therefore, the entire document will be used for test evaluation." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 4, 125 | "metadata": { 126 | "collapsed": false, 127 | "deletable": true, 128 | "editable": true 129 | }, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/html": [ 134 | "
\n", 135 | "\n", 148 | "\n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | "
sentenceanswerfactid
01 Mary moved to the bathroom.
12 John went to the hallway.
23 Where is Mary?bathroom1
34 Daniel went back to the hallway.
45 Sandra moved to the garden.
56 Where is Daniel?hallway4
\n", 196 | "
" 197 | ], 198 | "text/plain": [ 199 | " sentence answer factid\n", 200 | "0 1 Mary moved to the bathroom. \n", 201 | "1 2 John went to the hallway. \n", 202 | "2 3 Where is Mary? bathroom 1\n", 203 | "3 4 Daniel went back to the hallway. \n", 204 | "4 5 Sandra moved to the garden. \n", 205 | "5 6 Where is Daniel? hallway 4" 206 | ] 207 | }, 208 | "execution_count": 4, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "data_qa1[:6]" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": { 220 | "deletable": true, 221 | "editable": true 222 | }, 223 | "source": [ 224 | "Next, we process this data frame by splitting the sentences and tagging each sentence by its type (statement or question). " 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 5, 230 | "metadata": { 231 | "collapsed": false, 232 | "deletable": true, 233 | "editable": true 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "# Tag each sentence as a statement (S) or question (Q)\n", 238 | "tag_sentence = lambda row: 'S' if row.answer == '' else 'Q'\n", 239 | "data_qa1['type'] = data_qa1.apply(tag_sentence, axis=1)\n", 240 | "\n", 241 | "# Use NLTK to tokenize the sentences into arrays of words\n", 242 | "# If you get an error here, make sure you have downloaded the NLTK packages above\n", 243 | "tokenize = lambda row: nltk.word_tokenize(row.sentence)[1:]\n", 244 | "data_qa1.sentence = data_qa1.apply(tokenize, axis=1)\n", 245 | "\n", 246 | "# Drop the factid column, as we won't need it\n", 247 | "data_qa1 = data_qa1.drop('factid', axis=1)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 6, 253 | "metadata": { 254 | "collapsed": false, 255 | "deletable": true, 256 | "editable": true 257 | }, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/html": [ 262 | "
\n", 263 | "\n", 276 | "\n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | "
sentenceanswertype
0[Mary, moved, to, the, bathroom, .]S
1[John, went, to, the, hallway, .]S
2[Where, is, Mary, ?]bathroomQ
3[Daniel, went, back, to, the, hallway, .]S
4[Sandra, moved, to, the, garden, .]S
5[Where, is, Daniel, ?]hallwayQ
\n", 324 | "
" 325 | ], 326 | "text/plain": [ 327 | " sentence answer type\n", 328 | "0 [Mary, moved, to, the, bathroom, .] S\n", 329 | "1 [John, went, to, the, hallway, .] S\n", 330 | "2 [Where, is, Mary, ?] bathroom Q\n", 331 | "3 [Daniel, went, back, to, the, hallway, .] S\n", 332 | "4 [Sandra, moved, to, the, garden, .] S\n", 333 | "5 [Where, is, Daniel, ?] hallway Q" 334 | ] 335 | }, 336 | "execution_count": 6, 337 | "metadata": {}, 338 | "output_type": "execute_result" 339 | } 340 | ], 341 | "source": [ 342 | "data_qa1[:6]" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": { 348 | "deletable": true, 349 | "editable": true 350 | }, 351 | "source": [ 352 | "We further split the `data_qa1` DataFrame into `statements` and `questions` DataFrames for easy access to all statements and questions respectively." 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 7, 358 | "metadata": { 359 | "collapsed": true, 360 | "deletable": true, 361 | "editable": true 362 | }, 363 | "outputs": [], 364 | "source": [ 365 | "# Create a DataFrame with just the statements\n", 366 | "def statements(df):\n", 367 | " return df[df.type == 'S'] \\\n", 368 | " .reset_index(drop=True) \\\n", 369 | " .drop('answer', axis=1) \\\n", 370 | " .drop('type', axis=1)\n", 371 | "\n", 372 | "# Create a DataFrame with just the questions\n", 373 | "def questions(df):\n", 374 | " return df[df.type == 'Q'] \\\n", 375 | " .reset_index(drop=True) \\\n", 376 | " .drop('type', axis=1)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 8, 382 | "metadata": { 383 | "collapsed": false, 384 | "deletable": true, 385 | "editable": true 386 | }, 387 | "outputs": [ 388 | { 389 | "data": { 390 | "text/html": [ 391 | "
\n", 392 | "\n", 405 | "\n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | "
sentence
0[Mary, moved, to, the, bathroom, .]
1[John, went, to, the, hallway, .]
2[Daniel, went, back, to, the, hallway, .]
3[Sandra, moved, to, the, garden, .]
\n", 431 | "
" 432 | ], 433 | "text/plain": [ 434 | " sentence\n", 435 | "0 [Mary, moved, to, the, bathroom, .]\n", 436 | "1 [John, went, to, the, hallway, .]\n", 437 | "2 [Daniel, went, back, to, the, hallway, .]\n", 438 | "3 [Sandra, moved, to, the, garden, .]" 439 | ] 440 | }, 441 | "execution_count": 8, 442 | "metadata": {}, 443 | "output_type": "execute_result" 444 | } 445 | ], 446 | "source": [ 447 | "statements(data_qa1)[:4]" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 9, 453 | "metadata": { 454 | "collapsed": false, 455 | "deletable": true, 456 | "editable": true 457 | }, 458 | "outputs": [ 459 | { 460 | "data": { 461 | "text/html": [ 462 | "
\n", 463 | "\n", 476 | "\n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | "
sentenceanswer
0[Where, is, Mary, ?]bathroom
1[Where, is, Daniel, ?]hallway
\n", 497 | "
" 498 | ], 499 | "text/plain": [ 500 | " sentence answer\n", 501 | "0 [Where, is, Mary, ?] bathroom\n", 502 | "1 [Where, is, Daniel, ?] hallway" 503 | ] 504 | }, 505 | "execution_count": 9, 506 | "metadata": {}, 507 | "output_type": "execute_result" 508 | } 509 | ], 510 | "source": [ 511 | "questions(data_qa1)[:2]" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": { 517 | "deletable": true, 518 | "editable": true 519 | }, 520 | "source": [ 521 | "### Extract Entities\n", 522 | "Next, we will extract the relevant entities from each statement and question so that we can more easily reason with these sentences.\n", 523 | "\n", 524 | "#### POS Tagging\n", 525 | "To process each sentence and produce a useful statement or question object, all that is necessary (for this dataset) is to use a part-of-speech tagger. The generated frame below displays the tagged list of (token, word) pairs." 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 10, 531 | "metadata": { 532 | "collapsed": false, 533 | "deletable": true, 534 | "editable": true 535 | }, 536 | "outputs": [], 537 | "source": [ 538 | "# Tag each token as a part of speech\n", 539 | "pos_tag = lambda row: nltk.pos_tag(row.sentence)\n", 540 | "data_qa1['tag'] = data_qa1.apply(pos_tag, axis=1)" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 11, 546 | "metadata": { 547 | "collapsed": false, 548 | "deletable": true, 549 | "editable": true 550 | }, 551 | "outputs": [ 552 | { 553 | "data": { 554 | "text/html": [ 555 | "
\n", 556 | "\n", 569 | "\n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | "
sentencetag
0[Mary, moved, to, the, bathroom, .][(Mary, NNP), (moved, VBD), (to, TO), (the, DT...
1[John, went, to, the, hallway, .][(John, NNP), (went, VBD), (to, TO), (the, DT)...
2[Where, is, Mary, ?][(Where, WRB), (is, VBZ), (Mary, NNP), (?, .)]
3[Daniel, went, back, to, the, hallway, .][(Daniel, NNP), (went, VBD), (back, RB), (to, ...
4[Sandra, moved, to, the, garden, .][(Sandra, NNP), (moved, VBD), (to, TO), (the, ...
\n", 605 | "
" 606 | ], 607 | "text/plain": [ 608 | " sentence \\\n", 609 | "0 [Mary, moved, to, the, bathroom, .] \n", 610 | "1 [John, went, to, the, hallway, .] \n", 611 | "2 [Where, is, Mary, ?] \n", 612 | "3 [Daniel, went, back, to, the, hallway, .] \n", 613 | "4 [Sandra, moved, to, the, garden, .] \n", 614 | "\n", 615 | " tag \n", 616 | "0 [(Mary, NNP), (moved, VBD), (to, TO), (the, DT... \n", 617 | "1 [(John, NNP), (went, VBD), (to, TO), (the, DT)... \n", 618 | "2 [(Where, WRB), (is, VBZ), (Mary, NNP), (?, .)] \n", 619 | "3 [(Daniel, NNP), (went, VBD), (back, RB), (to, ... \n", 620 | "4 [(Sandra, NNP), (moved, VBD), (to, TO), (the, ... " 621 | ] 622 | }, 623 | "execution_count": 11, 624 | "metadata": {}, 625 | "output_type": "execute_result" 626 | } 627 | ], 628 | "source": [ 629 | "data_qa1[['sentence', 'tag']][:5]" 630 | ] 631 | }, 632 | { 633 | "cell_type": "markdown", 634 | "metadata": { 635 | "deletable": true, 636 | "editable": true 637 | }, 638 | "source": [ 639 | "#### Statements\n", 640 | "Due to the simplicity of the data, each statement can be thought of as a `(subject, relation, object)` triple. We would like to define a function called `extract_statement`, that when given a sequence of statement tokens, produces this triple. For instance,\n", 641 | "```\n", 642 | "extract_statement([Mary, moved, to, the, bathroom, .]) = (Mary, moved, bathroom).\n", 643 | "```\n", 644 | "This allows one to construct a graph of relationships between objects, as we will see in the next sections. \n", 645 | "\n", 646 | "We can use the POS tags in the sentence to achieve this. If there is a word tagged as a proper noun, it is the subject, if there's a verb, it is the relation, and if there's a simple noun, it is the object." 
647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": 12, 652 | "metadata": { 653 | "collapsed": true, 654 | "deletable": true, 655 | "editable": true 656 | }, 657 | "outputs": [], 658 | "source": [ 659 | "def extract_statement(tags):\n", 660 | " '''Extracts a (subject, relation, object) triple from each statement based on the POS tags'''\n", 661 | " subject, relation, obj = '', '', ''\n", 662 | " for word,tag in tags:\n", 663 | " if tag == 'NNP':\n", 664 | " subject = word\n", 665 | " elif tag == 'VBD' or word == 'journeyed': # TODO: 'journeyed' is tagged improperly\n", 666 | " relation = word\n", 667 | " if tag == 'NNP' or tag == 'NN':\n", 668 | " obj = word\n", 669 | " return (subject, relation, obj)" 670 | ] 671 | }, 672 | { 673 | "cell_type": "markdown", 674 | "metadata": { 675 | "deletable": true, 676 | "editable": true 677 | }, 678 | "source": [ 679 | "#### Questions\n", 680 | "To test the graph, we would like to define another function `extract_question`, that when given a sequence of question tokens, returns the entity that the question is asking for.\n", 681 | "```\n", 682 | "extract_question([Where, is, Mary, ?]) = Mary\n", 683 | "```\n", 684 | "\n", 685 | "The result is the subject we are querying for, whose query should return us a room to answer the question." 
686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": 13, 691 | "metadata": { 692 | "collapsed": true, 693 | "deletable": true, 694 | "editable": true 695 | }, 696 | "outputs": [], 697 | "source": [ 698 | "def extract_question(tags):\n", 699 | " '''Extracts the entity under discussion from each question based on the POS tags'''\n", 700 | " entityUnderDiscussion = ''\n", 701 | " # This will find the last noun in the sentence\n", 702 | " for word,tag in tags:\n", 703 | " if tag == 'NNP' or tag == 'NN':\n", 704 | " entityUnderDiscussion = word\n", 705 | " return entityUnderDiscussion" 706 | ] 707 | }, 708 | { 709 | "cell_type": "markdown", 710 | "metadata": { 711 | "deletable": true, 712 | "editable": true 713 | }, 714 | "source": [ 715 | "Then, call the appropriate function on each `DataFrame` row to extract the corresponding info." 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": 14, 721 | "metadata": { 722 | "collapsed": true, 723 | "deletable": true, 724 | "editable": true 725 | }, 726 | "outputs": [], 727 | "source": [ 728 | "def extract(row):\n", 729 | " '''Extracts the appropriate data given a processed DataFrame row'''\n", 730 | " if row.type == 'S':\n", 731 | " return extract_statement(row.tag)\n", 732 | " else:\n", 733 | " return extract_question(row.tag)" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": 15, 739 | "metadata": { 740 | "collapsed": false, 741 | "deletable": true, 742 | "editable": true 743 | }, 744 | "outputs": [], 745 | "source": [ 746 | "data_qa1['extracted'] = data_qa1.apply(extract, axis=1)" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": 16, 752 | "metadata": { 753 | "collapsed": false, 754 | "deletable": true, 755 | "editable": true 756 | }, 757 | "outputs": [ 758 | { 759 | "data": { 760 | "text/html": [ 761 | "
\n", 762 | "\n", 775 | "\n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | "
sentenceextracted
0[Mary, moved, to, the, bathroom, .](Mary, moved, bathroom)
1[John, went, to, the, hallway, .](John, went, hallway)
2[Where, is, Mary, ?]Mary
3[Daniel, went, back, to, the, hallway, .](Daniel, went, hallway)
4[Sandra, moved, to, the, garden, .](Sandra, moved, garden)
\n", 811 | "
" 812 | ], 813 | "text/plain": [ 814 | " sentence extracted\n", 815 | "0 [Mary, moved, to, the, bathroom, .] (Mary, moved, bathroom)\n", 816 | "1 [John, went, to, the, hallway, .] (John, went, hallway)\n", 817 | "2 [Where, is, Mary, ?] Mary\n", 818 | "3 [Daniel, went, back, to, the, hallway, .] (Daniel, went, hallway)\n", 819 | "4 [Sandra, moved, to, the, garden, .] (Sandra, moved, garden)" 820 | ] 821 | }, 822 | "execution_count": 16, 823 | "metadata": {}, 824 | "output_type": "execute_result" 825 | } 826 | ], 827 | "source": [ 828 | "data_qa1[['sentence', 'extracted']][:5]" 829 | ] 830 | }, 831 | { 832 | "cell_type": "markdown", 833 | "metadata": { 834 | "deletable": true, 835 | "editable": true 836 | }, 837 | "source": [ 838 | "Voila, extraction is complete." 839 | ] 840 | }, 841 | { 842 | "cell_type": "markdown", 843 | "metadata": { 844 | "deletable": true, 845 | "editable": true 846 | }, 847 | "source": [ 848 | "### Debug Functions\n", 849 | "\n", 850 | "These are handy debugging functions that we will use for evaluation." 851 | ] 852 | }, 853 | { 854 | "cell_type": "markdown", 855 | "metadata": { 856 | "deletable": true, 857 | "editable": true 858 | }, 859 | "source": [ 860 | "This function finds all statements that refer to a person." 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": 17, 866 | "metadata": { 867 | "collapsed": true, 868 | "deletable": true, 869 | "editable": true 870 | }, 871 | "outputs": [], 872 | "source": [ 873 | "def person_statements(person):\n", 874 | " '''Get all statements that refer to the specified person'''\n", 875 | " stat = statements(data_qa1)\n", 876 | " return stat[stat.extracted.map(lambda t: t[0] == person)]" 877 | ] 878 | }, 879 | { 880 | "cell_type": "markdown", 881 | "metadata": { 882 | "deletable": true, 883 | "editable": true 884 | }, 885 | "source": [ 886 | "For instance, we can find all statements that refer to Sandra." 
887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": 18, 892 | "metadata": { 893 | "collapsed": false, 894 | "deletable": true, 895 | "editable": true 896 | }, 897 | "outputs": [ 898 | { 899 | "data": { 900 | "text/html": [ 901 | "
\n", 902 | "\n", 915 | "\n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | "
sentencetagextracted
3[Sandra, moved, to, the, garden, .][(Sandra, NNP), (moved, VBD), (to, TO), (the, ...(Sandra, moved, garden)
5[Sandra, journeyed, to, the, bathroom, .][(Sandra, NNP), (journeyed, VBD), (to, TO), (t...(Sandra, journeyed, bathroom)
10[Sandra, travelled, to, the, office, .][(Sandra, NNP), (travelled, VBD), (to, TO), (t...(Sandra, travelled, office)
\n", 945 | "
" 946 | ], 947 | "text/plain": [ 948 | " sentence \\\n", 949 | "3 [Sandra, moved, to, the, garden, .] \n", 950 | "5 [Sandra, journeyed, to, the, bathroom, .] \n", 951 | "10 [Sandra, travelled, to, the, office, .] \n", 952 | "\n", 953 | " tag \\\n", 954 | "3 [(Sandra, NNP), (moved, VBD), (to, TO), (the, ... \n", 955 | "5 [(Sandra, NNP), (journeyed, VBD), (to, TO), (t... \n", 956 | "10 [(Sandra, NNP), (travelled, VBD), (to, TO), (t... \n", 957 | "\n", 958 | " extracted \n", 959 | "3 (Sandra, moved, garden) \n", 960 | "5 (Sandra, journeyed, bathroom) \n", 961 | "10 (Sandra, travelled, office) " 962 | ] 963 | }, 964 | "execution_count": 18, 965 | "metadata": {}, 966 | "output_type": "execute_result" 967 | } 968 | ], 969 | "source": [ 970 | "person_statements('Sandra')[:3]" 971 | ] 972 | }, 973 | { 974 | "cell_type": "markdown", 975 | "metadata": { 976 | "deletable": true, 977 | "editable": true 978 | }, 979 | "source": [ 980 | "This function finds the `n` most recent statements that refer to a person." 981 | ] 982 | }, 983 | { 984 | "cell_type": "code", 985 | "execution_count": 19, 986 | "metadata": { 987 | "collapsed": true, 988 | "deletable": true, 989 | "editable": true 990 | }, 991 | "outputs": [], 992 | "source": [ 993 | "def person_statements_recent(person, n=5):\n", 994 | " '''Get the n most recent statements that refer to the specified person in reverse chronological order'''\n", 995 | " return person_statements(person)[-n:].iloc[::-1]" 996 | ] 997 | }, 998 | { 999 | "cell_type": "markdown", 1000 | "metadata": { 1001 | "deletable": true, 1002 | "editable": true 1003 | }, 1004 | "source": [ 1005 | "For instance, we can find the 3 most recent statements Daniel has been referred in." 1006 | ] 1007 | }, 1008 | { 1009 | "cell_type": "code", 1010 | "execution_count": 20, 1011 | "metadata": { 1012 | "collapsed": false, 1013 | "deletable": true, 1014 | "editable": true 1015 | }, 1016 | "outputs": [ 1017 | { 1018 | "data": { 1019 | "text/html": [ 1020 | "
\n", 1021 | "\n", 1034 | "\n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | "
sentencetagextracted
1999[Daniel, went, to, the, garden, .][(Daniel, NNP), (went, VBD), (to, TO), (the, D...(Daniel, went, garden)
1996[Daniel, travelled, to, the, kitchen, .][(Daniel, NNP), (travelled, VBD), (to, TO), (t...(Daniel, travelled, kitchen)
1992[Daniel, moved, to, the, office, .][(Daniel, NNP), (moved, VBD), (to, TO), (the, ...(Daniel, moved, office)
\n", 1064 | "
" 1065 | ], 1066 | "text/plain": [ 1067 | " sentence \\\n", 1068 | "1999 [Daniel, went, to, the, garden, .] \n", 1069 | "1996 [Daniel, travelled, to, the, kitchen, .] \n", 1070 | "1992 [Daniel, moved, to, the, office, .] \n", 1071 | "\n", 1072 | " tag \\\n", 1073 | "1999 [(Daniel, NNP), (went, VBD), (to, TO), (the, D... \n", 1074 | "1996 [(Daniel, NNP), (travelled, VBD), (to, TO), (t... \n", 1075 | "1992 [(Daniel, NNP), (moved, VBD), (to, TO), (the, ... \n", 1076 | "\n", 1077 | " extracted \n", 1078 | "1999 (Daniel, went, garden) \n", 1079 | "1996 (Daniel, travelled, kitchen) \n", 1080 | "1992 (Daniel, moved, office) " 1081 | ] 1082 | }, 1083 | "execution_count": 20, 1084 | "metadata": {}, 1085 | "output_type": "execute_result" 1086 | } 1087 | ], 1088 | "source": [ 1089 | "person_statements_recent('Daniel', n=3)" 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "markdown", 1094 | "metadata": { 1095 | "deletable": true, 1096 | "editable": true 1097 | }, 1098 | "source": [ 1099 | "## Build the Graph\n", 1100 | "Once we have processed the data into triples, we can build graphs from them. Below we have defined a couple functions to reset the database and run queries. We will use Neo4j's Python driver to accomplish this. Note that if the URL or auth credentials of your Neo4j server are different, you will need to change them below." 
1101 | ] 1102 | }, 1103 | { 1104 | "cell_type": "code", 1105 | "execution_count": 21, 1106 | "metadata": { 1107 | "collapsed": false, 1108 | "deletable": true, 1109 | "editable": true 1110 | }, 1111 | "outputs": [], 1112 | "source": [ 1113 | "from neo4j.v1 import GraphDatabase, basic_auth" 1114 | ] 1115 | }, 1116 | { 1117 | "cell_type": "code", 1118 | "execution_count": 22, 1119 | "metadata": { 1120 | "collapsed": false, 1121 | "deletable": true, 1122 | "editable": true 1123 | }, 1124 | "outputs": [], 1125 | "source": [ 1126 | "# Create a neo4j session\n", 1127 | "# NOTE: Make sure that URL/credentials are correct and that Neo4j is running\n", 1128 | "driver = GraphDatabase.driver('bolt://localhost:7687', auth=basic_auth('neo4j', 'neo4j'))" 1129 | ] 1130 | }, 1131 | { 1132 | "cell_type": "code", 1133 | "execution_count": 23, 1134 | "metadata": { 1135 | "collapsed": true, 1136 | "deletable": true, 1137 | "editable": true 1138 | }, 1139 | "outputs": [], 1140 | "source": [ 1141 | "# WARNING: This function will clear the database when run!\n", 1142 | "# Make sure all important data is backed up before continuing\n", 1143 | "def reset_db():\n", 1144 | " '''Remove all nodes and relationships from the database'''\n", 1145 | " session = driver.session()\n", 1146 | " session.run('MATCH (n) DETACH DELETE n')" 1147 | ] 1148 | }, 1149 | { 1150 | "cell_type": "code", 1151 | "execution_count": 24, 1152 | "metadata": { 1153 | "collapsed": true, 1154 | "deletable": true, 1155 | "editable": true 1156 | }, 1157 | "outputs": [], 1158 | "source": [ 1159 | "def create(query, n=0):\n", 1160 | " '''Given a query, create a graph based on each triple in the extracted statements'''\n", 1161 | " session = driver.session()\n", 1162 | " stat = statements(data_qa1)\n", 1163 | " n = len(stat) if n <= 0 else n # Run the first n statements if specified\n", 1164 | " for subject,relation,obj in stat[:n].extracted:\n", 1165 | " session.run(query, subject=subject, relation=relation, obj=obj)" 1166 | ] 
1167 | }, 1168 | { 1169 | "cell_type": "markdown", 1170 | "metadata": { 1171 | "deletable": true, 1172 | "editable": true 1173 | }, 1174 | "source": [ 1175 | "### V1: Direct relationships\n", 1176 | "One of the first impulses when building the graph may be to represent the subject and object as nodes, and the relations as edges between them." 1177 | ] 1178 | }, 1179 | { 1180 | "cell_type": "code", 1181 | "execution_count": 25, 1182 | "metadata": { 1183 | "collapsed": false, 1184 | "deletable": true, 1185 | "editable": true 1186 | }, 1187 | "outputs": [], 1188 | "source": [ 1189 | "reset_db() # This will clear the database!" 1190 | ] 1191 | }, 1192 | { 1193 | "cell_type": "code", 1194 | "execution_count": 26, 1195 | "metadata": { 1196 | "collapsed": false, 1197 | "deletable": true, 1198 | "editable": true 1199 | }, 1200 | "outputs": [], 1201 | "source": [ 1202 | "# Create a direct relationship between subject and object\n", 1203 | "v1_query = '''\n", 1204 | " MERGE (s:SUBJECT {name: $subject}) \n", 1205 | " MERGE (o:OBJECT {name: $obj}) \n", 1206 | " MERGE (s)-[r:RELATION {name: $relation}]->(o)\n", 1207 | "'''\n", 1208 | "\n", 1209 | "create(v1_query)" 1210 | ] 1211 | }, 1212 | { 1213 | "cell_type": "markdown", 1214 | "metadata": { 1215 | "deletable": true, 1216 | "editable": true 1217 | }, 1218 | "source": [ 1219 | "Run the query below and see what the graph looks like. Pop open a new tab in the Neo4j browser (default http://localhost:7474/browser/) and run the query:\n", 1220 | "```\n", 1221 | "MATCH (n) RETURN n LIMIT 50\n", 1222 | "```\n", 1223 | "The graph is a reasonable first start, as the relations point each person to where they have been. But this poses a potential problem: how do we know where each person is right now, or where they have been previously? All we can know from the graph is which rooms a person has been in, because they may have visited them all multiple times." 
1224 | ] 1225 | }, 1226 | { 1227 | "cell_type": "markdown", 1228 | "metadata": { 1229 | "deletable": true, 1230 | "editable": true 1231 | }, 1232 | "source": [ 1233 | "" 1234 | ] 1235 | }, 1236 | { 1237 | "cell_type": "markdown", 1238 | "metadata": { 1239 | "deletable": true, 1240 | "editable": true 1241 | }, 1242 | "source": [ 1243 | "### V2: Nodes for relationships\n", 1244 | "One approach is to form a linked list of \"events\". Each event corresponds to a person updating the room that they are in. Since we chose edges to be our relations, we cannot form edges between relations. To alleviate this, we can transform the relation to a node, and draw two edges to form a 3-node triple." 1245 | ] 1246 | }, 1247 | { 1248 | "cell_type": "code", 1249 | "execution_count": 27, 1250 | "metadata": { 1251 | "collapsed": true, 1252 | "deletable": true, 1253 | "editable": true 1254 | }, 1255 | "outputs": [], 1256 | "source": [ 1257 | "reset_db() # This will clear the database!" 1258 | ] 1259 | }, 1260 | { 1261 | "cell_type": "code", 1262 | "execution_count": 28, 1263 | "metadata": { 1264 | "collapsed": true, 1265 | "deletable": true, 1266 | "editable": true 1267 | }, 1268 | "outputs": [], 1269 | "source": [ 1270 | "# Represent each relation as a node\n", 1271 | "v2_query = '''\n", 1272 | " MERGE (s:SUBJECT {name: $subject})\n", 1273 | " MERGE (o:OBJECT {name: $obj})\n", 1274 | " CREATE (s)-[:R0]->(r:RELATION {name: $relation})-[:R1]->(o)\n", 1275 | "'''\n", 1276 | "\n", 1277 | "create(v2_query)" 1278 | ] 1279 | }, 1280 | { 1281 | "cell_type": "markdown", 1282 | "metadata": { 1283 | "deletable": true, 1284 | "editable": true 1285 | }, 1286 | "source": [ 1287 | "Run the query again and see what changed. This is better, since we can see how often a room has been visited, but still doesn't solve the question as to which room a person is in at any given time." 
1288 | ] 1289 | }, 1290 | { 1291 | "cell_type": "markdown", 1292 | "metadata": { 1293 | "deletable": true, 1294 | "editable": true 1295 | }, 1296 | "source": [ 1297 | "### V3: Linked list of relationships\n", 1298 | "The final step is to build the linked list based on the order in which the relations were created. This will allow us to not only find the room a person is in right now, but produce a list of rooms that they were in, in the order that they were visited." 1299 | ] 1300 | }, 1301 | { 1302 | "cell_type": "code", 1303 | "execution_count": 29, 1304 | "metadata": { 1305 | "collapsed": false, 1306 | "deletable": true, 1307 | "editable": true 1308 | }, 1309 | "outputs": [], 1310 | "source": [ 1311 | "reset_db()" 1312 | ] 1313 | }, 1314 | { 1315 | "cell_type": "code", 1316 | "execution_count": 30, 1317 | "metadata": { 1318 | "collapsed": false, 1319 | "deletable": true, 1320 | "editable": true 1321 | }, 1322 | "outputs": [], 1323 | "source": [ 1324 | "# Represent each relation as a node, ordered by a linked list (per subject)\n", 1325 | "v3_query = '''\n", 1326 | " MERGE (s:SUBJECT {name: $subject})\n", 1327 | " MERGE (o:OBJECT {name: $obj})\n", 1328 | " \n", 1329 | " WITH s,o\n", 1330 | " \n", 1331 | " // Create an new relation between the subject and object\n", 1332 | " CREATE (s)-[:R0]->(r:RELATION {name: $relation})-[:R1]->(o)\n", 1333 | " CREATE (s)-[h:HEAD]->(r) // Make the newly created relation the head of the list\n", 1334 | " \n", 1335 | " WITH s,r,o,h\n", 1336 | " \n", 1337 | " // Find the previous head of the list (if none exist, this query will terminate here)\n", 1338 | " MATCH (s)-[h_prev:HEAD]->(r_prev:RELATION)\n", 1339 | " WHERE h_prev <> h\n", 1340 | " \n", 1341 | " // Complete the link, remove the previous head pointer\n", 1342 | " CREATE (r_prev)-[:NEXT]->(r)\n", 1343 | " DELETE h_prev\n", 1344 | "'''" 1345 | ] 1346 | }, 1347 | { 1348 | "cell_type": "code", 1349 | "execution_count": 31, 1350 | "metadata": { 1351 | "collapsed": true, 1352 | 
"deletable": true, 1353 | "editable": true 1354 | }, 1355 | "outputs": [], 1356 | "source": [ 1357 | "session = driver.session()\n", 1358 | "# Create an index for faster access\n", 1359 | "session.run('CREATE INDEX ON :SUBJECT(name)')\n", 1360 | "session.run('CREATE INDEX ON :RELATION(name)')\n", 1361 | "session.run('CREATE INDEX ON :OBJECT(name)')\n", 1362 | "create(v3_query)" 1363 | ] 1364 | }, 1365 | { 1366 | "cell_type": "markdown", 1367 | "metadata": { 1368 | "deletable": true, 1369 | "editable": true 1370 | }, 1371 | "source": [ 1372 | "Check the new graph out and see what changed. It's helpful to change the colors of the nodes and edges to visualize this better." 1373 | ] 1374 | }, 1375 | { 1376 | "cell_type": "markdown", 1377 | "metadata": { 1378 | "deletable": true, 1379 | "editable": true 1380 | }, 1381 | "source": [ 1382 | "" 1383 | ] 1384 | }, 1385 | { 1386 | "cell_type": "markdown", 1387 | "metadata": { 1388 | "deletable": true, 1389 | "editable": true 1390 | }, 1391 | "source": [ 1392 | "## Query the Graph\n", 1393 | "Now we can ask the graph useful questions.\n", 1394 | "#### Find the room a person is in" 1395 | ] 1396 | }, 1397 | { 1398 | "cell_type": "code", 1399 | "execution_count": 32, 1400 | "metadata": { 1401 | "collapsed": true, 1402 | "deletable": true, 1403 | "editable": true 1404 | }, 1405 | "outputs": [], 1406 | "source": [ 1407 | "def find_person(person):\n", 1408 | " '''Find the room a person is currently in'''\n", 1409 | " query = '''\n", 1410 | " MATCH (s:SUBJECT {name:$name})-[:HEAD]->(r:RELATION)-->(o:OBJECT)\n", 1411 | " RETURN s AS subject, r AS relation, o AS obj\n", 1412 | " '''\n", 1413 | " return session.run(query, name=person)" 1414 | ] 1415 | }, 1416 | { 1417 | "cell_type": "markdown", 1418 | "metadata": { 1419 | "deletable": true, 1420 | "editable": true 1421 | }, 1422 | "source": [ 1423 | "Using the graph-querying function above we can ask, \"Where is Mary?\"" 1424 | ] 1425 | }, 1426 | { 1427 | "cell_type": "code", 1428 | 
"execution_count": 33, 1429 | "metadata": { 1430 | "collapsed": false, 1431 | "deletable": true, 1432 | "editable": true 1433 | }, 1434 | "outputs": [ 1435 | { 1436 | "name": "stdout", 1437 | "output_type": "stream", 1438 | "text": [ 1439 | "kitchen\n" 1440 | ] 1441 | } 1442 | ], 1443 | "source": [ 1444 | "# Note: If this is run less than a second after creating the knowledge graph, \n", 1445 | "# the Python driver may cause a race condition where the graph \n", 1446 | "# isn't finished updating, which could give you the wrong answer.\n", 1447 | "session = driver.session()\n", 1448 | "record = find_person('Mary').single()\n", 1449 | "print(record['obj'].get('name'))" 1450 | ] 1451 | }, 1452 | { 1453 | "cell_type": "markdown", 1454 | "metadata": { 1455 | "deletable": true, 1456 | "editable": true 1457 | }, 1458 | "source": [ 1459 | "According to the graph, Mary is in the kitchen. We can verify that this is true with the debug function below, and we can see the corresponding sentence that generated the relationship as well." 1460 | ] 1461 | }, 1462 | { 1463 | "cell_type": "code", 1464 | "execution_count": 34, 1465 | "metadata": { 1466 | "collapsed": false, 1467 | "deletable": true, 1468 | "editable": true 1469 | }, 1470 | "outputs": [ 1471 | { 1472 | "data": { 1473 | "text/html": [ 1474 | "
\n", 1475 | "\n", 1488 | "\n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | "
sentencetagextracted
1994[Mary, journeyed, to, the, kitchen, .][(Mary, NNP), (journeyed, VBD), (to, TO), (the...(Mary, journeyed, kitchen)
\n", 1506 | "
" 1507 | ], 1508 | "text/plain": [ 1509 | " sentence \\\n", 1510 | "1994 [Mary, journeyed, to, the, kitchen, .] \n", 1511 | "\n", 1512 | " tag \\\n", 1513 | "1994 [(Mary, NNP), (journeyed, VBD), (to, TO), (the... \n", 1514 | "\n", 1515 | " extracted \n", 1516 | "1994 (Mary, journeyed, kitchen) " 1517 | ] 1518 | }, 1519 | "execution_count": 34, 1520 | "metadata": {}, 1521 | "output_type": "execute_result" 1522 | } 1523 | ], 1524 | "source": [ 1525 | "person_statements_recent('Mary', n=1)" 1526 | ] 1527 | }, 1528 | { 1529 | "cell_type": "markdown", 1530 | "metadata": { 1531 | "deletable": true, 1532 | "editable": true 1533 | }, 1534 | "source": [ 1535 | "#### Find the rooms a person has been in (reverse chronological order)" 1536 | ] 1537 | }, 1538 | { 1539 | "cell_type": "code", 1540 | "execution_count": 35, 1541 | "metadata": { 1542 | "collapsed": false, 1543 | "deletable": true, 1544 | "editable": true 1545 | }, 1546 | "outputs": [], 1547 | "source": [ 1548 | "def find_person_history(person, n=100):\n", 1549 | " '''Find the list of rooms a person was in, ordered by recency'''\n", 1550 | " length = str(n) if n >= 1 else ''\n", 1551 | " \n", 1552 | " query = '''\n", 1553 | " MATCH (s:SUBJECT {name:$name})-[:HEAD]->(r:RELATION)-->(o:OBJECT)\n", 1554 | " MATCH (s)-->(r_prev:RELATION)-[k*1..%s]->(r), (r_prev)-->(o_prev:OBJECT)\n", 1555 | " \n", 1556 | " WITH size(k) AS dist, r, o, r_prev, o_prev\n", 1557 | " ORDER BY size(k)\n", 1558 | " \n", 1559 | " WITH r, o, r_prev, o_prev\n", 1560 | " RETURN [r.name] + collect(r_prev.name) AS relation, [o.name] + collect(o_prev.name) AS obj\n", 1561 | " '''\n", 1562 | " query = query % length\n", 1563 | " \n", 1564 | " session = driver.session()\n", 1565 | " record = session.run(query, name=person).single()\n", 1566 | " history = list(zip(record['relation'], record['obj']))[:-1]\n", 1567 | " \n", 1568 | " return history" 1569 | ] 1570 | }, 1571 | { 1572 | "cell_type": "markdown", 1573 | "metadata": { 1574 | "deletable": true, 1575 
| "editable": true 1576 | }, 1577 | "source": [ 1578 | "A more advanced question that we get for free based on the graph structure is, \"Where has John been recently?\"" 1579 | ] 1580 | }, 1581 | { 1582 | "cell_type": "code", 1583 | "execution_count": 36, 1584 | "metadata": { 1585 | "collapsed": false, 1586 | "deletable": true, 1587 | "editable": true, 1588 | "scrolled": true 1589 | }, 1590 | "outputs": [ 1591 | { 1592 | "data": { 1593 | "text/plain": [ 1594 | "[('went', 'bedroom'),\n", 1595 | " ('went', 'garden'),\n", 1596 | " ('went', 'office'),\n", 1597 | " ('journeyed', 'bedroom'),\n", 1598 | " ('travelled', 'hallway')]" 1599 | ] 1600 | }, 1601 | "execution_count": 36, 1602 | "metadata": {}, 1603 | "output_type": "execute_result" 1604 | } 1605 | ], 1606 | "source": [ 1607 | "find_person_history('John', n=5)" 1608 | ] 1609 | }, 1610 | { 1611 | "cell_type": "markdown", 1612 | "metadata": { 1613 | "deletable": true, 1614 | "editable": true 1615 | }, 1616 | "source": [ 1617 | "Verify that John has been to to those places, in that order." 1618 | ] 1619 | }, 1620 | { 1621 | "cell_type": "code", 1622 | "execution_count": 37, 1623 | "metadata": { 1624 | "collapsed": false, 1625 | "deletable": true, 1626 | "editable": true 1627 | }, 1628 | "outputs": [ 1629 | { 1630 | "data": { 1631 | "text/html": [ 1632 | "
\n", 1633 | "\n", 1646 | "\n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | "
sentencetagextracted
1995[John, went, back, to, the, bedroom, .][(John, NNP), (went, VBD), (back, RB), (to, TO...(John, went, bedroom)
1989[John, went, back, to, the, garden, .][(John, NNP), (went, VBD), (back, RB), (to, TO...(John, went, garden)
1986[John, went, back, to, the, office, .][(John, NNP), (went, VBD), (back, RB), (to, TO...(John, went, office)
1982[John, journeyed, to, the, bedroom, .][(John, NNP), (journeyed, NN), (to, TO), (the,...(John, journeyed, bedroom)
1979[John, travelled, to, the, hallway, .][(John, NNP), (travelled, VBD), (to, TO), (the...(John, travelled, hallway)
\n", 1688 | "
" 1689 | ], 1690 | "text/plain": [ 1691 | " sentence \\\n", 1692 | "1995 [John, went, back, to, the, bedroom, .] \n", 1693 | "1989 [John, went, back, to, the, garden, .] \n", 1694 | "1986 [John, went, back, to, the, office, .] \n", 1695 | "1982 [John, journeyed, to, the, bedroom, .] \n", 1696 | "1979 [John, travelled, to, the, hallway, .] \n", 1697 | "\n", 1698 | " tag \\\n", 1699 | "1995 [(John, NNP), (went, VBD), (back, RB), (to, TO... \n", 1700 | "1989 [(John, NNP), (went, VBD), (back, RB), (to, TO... \n", 1701 | "1986 [(John, NNP), (went, VBD), (back, RB), (to, TO... \n", 1702 | "1982 [(John, NNP), (journeyed, NN), (to, TO), (the,... \n", 1703 | "1979 [(John, NNP), (travelled, VBD), (to, TO), (the... \n", 1704 | "\n", 1705 | " extracted \n", 1706 | "1995 (John, went, bedroom) \n", 1707 | "1989 (John, went, garden) \n", 1708 | "1986 (John, went, office) \n", 1709 | "1982 (John, journeyed, bedroom) \n", 1710 | "1979 (John, travelled, hallway) " 1711 | ] 1712 | }, 1713 | "execution_count": 37, 1714 | "metadata": {}, 1715 | "output_type": "execute_result" 1716 | } 1717 | ], 1718 | "source": [ 1719 | "person_statements_recent('John', n=5)" 1720 | ] 1721 | }, 1722 | { 1723 | "cell_type": "markdown", 1724 | "metadata": { 1725 | "deletable": true, 1726 | "editable": true 1727 | }, 1728 | "source": [ 1729 | "#### Find the history of visitors for a room" 1730 | ] 1731 | }, 1732 | { 1733 | "cell_type": "code", 1734 | "execution_count": 38, 1735 | "metadata": { 1736 | "collapsed": true, 1737 | "deletable": true, 1738 | "editable": true 1739 | }, 1740 | "outputs": [], 1741 | "source": [ 1742 | "def find_room_visitors(room):\n", 1743 | " '''Find the list of visitors a room has, ordered by recency'''\n", 1744 | " \n", 1745 | " query = '''\n", 1746 | " MATCH (r:RELATION)-->(o:OBJECT {name:$name})\n", 1747 | " RETURN count(r) AS count\n", 1748 | " '''\n", 1749 | " \n", 1750 | " session = driver.session()\n", 1751 | " record = session.run(query, name=room).single()\n", 1752 | " 
\n", 1753 | " return record['count']" 1754 | ] 1755 | }, 1756 | { 1757 | "cell_type": "markdown", 1758 | "metadata": { 1759 | "deletable": true, 1760 | "editable": true 1761 | }, 1762 | "source": [ 1763 | "Just for fun, we can find out how many times a room has been visited. \"How many times has the office been visited?\"" 1764 | ] 1765 | }, 1766 | { 1767 | "cell_type": "code", 1768 | "execution_count": 39, 1769 | "metadata": { 1770 | "collapsed": false, 1771 | "deletable": true, 1772 | "editable": true 1773 | }, 1774 | "outputs": [ 1775 | { 1776 | "data": { 1777 | "text/plain": [ 1778 | "334" 1779 | ] 1780 | }, 1781 | "execution_count": 39, 1782 | "metadata": {}, 1783 | "output_type": "execute_result" 1784 | } 1785 | ], 1786 | "source": [ 1787 | "find_room_visitors('office')" 1788 | ] 1789 | }, 1790 | { 1791 | "cell_type": "markdown", 1792 | "metadata": { 1793 | "collapsed": true, 1794 | "deletable": true, 1795 | "editable": true 1796 | }, 1797 | "source": [ 1798 | "## Calculate an Accuracy Score" 1799 | ] 1800 | }, 1801 | { 1802 | "cell_type": "code", 1803 | "execution_count": 41, 1804 | "metadata": { 1805 | "collapsed": false, 1806 | "deletable": true, 1807 | "editable": true 1808 | }, 1809 | "outputs": [], 1810 | "source": [ 1811 | "def get_answers(row):\n", 1812 | " '''Given an input row merge the statement in the graph, \n", 1813 | " or query the graph if it is a question'''\n", 1814 | " if row.type == 'S':\n", 1815 | " subject,relation,obj = row.extracted\n", 1816 | " session.run(v3_query, subject=subject, relation=relation, obj=obj)\n", 1817 | " return ''\n", 1818 | " elif row.type == 'Q':\n", 1819 | " person = row.extracted\n", 1820 | " # WARNING: do not consume the result (e.g., call .consume() or .single()) \n", 1821 | " # until the entire iteration is done.\n", 1822 | " # Failure to do so may cause the queries to be VERY slow!\n", 1823 | " return find_person(person)" 1824 | ] 1825 | }, 1826 | { 1827 | "cell_type": "markdown", 1828 | "metadata": { 1829 | 
"deletable": true, 1830 | "editable": true 1831 | }, 1832 | "source": [ 1833 | "Start all over, and run through the entire dataset." 1834 | ] 1835 | }, 1836 | { 1837 | "cell_type": "code", 1838 | "execution_count": 42, 1839 | "metadata": { 1840 | "collapsed": false, 1841 | "deletable": true, 1842 | "editable": true 1843 | }, 1844 | "outputs": [], 1845 | "source": [ 1846 | "reset_db()" 1847 | ] 1848 | }, 1849 | { 1850 | "cell_type": "code", 1851 | "execution_count": 43, 1852 | "metadata": { 1853 | "collapsed": false, 1854 | "deletable": true, 1855 | "editable": true 1856 | }, 1857 | "outputs": [], 1858 | "source": [ 1859 | "session = driver.session()\n", 1860 | "results = data_qa1.apply(get_answers, axis=1)\n", 1861 | "results = [x for x in results if x != '']\n", 1862 | "predicted = [result.single()['obj'].get('name') for result in results]" 1863 | ] 1864 | }, 1865 | { 1866 | "cell_type": "markdown", 1867 | "metadata": { 1868 | "deletable": true, 1869 | "editable": true 1870 | }, 1871 | "source": [ 1872 | "The `predicted` array contains the predicted answer to each question.`" 1873 | ] 1874 | }, 1875 | { 1876 | "cell_type": "code", 1877 | "execution_count": 44, 1878 | "metadata": { 1879 | "collapsed": false, 1880 | "deletable": true, 1881 | "editable": true 1882 | }, 1883 | "outputs": [ 1884 | { 1885 | "data": { 1886 | "text/plain": [ 1887 | "['bathroom', 'hallway', 'hallway', 'office', 'bathroom']" 1888 | ] 1889 | }, 1890 | "execution_count": 44, 1891 | "metadata": {}, 1892 | "output_type": "execute_result" 1893 | } 1894 | ], 1895 | "source": [ 1896 | "predicted[:5]" 1897 | ] 1898 | }, 1899 | { 1900 | "cell_type": "markdown", 1901 | "metadata": { 1902 | "deletable": true, 1903 | "editable": true 1904 | }, 1905 | "source": [ 1906 | "The `actual` array contains the actual answers to all questions." 
1907 | ] 1908 | }, 1909 | { 1910 | "cell_type": "code", 1911 | "execution_count": 45, 1912 | "metadata": { 1913 | "collapsed": false, 1914 | "deletable": true, 1915 | "editable": true 1916 | }, 1917 | "outputs": [], 1918 | "source": [ 1919 | "actual = list(data_qa1[data_qa1.type == 'Q'].answer)" 1920 | ] 1921 | }, 1922 | { 1923 | "cell_type": "code", 1924 | "execution_count": 46, 1925 | "metadata": { 1926 | "collapsed": false, 1927 | "deletable": true, 1928 | "editable": true 1929 | }, 1930 | "outputs": [ 1931 | { 1932 | "data": { 1933 | "text/plain": [ 1934 | "['bathroom', 'hallway', 'hallway', 'office', 'bathroom']" 1935 | ] 1936 | }, 1937 | "execution_count": 46, 1938 | "metadata": {}, 1939 | "output_type": "execute_result" 1940 | } 1941 | ], 1942 | "source": [ 1943 | "actual[:5]" 1944 | ] 1945 | }, 1946 | { 1947 | "cell_type": "code", 1948 | "execution_count": 47, 1949 | "metadata": { 1950 | "collapsed": false, 1951 | "deletable": true, 1952 | "editable": true 1953 | }, 1954 | "outputs": [ 1955 | { 1956 | "data": { 1957 | "text/plain": [ 1958 | "1.0" 1959 | ] 1960 | }, 1961 | "execution_count": 47, 1962 | "metadata": {}, 1963 | "output_type": "execute_result" 1964 | } 1965 | ], 1966 | "source": [ 1967 | "accuracy_score(actual, predicted)" 1968 | ] 1969 | }, 1970 | { 1971 | "cell_type": "markdown", 1972 | "metadata": { 1973 | "collapsed": true, 1974 | "deletable": true, 1975 | "editable": true 1976 | }, 1977 | "source": [ 1978 | "And just like that, we get an accuracy of 100%. Of course, this dataset is very simple (and machine generated), so it should be of no surprise. But one notable achievement is that the graph we created can generalize to any statements of the form, `(subject, relation, object)`." 
1979 | ] 1980 | } 1981 | ], 1982 | "metadata": { 1983 | "kernelspec": { 1984 | "display_name": "Python 3", 1985 | "language": "python", 1986 | "name": "python3" 1987 | }, 1988 | "language_info": { 1989 | "codemirror_mode": { 1990 | "name": "ipython", 1991 | "version": 3 1992 | }, 1993 | "file_extension": ".py", 1994 | "mimetype": "text/x-python", 1995 | "name": "python", 1996 | "nbconvert_exporter": "python", 1997 | "pygments_lexer": "ipython3", 1998 | "version": "3.6.1" 1999 | } 2000 | }, 2001 | "nbformat": 4, 2002 | "nbformat_minor": 2 2003 | } 2004 | -------------------------------------------------------------------------------- /notebooks/dynamic_memory_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Memory Representation in Dialogue Systems (Part 2)" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "deletable": true, 17 | "editable": true 18 | }, 19 | "source": [ 20 | "This notebook is part 2 of the dynamic memory representation series. See part 1 to get started." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": { 26 | "deletable": true, 27 | "editable": true 28 | }, 29 | "source": [ 30 | "## Process the Text\n", 31 | "As with part 1, part 2 will perform the same evaluation as part 1, except with bAbI tasks QA2, Two Supporting Facts. In QA1, there were two types of entities: `person`s and `room`s. In QA2, there is one additional entity type: `item`s. Each dialogue provides a sequence of statements that indicate persons going to different rooms as before, and also items that persons may have acquired or released. The key insight is that objects move into rooms with the person that last acquired them, and stay in rooms once released. 
This requires the system to make the distinction between rooms and items, and also between acquiring and releasing actions.\n", 32 | "\n", 33 | "The first step is to import `resources/qa2_two-supporting-facts_train.txt` into `data`. Text processing is exactly the same as before: tokenize and POS tag the sentences." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 1, 39 | "metadata": { 40 | "collapsed": true, 41 | "deletable": true, 42 | "editable": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "import pandas as pd\n", 47 | "import numpy as np\n", 48 | "import nltk\n", 49 | "from sklearn.metrics import accuracy_score" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "metadata": { 56 | "collapsed": false, 57 | "deletable": true, 58 | "editable": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "# Read the bAbI data as CSV\n", 63 | "filename = 'resources/qa2_two-supporting-facts_train.txt'\n", 64 | "data = pd.read_csv(filename, delimiter='\\t', names=['sentence', 'answer', 'factid'])\n", 65 | "data = data.fillna('')\n", 66 | "\n", 67 | "# Tag each sentence as a statement or question\n", 68 | "tag_sentence = lambda row: 'S' if row.answer == '' else 'Q'\n", 69 | "data['type'] = data.apply(tag_sentence, axis=1)\n", 70 | "\n", 71 | "# Use NLTK to tokenize the sentences into arrays of words\n", 72 | "tokenize = lambda row: nltk.word_tokenize(row.sentence)[1:]\n", 73 | "data.sentence = data.apply(tokenize, axis=1)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 3, 79 | "metadata": { 80 | "collapsed": true, 81 | "deletable": true, 82 | "editable": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "# Create a DataFrame with just the statements\n", 87 | "def statements():\n", 88 | " return data[data.type == 'S'] \\\n", 89 | " .reset_index(drop=True) \\\n", 90 | " .drop('answer', axis=1) \\\n", 91 | " .drop('factid', axis=1) \\\n", 92 | " .drop('type', axis=1)\n", 93 | "\n", 94 | 
"# Create a DataFrame with just the questions\n", 95 | "def questions():\n", 96 | " return data[data.type == 'Q'] \\\n", 97 | " .reset_index(drop=True) \\\n", 98 | " .drop('type', axis=1)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 4, 104 | "metadata": { 105 | "collapsed": true, 106 | "deletable": true, 107 | "editable": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "# Tag each token as a part of speech\n", 112 | "pos_tag = lambda row: nltk.pos_tag(row.sentence)\n", 113 | "data['tag'] = data.apply(pos_tag, axis=1)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 5, 119 | "metadata": { 120 | "collapsed": true, 121 | "deletable": true, 122 | "editable": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "def extract_statement(tags):\n", 127 | " '''Extracts a (subject, relation, object) triple from each statement based on the POS tags'''\n", 128 | " subject, relation, obj = '', '', ''\n", 129 | " for word,tag in tags:\n", 130 | " if tag == 'NNP':\n", 131 | " subject = word\n", 132 | " elif tag == 'VBD' or word == 'journeyed': # TODO: 'journeyed' is tagged improperly\n", 133 | " relation = word\n", 134 | " elif tag == 'NN':\n", 135 | " obj = word\n", 136 | " return (subject, relation, obj)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 6, 142 | "metadata": { 143 | "collapsed": true, 144 | "deletable": true, 145 | "editable": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "def extract_question(tags):\n", 150 | " '''Extracts the entity under discussion from each question based on the POS tags'''\n", 151 | " eud = ''\n", 152 | " for word,tag in tags:\n", 153 | " if tag == 'NNP' or tag == 'NN':\n", 154 | " eud = word\n", 155 | " return eud" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 7, 161 | "metadata": { 162 | "collapsed": true, 163 | "deletable": true, 164 | "editable": true 165 | }, 166 | "outputs": [], 167 | 
"source": [ 168 | "def extract(row):\n", 169 | " '''Extracts the appropriate data given a processed DataFrame row'''\n", 170 | " if row.type == 'S':\n", 171 | " return extract_statement(row.tag)\n", 172 | " else: \n", 173 | " return extract_question(row.tag)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 8, 179 | "metadata": { 180 | "collapsed": true, 181 | "deletable": true, 182 | "editable": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "data['extracted'] = data.apply(extract, axis=1)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": { 192 | "deletable": true, 193 | "editable": true 194 | }, 195 | "source": [ 196 | "## Define the Graph" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 11, 202 | "metadata": { 203 | "collapsed": true, 204 | "deletable": true, 205 | "editable": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "from neo4j.v1 import GraphDatabase, basic_auth" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 12, 215 | "metadata": { 216 | "collapsed": true, 217 | "deletable": true, 218 | "editable": true 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "# Create a neo4j session\n", 223 | "# NOTE: Make sure that URL/credentials are correct and that Neo4j is running\n", 224 | "driver = GraphDatabase.driver('bolt://localhost:7687', auth=basic_auth('neo4j', 'neo4j'))" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 13, 230 | "metadata": { 231 | "collapsed": true, 232 | "deletable": true, 233 | "editable": true 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "# WARNING: This will clear the database when run!\n", 238 | "def reset_db():\n", 239 | " '''Remove all nodes and relationships from the database'''\n", 240 | " session = driver.session()\n", 241 | " session.run('MATCH (n) DETACH DELETE n')" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 14, 247 | 
"metadata": { 248 | "collapsed": true, 249 | "deletable": true, 250 | "editable": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "def create(query, start=0, end=0):\n", 255 | " '''Create a graph based on each triple in the extracted statements'''\n", 256 | " session = driver.session()\n", 257 | " stat = statements()\n", 258 | " end = len(stat) if end <= start else end\n", 259 | " for subject,relation,obj in stat[start:end].extracted:\n", 260 | " session.run(query, subject=subject, relation=relation, obj=obj)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": { 266 | "deletable": true, 267 | "editable": true 268 | }, 269 | "source": [ 270 | "This is the point where QA2 starts to be different from QA1. The query generating the knowledge graph needs to be altered slightly to encode information about the ordering of events relative to objects as well as subjects.\n", 271 | "\n", 272 | "In QA1, a linked list was constructed to keep track of events relative to a character; the `NEXT` edge type indicated the next event that the _person_ acted upon. This was all that was necessary, since the questions asked directly about the most recent event that corresponded to a particular person.\n", 273 | "\n", 274 | "In QA2, questions ask about the item a room is in, which requires a way to keep track of the last person who interacted with it. As such, it is not enough to know the order in which a person performed actions, but it is also necessary to know the order in which an item was handled. The most recent interaction indicates the person who interacted with that object last, and that can be used to find the room based on their visit history.\n", 275 | "\n", 276 | "Thus, the v4 graph query will create three types of lists.\n", 277 | "1. The first list is the global list of events indicated by the `NEXT` edge type.\n", 278 | "2. The second list is a `person`'s list of events indicated by the `S_NEXT` (next subject) edge type.\n", 279 | "3. 
The third list is an `item`'s list of events indicated by the `O_NEXT` (next object) edge type.\n", 280 | "Each list has a `HEAD` edge that points to the most recent event relative to their respective lists." 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": { 287 | "collapsed": true, 288 | "deletable": true, 289 | "editable": true 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "v4_query = '''\n", 294 | " /// 1. Create Nodes\n", 295 | " MERGE (global:GLOBAL {name:'global'}) // Find/create the global entity\n", 296 | " MERGE (subject:SUBJECT {name:$subject}) // Find/create the subject and object\n", 297 | " MERGE (object:OBJECT {name:$obj})\n", 298 | "\n", 299 | " /// 2. Create a new relation between the subject and object\n", 300 | " CREATE (subject)-[:R_BEGIN]->(relation:RELATION {name:$relation})-[:R_END]->(object)\n", 301 | "\n", 302 | " /// 3. Create head pointers to the newly created relation\n", 303 | " CREATE (global)-[globalHead:HEAD]->(relation)\n", 304 | " CREATE (subject)-[subjectHead:HEAD]->(relation)\n", 305 | " CREATE (object)-[objectHead:HEAD]->(relation)\n", 306 | "\n", 307 | " WITH global,subject,relation,object,subjectHead,objectHead,globalHead\n", 308 | "\n", 309 | " /// 4. Link the existing global list with the new head node\n", 310 | " // Find the previous global head of the list (if none exist, this query will terminate here)\n", 311 | " MATCH (global)-[prevGlobalHead:HEAD]->(prevGlobalRelation:RELATION) WHERE prevGlobalRelation <> relation\n", 312 | " CREATE (prevGlobalRelation)-[:NEXT]->(relation) // Complete the link\n", 313 | " DELETE prevGlobalHead // Remove the previous head pointer\n", 314 | "\n", 315 | " WITH subject,relation,object,subjectHead,objectHead\n", 316 | "\n", 317 | " /// 5. 
Link the existing subject list with the new head node\n", 318 | " // Find the previous subject head of the list (if none exist, this query will terminate here)\n", 319 | " MATCH (subject)-[prevSubjectHead:HEAD]->(prevSubjectRelation:RELATION) WHERE prevSubjectRelation <> relation\n", 320 | " CREATE (prevSubjectRelation)-[:S_NEXT]->(relation) // Complete the link\n", 321 | " DELETE prevSubjectHead // Remove the previous head pointer\n", 322 | "\n", 323 | " WITH subject,relation,object,objectHead\n", 324 | "\n", 325 | " /// 6. Link the existing object list with the new head node\n", 326 | " // Find the previous subject head of the list (if none exist, this query will terminate here)\n", 327 | " MATCH (object)-[prevObjectHead:HEAD]->(prevObjectRelation:RELATION) WHERE prevObjectRelation <> relation\n", 328 | " CREATE (prevObjectRelation)-[:O_NEXT]->(relation) // Complete the link\n", 329 | " DELETE prevObjectHead // Remove the previous head pointer\n", 330 | "'''" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 15, 336 | "metadata": { 337 | "collapsed": false, 338 | "deletable": true, 339 | "editable": true 340 | }, 341 | "outputs": [], 342 | "source": [ 343 | "# Represent each relation as a node, ordered by multiple linked lists\n", 344 | "def build_v4_graph(start=0, end=0):\n", 345 | " reset_db()\n", 346 | " \n", 347 | " session = driver.session()\n", 348 | " \n", 349 | " # Create an index for faster access\n", 350 | " session.run('CREATE INDEX ON :SUBJECT(name)')\n", 351 | " session.run('CREATE INDEX ON :RELATION(name)')\n", 352 | " session.run('CREATE INDEX ON :OBJECT(name)')\n", 353 | " \n", 354 | " create(v4_query, start=start, end=end)" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 16, 360 | "metadata": { 361 | "collapsed": false, 362 | "deletable": true, 363 | "editable": true 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "all_actions = sorted(list(set(x[1] for x in data.extracted if x != '' 
and x[1] != '')))" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 17, 373 | "metadata": { 374 | "collapsed": true, 375 | "deletable": true, 376 | "editable": true 377 | }, 378 | "outputs": [], 379 | "source": [ 380 | "movement_actions = ['journeyed', 'moved', 'travelled', 'went']\n", 381 | "acquire_actions = ['got', 'grabbed', 'picked', 'took']\n", 382 | "release_actions = ['discarded', 'dropped', 'left', 'put']" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 18, 388 | "metadata": { 389 | "collapsed": true, 390 | "deletable": true, 391 | "editable": true 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "def find_last_person(obj):\n", 396 | " '''Finds the last person in contact with the object'''\n", 397 | " query = '''\n", 398 | " MATCH (:OBJECT {name:$name})-[:HEAD]->(relation:RELATION)<-[:R_BEGIN]-(subject:SUBJECT)\n", 399 | " RETURN relation.name AS relation, subject.name AS subject\n", 400 | " '''\n", 401 | " return session.run(query, name=obj)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 20, 407 | "metadata": { 408 | "collapsed": true, 409 | "deletable": true, 410 | "editable": true 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "def find_object_location(obj):\n", 415 | " query = '''\n", 416 | " // Find the last person in contact with the object\n", 417 | " MATCH (:OBJECT {name:$obj})-[:HEAD]->(relation:RELATION)<-[:R_BEGIN]-(subject:SUBJECT)\n", 418 | "\n", 419 | " // Acquire\n", 420 | " MATCH (subject)-[:HEAD]->(head_relation:RELATION)\n", 421 | " \n", 422 | " MATCH p=(head_relation)<-[next:S_NEXT *1..20]-(prevRelation:RELATION)\n", 423 | " WHERE prevRelation.name IN $movement\n", 424 | " WITH size(next) as dist, p, relation\n", 425 | " ORDER BY dist\n", 426 | " WITH filter(n IN nodes(p) WHERE n.name IN $movement)[0] AS shortest, relation\n", 427 | " MATCH (shortest)-[:R_END]->(object_acquire:OBJECT)\n", 428 | " \n", 429 | " WITH relation, 
object_acquire\n", 430 | "\n", 431 | " // Release\n", 432 | " MATCH p=(relation)<-[next:S_NEXT *1..20]-(prevRelation:RELATION)\n", 433 | " WHERE prevRelation.name IN $movement\n", 434 | " WITH size(next) as dist, p, object_acquire, relation\n", 435 | " ORDER BY dist\n", 436 | " WITH filter(n IN nodes(p) WHERE n.name IN $movement)[0] AS shortest, object_acquire, relation\n", 437 | " MATCH (shortest)-[:R_END]->(object_release:OBJECT)\n", 438 | "\n", 439 | " RETURN DISTINCT object_acquire.name AS acquire, object_release.name AS release, relation.name AS relation\n", 440 | " '''\n", 441 | " return session.run(query, obj=obj, movement=movement_actions)" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 21, 447 | "metadata": { 448 | "collapsed": true, 449 | "deletable": true, 450 | "editable": true 451 | }, 452 | "outputs": [], 453 | "source": [ 454 | "## 98.6% Accuracy\n", 455 | "\n", 456 | "# def find_object_location(obj):\n", 457 | "# query = '''\n", 458 | "# // Find the last person in contact with the object\n", 459 | "# MATCH (:OBJECT {name:$obj})-[:HEAD]->(relation:RELATION)<-[:R_BEGIN]-(subject:SUBJECT)\n", 460 | "\n", 461 | "# // Acquire\n", 462 | "# MATCH (subject)-[:HEAD]->(:RELATION)-[:R_END]->(object_acquire:OBJECT)\n", 463 | "\n", 464 | "# // Release\n", 465 | "# MATCH p=(relation)<-[next:S_NEXT *1..20]-(prevRelation:RELATION)\n", 466 | "# WHERE prevRelation.name IN $movement\n", 467 | "# WITH size(next) as dist, p, object_acquire, relation\n", 468 | "# ORDER BY dist\n", 469 | "# WITH filter(n IN nodes(p) WHERE n.name IN $movement)[0] AS shortest, object_acquire, relation\n", 470 | "# MATCH (shortest)-[:R_END]->(object_release:OBJECT)\n", 471 | "\n", 472 | "# RETURN DISTINCT object_acquire.name AS acquire, object_release.name AS release, relation.name AS relation\n", 473 | "# '''\n", 474 | "# return session.run(query, obj=obj, movement=movement_actions)" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 
24, 480 | "metadata": { 481 | "collapsed": false, 482 | "deletable": true, 483 | "editable": true 484 | }, 485 | "outputs": [ 486 | { 487 | "data": { 488 | "text/plain": [ 489 | "" 490 | ] 491 | }, 492 | "execution_count": 24, 493 | "metadata": {}, 494 | "output_type": "execute_result" 495 | } 496 | ], 497 | "source": [ 498 | "build_v4_graph(start=0, end=6)\n", 499 | "\n", 500 | "session = driver.session()\n", 501 | "find_object_location('football').single()" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 25, 507 | "metadata": { 508 | "collapsed": false, 509 | "deletable": true, 510 | "editable": true 511 | }, 512 | "outputs": [ 513 | { 514 | "data": { 515 | "text/plain": [ 516 | "" 517 | ] 518 | }, 519 | "execution_count": 25, 520 | "metadata": {}, 521 | "output_type": "execute_result" 522 | } 523 | ], 524 | "source": [ 525 | "session = driver.session()\n", 526 | "find_object_location('football').single()" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": { 532 | "deletable": true, 533 | "editable": true 534 | }, 535 | "source": [ 536 | "## Build the Graph" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 26, 542 | "metadata": { 543 | "collapsed": false, 544 | "deletable": true, 545 | "editable": true 546 | }, 547 | "outputs": [], 548 | "source": [ 549 | "build_v4_graph()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": { 555 | "deletable": true, 556 | "editable": true 557 | }, 558 | "source": [ 559 | "" 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": { 565 | "deletable": true, 566 | "editable": true 567 | }, 568 | "source": [ 569 | "## Calcualte an Accuracy Score" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 27, 575 | "metadata": { 576 | "collapsed": true, 577 | "deletable": true, 578 | "editable": true 579 | }, 580 | "outputs": [], 581 | "source": [ 582 | "def get_answers(row):\n", 583 | " '''Given 
an input row merge the statement in the graph, \n", 584 | " or query the graph if it is a question'''\n", 585 | " if row.type == 'S':\n", 586 | " subject,relation,obj = row.extracted\n", 587 | " session.run(v4_query, subject=subject, relation=relation, obj=obj)\n", 588 | " return ''\n", 589 | " elif row.type == 'Q':\n", 590 | " obj = row.extracted\n", 591 | " # WARNING: do not consume the result (e.g., call .consume() or .single()) \n", 592 | " # until the entire iteration is done.\n", 593 | " # Failure to do so may cause the queries to be VERY slow!\n", 594 | " return find_object_location(obj)" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 28, 600 | "metadata": { 601 | "collapsed": true, 602 | "deletable": true, 603 | "editable": true 604 | }, 605 | "outputs": [], 606 | "source": [ 607 | "def traverse(result):\n", 608 | " if result['relation'] in acquire_actions:\n", 609 | " return result['acquire']\n", 610 | " else:\n", 611 | " return result['release']" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 30, 617 | "metadata": { 618 | "collapsed": false, 619 | "deletable": true, 620 | "editable": true 621 | }, 622 | "outputs": [], 623 | "source": [ 624 | "reset_db()" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 31, 630 | "metadata": { 631 | "collapsed": false, 632 | "deletable": true, 633 | "editable": true, 634 | "scrolled": true 635 | }, 636 | "outputs": [], 637 | "source": [ 638 | "session = driver.session()\n", 639 | "results = data.apply(get_answers, axis=1)\n", 640 | "results = [x for x in results if x != '']\n", 641 | "predicted = [traverse(result.single()) for result in results]" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 32, 647 | "metadata": { 648 | "collapsed": false, 649 | "deletable": true, 650 | "editable": true 651 | }, 652 | "outputs": [ 653 | { 654 | "data": { 655 | "text/plain": [ 656 | "['garden', 'garden', 'hallway', 
'hallway', 'kitchen']" 657 | ] 658 | }, 659 | "execution_count": 32, 660 | "metadata": {}, 661 | "output_type": "execute_result" 662 | } 663 | ], 664 | "source": [ 665 | "predicted[:5]" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": 33, 671 | "metadata": { 672 | "collapsed": true, 673 | "deletable": true, 674 | "editable": true 675 | }, 676 | "outputs": [], 677 | "source": [ 678 | "actual = list(questions().answer)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": 34, 684 | "metadata": { 685 | "collapsed": false, 686 | "deletable": true, 687 | "editable": true 688 | }, 689 | "outputs": [ 690 | { 691 | "data": { 692 | "text/plain": [ 693 | "['garden', 'garden', 'hallway', 'hallway', 'kitchen']" 694 | ] 695 | }, 696 | "execution_count": 34, 697 | "metadata": {}, 698 | "output_type": "execute_result" 699 | } 700 | ], 701 | "source": [ 702 | "actual[:5]" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 35, 708 | "metadata": { 709 | "collapsed": false, 710 | "deletable": true, 711 | "editable": true 712 | }, 713 | "outputs": [ 714 | { 715 | "data": { 716 | "text/plain": [ 717 | "1.0" 718 | ] 719 | }, 720 | "execution_count": 35, 721 | "metadata": {}, 722 | "output_type": "execute_result" 723 | } 724 | ], 725 | "source": [ 726 | "accuracy_score(actual, predicted)" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": 36, 732 | "metadata": { 733 | "collapsed": false, 734 | "deletable": true, 735 | "editable": true 736 | }, 737 | "outputs": [], 738 | "source": [ 739 | "def find_incorrect(actual, predicted):\n", 740 | " z = list(zip(actual, predicted))\n", 741 | " return [(i, x[0], x[1]) for i,x in enumerate(z) if x[0] != x[1]]" 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": null, 747 | "metadata": { 748 | "collapsed": true, 749 | "deletable": true, 750 | "editable": true 751 | }, 752 | "outputs": [], 753 | "source": [] 754 | } 755 | ], 
756 | "metadata": { 757 | "kernelspec": { 758 | "display_name": "Python 3", 759 | "language": "python", 760 | "name": "python3" 761 | }, 762 | "language_info": { 763 | "codemirror_mode": { 764 | "name": "ipython", 765 | "version": 3 766 | }, 767 | "file_extension": ".py", 768 | "mimetype": "text/x-python", 769 | "name": "python", 770 | "nbconvert_exporter": "python", 771 | "pygments_lexer": "ipython3", 772 | "version": "3.6.1" 773 | } 774 | }, 775 | "nbformat": 4, 776 | "nbformat_minor": 2 777 | } 778 | -------------------------------------------------------------------------------- /notebooks/dynamic_memory_3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Memory Representation in Dialogue Systems (Part 3)\n", 8 | "\n", 9 | "Under construction, will update with explanations when finished.\n", 10 | "\n", 11 | "## Import" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd\n", 23 | "import numpy as np\n", 24 | "import nltk\n", 25 | "from sklearn.metrics import accuracy_score\n", 26 | "from neo4j.v1 import GraphDatabase, basic_auth\n", 27 | "from collections import defaultdict" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "(414, 52256)" 41 | ] 42 | }, 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "refs_utts = pd.read_pickle('resources/utts_refs.pkl')\n", 50 | "props = pd.read_pickle('resources/restaurants_props.pkl')\n", 51 | "len(refs_utts), len(props)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": { 58 | "collapsed": false 59 | }, 
60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/html": [ 64 | "
\n", 65 | "\n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | "
textbotoindmaskgidtarget
0[i, want, a, moderately, priced, restaurant, i...api_call R_cuisine west moderatetrn2True2prezzo
2[cheap, restaurant, in, the, north, part, of, ...api_call R_cuisine north cheaptrn2True11da_vinci_pizzeria
3[cheap, restaurant, in, the, south, part, of, ...api_call R_cuisine south cheaptrn2True12the_lucky_star
4[cheap, restaurant, serving, indian, food]api_call indian R_location cheaptrn2True15the_gandhi
5[thai, food]api_call thai R_location R_pricetrn2True22bangkok_city
\n", 131 | "
" 132 | ], 133 | "text/plain": [ 134 | " text \\\n", 135 | "0 [i, want, a, moderately, priced, restaurant, i... \n", 136 | "2 [cheap, restaurant, in, the, north, part, of, ... \n", 137 | "3 [cheap, restaurant, in, the, south, part, of, ... \n", 138 | "4 [cheap, restaurant, serving, indian, food] \n", 139 | "5 [thai, food] \n", 140 | "\n", 141 | " bot o ind mask gid target \n", 142 | "0 api_call R_cuisine west moderate trn 2 True 2 prezzo \n", 143 | "2 api_call R_cuisine north cheap trn 2 True 11 da_vinci_pizzeria \n", 144 | "3 api_call R_cuisine south cheap trn 2 True 12 the_lucky_star \n", 145 | "4 api_call indian R_location cheap trn 2 True 15 the_gandhi \n", 146 | "5 api_call thai R_location R_price trn 2 True 22 bangkok_city " 147 | ] 148 | }, 149 | "execution_count": 3, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "refs_utts[:5]" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 4, 161 | "metadata": { 162 | "collapsed": false 163 | }, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/html": [ 168 | "
\n", 169 | "\n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | "
rnameattr_keyattr_value
3saint_johns_chop_houseR_cuisinebritish
4saint_johns_chop_houseR_locationwest
7saint_johns_chop_houseR_pricemoderate
10prezzoR_cuisineitalian
11prezzoR_locationwest
\n", 211 | "
" 212 | ], 213 | "text/plain": [ 214 | " rname attr_key attr_value\n", 215 | "3 saint_johns_chop_house R_cuisine british\n", 216 | "4 saint_johns_chop_house R_location west\n", 217 | "7 saint_johns_chop_house R_price moderate\n", 218 | "10 prezzo R_cuisine italian\n", 219 | "11 prezzo R_location west" 220 | ] 221 | }, 222 | "execution_count": 4, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "props[:5]" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "## Process Text" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 5, 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "stemmer = nltk.stem.snowball.EnglishStemmer()\n", 247 | "\n", 248 | "def stem(sentence):\n", 249 | " return [stemmer.stem(w) for w in sentence]" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 6, 255 | "metadata": { 256 | "collapsed": false 257 | }, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "414" 263 | ] 264 | }, 265 | "execution_count": 6, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "test = pd.DataFrame()\n", 272 | "test['text'] = [stem(s) for s in refs_utts.text]\n", 273 | "test['frame'] = [tuple(stem(f.split()[1:])) for f in refs_utts.bot]\n", 274 | "len(test)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 7, 280 | "metadata": { 281 | "collapsed": false 282 | }, 283 | "outputs": [ 284 | { 285 | "data": { 286 | "text/plain": [ 287 | "405" 288 | ] 289 | }, 290 | "execution_count": 7, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "# Remove poorly formatted frames\n", 297 | "test = test[test.frame.map(len) == 3]\n", 298 | "len(test)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 8, 304 | 
"metadata": { 305 | "collapsed": false 306 | }, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/html": [ 311 | "
\n", 312 | "\n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | "
textframe
0[i, want, a, moder, price, restaur, in, the, w...(r_cuisin, west, moder)
1[cheap, restaur, in, the, north, part, of, town](r_cuisin, north, cheap)
2[cheap, restaur, in, the, south, part, of, town](r_cuisin, south, cheap)
3[cheap, restaur, serv, indian, food](indian, r_locat, cheap)
4[thai, food](thai, r_locat, r_price)
\n", 348 | "
" 349 | ], 350 | "text/plain": [ 351 | " text frame\n", 352 | "0 [i, want, a, moder, price, restaur, in, the, w... (r_cuisin, west, moder)\n", 353 | "1 [cheap, restaur, in, the, north, part, of, town] (r_cuisin, north, cheap)\n", 354 | "2 [cheap, restaur, in, the, south, part, of, town] (r_cuisin, south, cheap)\n", 355 | "3 [cheap, restaur, serv, indian, food] (indian, r_locat, cheap)\n", 356 | "4 [thai, food] (thai, r_locat, r_price)" 357 | ] 358 | }, 359 | "execution_count": 8, 360 | "metadata": {}, 361 | "output_type": "execute_result" 362 | } 363 | ], 364 | "source": [ 365 | "test[:5]" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 9, 371 | "metadata": { 372 | "collapsed": true 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "knowledge = pd.DataFrame()\n", 377 | "knowledge['restaurant'] = props.rname.copy()\n", 378 | "knowledge['key'] = [stemmer.stem(s) for s in props.attr_key]\n", 379 | "knowledge['value'] = [stemmer.stem(s) for s in props.attr_value]" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 11, 385 | "metadata": { 386 | "collapsed": false 387 | }, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/html": [ 392 | "
\n", 393 | "\n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | "
restaurantkeyvalue
3saint_johns_chop_houser_cuisinbritish
4saint_johns_chop_houser_locatwest
7saint_johns_chop_houser_pricemoder
10prezzor_cuisinitalian
11prezzor_locatwest
\n", 435 | "
" 436 | ], 437 | "text/plain": [ 438 | " restaurant key value\n", 439 | "3 saint_johns_chop_house r_cuisin british\n", 440 | "4 saint_johns_chop_house r_locat west\n", 441 | "7 saint_johns_chop_house r_price moder\n", 442 | "10 prezzo r_cuisin italian\n", 443 | "11 prezzo r_locat west" 444 | ] 445 | }, 446 | "execution_count": 11, 447 | "metadata": {}, 448 | "output_type": "execute_result" 449 | } 450 | ], 451 | "source": [ 452 | "knowledge[:5]" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 11, 458 | "metadata": { 459 | "collapsed": false 460 | }, 461 | "outputs": [], 462 | "source": [ 463 | "# A dictionary of keys to the list of values they can take\n", 464 | "# In this instance, keys form mutually exclusive lists of values\n", 465 | "types = knowledge[['key', 'value']] \\\n", 466 | " .groupby('key') \\\n", 467 | " .aggregate(lambda x: tuple(set(x))) \\\n", 468 | " .reset_index() \\\n", 469 | " .set_index('key') \\\n", 470 | " .value \\\n", 471 | " .to_dict()" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 12, 477 | "metadata": { 478 | "collapsed": false 479 | }, 480 | "outputs": [ 481 | { 482 | "data": { 483 | "text/plain": [ 484 | "('asian_orient', 'vietnames', 'lebanes', 'african', 'thai')" 485 | ] 486 | }, 487 | "execution_count": 12, 488 | "metadata": {}, 489 | "output_type": "execute_result" 490 | } 491 | ], 492 | "source": [ 493 | "types['r_cuisin'][:5]" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 13, 499 | "metadata": { 500 | "collapsed": false 501 | }, 502 | "outputs": [ 503 | { 504 | "data": { 505 | "text/plain": [ 506 | "('centr', 'south', 'west', 'east', 'north')" 507 | ] 508 | }, 509 | "execution_count": 13, 510 | "metadata": {}, 511 | "output_type": "execute_result" 512 | } 513 | ], 514 | "source": [ 515 | "types['r_locat']" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 14, 521 | "metadata": { 522 | "collapsed": false 523 | }, 
524 | "outputs": [ 525 | { 526 | "data": { 527 | "text/plain": [ 528 | "('expens', 'moder', 'cheap')" 529 | ] 530 | }, 531 | "execution_count": 14, 532 | "metadata": {}, 533 | "output_type": "execute_result" 534 | } 535 | ], 536 | "source": [ 537 | "types['r_price']" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": {}, 543 | "source": [ 544 | "## Create Knowledge Graph" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 15, 550 | "metadata": { 551 | "collapsed": true 552 | }, 553 | "outputs": [], 554 | "source": [ 555 | "# Create a neo4j session\n", 556 | "driver = GraphDatabase.driver('bolt://localhost:7687', auth=basic_auth('neo4j', 'neo4j'))" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 16, 562 | "metadata": { 563 | "collapsed": true 564 | }, 565 | "outputs": [], 566 | "source": [ 567 | "# WARNING: This will clear the database when run!\n", 568 | "def reset_db():\n", 569 | " session = driver.session()\n", 570 | " session.run('MATCH (n) DETACH DELETE n')" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 17, 576 | "metadata": { 577 | "collapsed": false 578 | }, 579 | "outputs": [], 580 | "source": [ 581 | "reset_db()" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 18, 587 | "metadata": { 588 | "collapsed": false 589 | }, 590 | "outputs": [], 591 | "source": [ 592 | "session = driver.session()\n", 593 | "\n", 594 | "for i,row in knowledge.iterrows():\n", 595 | " subject, relation, obj = row.restaurant, row.key, row.value\n", 596 | " session.run('''\n", 597 | " MERGE (s:SUBJECT {name: $subject}) \n", 598 | " MERGE (o:OBJECT {name: $obj}) \n", 599 | " MERGE (s)-[r:RELATION {name: $relation}]->(o)\n", 600 | " ''', { \n", 601 | " 'subject': subject,\n", 602 | " 'relation': relation,\n", 603 | " 'obj': obj\n", 604 | " })" 605 | ] 606 | }, 607 | { 608 | "cell_type": "markdown", 609 | "metadata": {}, 610 | "source": [ 611 | "## 
Test\n", 612 | "#### Baseline\n", 613 | "The baseline accuracy is the slot accuracy, calculated by the assumption of not knowing any frame values for any of the sentences." 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 19, 619 | "metadata": { 620 | "collapsed": false 621 | }, 622 | "outputs": [ 623 | { 624 | "data": { 625 | "text/plain": [ 626 | "('r_cuisin', 'r_locat', 'r_price')" 627 | ] 628 | }, 629 | "execution_count": 19, 630 | "metadata": {}, 631 | "output_type": "execute_result" 632 | } 633 | ], 634 | "source": [ 635 | "dont_know = tuple(types.keys())\n", 636 | "dont_know" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 20, 642 | "metadata": { 643 | "collapsed": false 644 | }, 645 | "outputs": [], 646 | "source": [ 647 | "base_predicted = list(dont_know) * len(test)\n", 648 | "base_actual = [w for frame in test.frame for w in frame]" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 21, 654 | "metadata": { 655 | "collapsed": false 656 | }, 657 | "outputs": [ 658 | { 659 | "data": { 660 | "text/plain": [ 661 | "0.45267489711934156" 662 | ] 663 | }, 664 | "execution_count": 21, 665 | "metadata": {}, 666 | "output_type": "execute_result" 667 | } 668 | ], 669 | "source": [ 670 | "accuracy_score(base_actual, base_predicted)" 671 | ] 672 | }, 673 | { 674 | "cell_type": "markdown", 675 | "metadata": {}, 676 | "source": [ 677 | "#### Accuracy" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": 91, 683 | "metadata": { 684 | "collapsed": false 685 | }, 686 | "outputs": [], 687 | "source": [ 688 | "# Cache properties from DB\n", 689 | "# Running this query will obtain all properties at this point in time\n", 690 | "def get_properties():\n", 691 | " session = driver.session()\n", 692 | " return session.run('''\n", 693 | " MATCH ()-[r:RELATION]->(o:OBJECT) \n", 694 | " RETURN collect(distinct o.name) AS properties\n", 695 | " ''').single()['properties']" 696 | 
] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": 92, 701 | "metadata": { 702 | "collapsed": true 703 | }, 704 | "outputs": [], 705 | "source": [ 706 | "# def get_types():\n", 707 | "# session = driver.session()\n", 708 | "# result = session.run('''\n", 709 | "# MATCH ()-[r:RELATION]->(o:OBJECT) \n", 710 | "# RETURN collect(distinct [r.name, o.name]) AS pair\n", 711 | "# ''').single()[0]\n", 712 | " \n", 713 | "# g_types = defaultdict(lambda: [])\n", 714 | "# for k,v in result:\n", 715 | "# g_types[k].append(v)\n", 716 | "# return g_types" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": 115, 722 | "metadata": { 723 | "collapsed": false 724 | }, 725 | "outputs": [], 726 | "source": [ 727 | "properties = set(get_properties())" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 116, 733 | "metadata": { 734 | "collapsed": true 735 | }, 736 | "outputs": [], 737 | "source": [ 738 | "# Hotword listener\n", 739 | "def is_hotword(word):\n", 740 | " return word in properties" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 117, 746 | "metadata": { 747 | "collapsed": false 748 | }, 749 | "outputs": [ 750 | { 751 | "data": { 752 | "text/plain": [ 753 | "(True, False)" 754 | ] 755 | }, 756 | "execution_count": 117, 757 | "metadata": {}, 758 | "output_type": "execute_result" 759 | } 760 | ], 761 | "source": [ 762 | "is_hotword('british'), is_hotword('python')" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": 122, 768 | "metadata": { 769 | "collapsed": true 770 | }, 771 | "outputs": [], 772 | "source": [ 773 | "# Issue DB queries\n", 774 | "def find_slot(prop):\n", 775 | " return session.run('''\n", 776 | " MATCH (s:SUBJECT)-[r:RELATION]->(o:OBJECT {name:$name}) \n", 777 | " RETURN collect(distinct [r.name, o.name]) AS properties\n", 778 | " ''', {\n", 779 | " 'name': prop\n", 780 | " })\n", 781 | "\n", 782 | "def extract(result):\n", 783 | " return 
result.single()['properties'][0]" 784 | ] 785 | }, 786 | { 787 | "cell_type": "code", 788 | "execution_count": 123, 789 | "metadata": { 790 | "collapsed": false 791 | }, 792 | "outputs": [ 793 | { 794 | "data": { 795 | "text/plain": [ 796 | "['r_locat', 'west']" 797 | ] 798 | }, 799 | "execution_count": 123, 800 | "metadata": {}, 801 | "output_type": "execute_result" 802 | } 803 | ], 804 | "source": [ 805 | "session = driver.session()\n", 806 | "extract(find_slot('west'))" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": 183, 812 | "metadata": { 813 | "collapsed": false 814 | }, 815 | "outputs": [], 816 | "source": [ 817 | "session = driver.session()\n", 818 | "all_slots = [[find_slot(word) for word in sentence if is_hotword(word)] for sentence in test.text]\n", 819 | "extracted_slots = [[tuple(extract(slot)) for slot in slots] for slots in all_slots]\n", 820 | "test['slots'] = extracted_slots" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": 184, 826 | "metadata": { 827 | "collapsed": true 828 | }, 829 | "outputs": [], 830 | "source": [ 831 | "def to_frame(slots):\n", 832 | " frame = list(dont_know)\n", 833 | " s = dict(slots)\n", 834 | " \n", 835 | " for i,x in enumerate(frame):\n", 836 | " if x in s.keys():\n", 837 | " frame[i] = s[x]\n", 838 | " \n", 839 | " return tuple(frame)" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": 185, 845 | "metadata": { 846 | "collapsed": false 847 | }, 848 | "outputs": [], 849 | "source": [ 850 | "test['predicted'] = [to_frame(slot) for slot in test.slots]" 851 | ] 852 | }, 853 | { 854 | "cell_type": "code", 855 | "execution_count": 186, 856 | "metadata": { 857 | "collapsed": false 858 | }, 859 | "outputs": [ 860 | { 861 | "data": { 862 | "text/html": [ 863 | "
\n", 864 | "\n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | "
textframeslotspredicted
0[i, want, a, moder, price, restaur, in, the, w...(r_cuisin, west, moder)[(r_price, moder), (r_locat, west)](r_cuisin, west, moder)
1[cheap, restaur, in, the, north, part, of, town](r_cuisin, north, cheap)[(r_price, cheap), (r_locat, north)](r_cuisin, north, cheap)
2[cheap, restaur, in, the, south, part, of, town](r_cuisin, south, cheap)[(r_price, cheap), (r_locat, south)](r_cuisin, south, cheap)
3[cheap, restaur, serv, indian, food](indian, r_locat, cheap)[(r_price, cheap), (r_cuisin, indian)](indian, r_locat, cheap)
4[thai, food](thai, r_locat, r_price)[(r_cuisin, thai)](thai, r_locat, r_price)
\n", 912 | "
" 913 | ], 914 | "text/plain": [ 915 | " text \\\n", 916 | "0 [i, want, a, moder, price, restaur, in, the, w... \n", 917 | "1 [cheap, restaur, in, the, north, part, of, town] \n", 918 | "2 [cheap, restaur, in, the, south, part, of, town] \n", 919 | "3 [cheap, restaur, serv, indian, food] \n", 920 | "4 [thai, food] \n", 921 | "\n", 922 | " frame slots \\\n", 923 | "0 (r_cuisin, west, moder) [(r_price, moder), (r_locat, west)] \n", 924 | "1 (r_cuisin, north, cheap) [(r_price, cheap), (r_locat, north)] \n", 925 | "2 (r_cuisin, south, cheap) [(r_price, cheap), (r_locat, south)] \n", 926 | "3 (indian, r_locat, cheap) [(r_price, cheap), (r_cuisin, indian)] \n", 927 | "4 (thai, r_locat, r_price) [(r_cuisin, thai)] \n", 928 | "\n", 929 | " predicted \n", 930 | "0 (r_cuisin, west, moder) \n", 931 | "1 (r_cuisin, north, cheap) \n", 932 | "2 (r_cuisin, south, cheap) \n", 933 | "3 (indian, r_locat, cheap) \n", 934 | "4 (thai, r_locat, r_price) " 935 | ] 936 | }, 937 | "execution_count": 186, 938 | "metadata": {}, 939 | "output_type": "execute_result" 940 | } 941 | ], 942 | "source": [ 943 | "test[:5]" 944 | ] 945 | }, 946 | { 947 | "cell_type": "code", 948 | "execution_count": 173, 949 | "metadata": { 950 | "collapsed": false 951 | }, 952 | "outputs": [], 953 | "source": [ 954 | "predicted = [w for frame in test.predicted for w in frame]\n", 955 | "actual = [w for frame in test.frame for w in frame]" 956 | ] 957 | }, 958 | { 959 | "cell_type": "code", 960 | "execution_count": 187, 961 | "metadata": { 962 | "collapsed": false 963 | }, 964 | "outputs": [ 965 | { 966 | "data": { 967 | "text/plain": [ 968 | "0.96954732510288066" 969 | ] 970 | }, 971 | "execution_count": 187, 972 | "metadata": {}, 973 | "output_type": "execute_result" 974 | } 975 | ], 976 | "source": [ 977 | "accuracy_score(actual, predicted)" 978 | ] 979 | }, 980 | { 981 | "cell_type": "code", 982 | "execution_count": 193, 983 | "metadata": { 984 | "collapsed": false 985 | }, 986 | "outputs": [ 987 | { 988 | 
"name": "stdout", 989 | "output_type": "stream", 990 | "text": [ 991 | " | r |\n", 992 | " | _ r r |\n", 993 | " | c _ _ e |\n", 994 | " | u l c p m n s x |\n", 995 | " | i o h r o e o w o p |\n", 996 | " | s c e i d a r e u e |\n", 997 | " | i a a c e s t s t n |\n", 998 | " | n t p e r t h t h s |\n", 999 | "---------+-----------------------------------------------------------------------+\n", 1000 | "r_cuisin | <18.9%> . . . . . . . . . |\n", 1001 | " r_locat | . <14.7%> . . . . . 0.1% 0.6% . |\n", 1002 | " cheap | . . <11.9%> 0.2% . . . . . . |\n", 1003 | " r_price | . . 0.1% <10.5%> 0.2% . . . . 0.1% |\n", 1004 | " moder | . . . 0.2% <8.2%> . . . . . |\n", 1005 | " east | . . . . . <6.4%> . . . . |\n", 1006 | " north | . . . . . . <5.1%> . . . |\n", 1007 | " west | . 0.2% . . . . . <3.3%> . . |\n", 1008 | " south | . . . . . . . . <3.0%> . |\n", 1009 | " expens | . . . . . . . . . <1.9%>|\n", 1010 | "---------+-----------------------------------------------------------------------+\n", 1011 | "(row = reference; col = test)\n", 1012 | "\n" 1013 | ] 1014 | } 1015 | ], 1016 | "source": [ 1017 | "cm = nltk.ConfusionMatrix(actual, predicted)\n", 1018 | "print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=10))" 1019 | ] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "execution_count": 207, 1024 | "metadata": { 1025 | "collapsed": false 1026 | }, 1027 | "outputs": [ 1028 | { 1029 | "data": { 1030 | "text/html": [ 1031 | "
\n", 1032 | "\n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | 
" \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 
| " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 
1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | "
textframeslotspredicted
1[cheap, restaur, in, the, north, part, of, town](r_cuisin, north, cheap)[(r_price, cheap), (r_locat, north)](r_cuisin, north, cheap)
2[cheap, restaur, in, the, south, part, of, town](r_cuisin, south, cheap)[(r_price, cheap), (r_locat, south)](r_cuisin, south, cheap)
3[cheap, restaur, serv, indian, food](indian, r_locat, cheap)[(r_price, cheap), (r_cuisin, indian)](indian, r_locat, cheap)
7[im, look, for, a, cheap, restaur, in, the, no...(r_cuisin, north, cheap)[(r_price, cheap), (r_locat, north)](r_cuisin, north, cheap)
10[cheap, restaur](r_cuisin, r_locat, cheap)[(r_price, cheap)](r_cuisin, r_locat, cheap)
12[i, want, a, cheap, restaur, in, the, west, pa...(r_cuisin, west, cheap)[(r_price, cheap), (r_locat, west)](r_cuisin, west, cheap)
14[i, am, look, for, a, cheap, restaur, in, the,...(r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
15[im, look, for, a, cheap, restaur, serv, inter...(intern, r_locat, cheap)[(r_price, cheap), (r_cuisin, intern)](intern, r_locat, cheap)
17[look, for, a, cheap, restaur, in, the, south,...(r_cuisin, south, cheap)[(r_price, cheap), (r_locat, south)](r_cuisin, south, cheap)
20[look, for, someth, cheap, in, the, north, sid...(r_cuisin, north, cheap)[(r_price, cheap), (r_locat, north)](r_cuisin, north, cheap)
33[im, look, for, a, cheap, restaur, in, the, so...(r_cuisin, r_locat, cheap)[(r_price, cheap), (r_locat, south)](r_cuisin, south, cheap)
40[i, want, a, cheap, restaur, in, the, south, p...(r_cuisin, r_locat, cheap)[(r_price, cheap), (r_locat, south)](r_cuisin, south, cheap)
44[i, need, a, cheap, restaur, in, the, south, p...(r_cuisin, south, cheap)[(r_price, cheap), (r_locat, south)](r_cuisin, south, cheap)
46[i, want, a, cheap, restaur, in, the, east, pa...(r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
57[breath, id, like, a, cheap, restaur, in, the,...(r_cuisin, south, cheap)[(r_price, cheap), (r_locat, south)](r_cuisin, south, cheap)
61[im, look, for, a, cheap, restaur, in, the, we...(r_cuisin, west, cheap)[(r_price, cheap), (r_locat, west)](r_cuisin, west, cheap)
62[i, would, like, to, find, a, cheap, restaur, ...(r_cuisin, south, cheap)[(r_price, cheap), (r_locat, south)](r_cuisin, south, cheap)
68[cheap, restaur, in, the, north, part, of, town](r_cuisin, north, cheap)[(r_price, cheap), (r_locat, north)](r_cuisin, north, cheap)
71[i, would, like, a, cheap, restaur, in, the, n...(r_cuisin, north, cheap)[(r_price, cheap), (r_locat, north)](r_cuisin, north, cheap)
72[i, would, like, a, cheap, restaur, in, the, w...(r_cuisin, west, cheap)[(r_price, cheap), (r_locat, west)](r_cuisin, west, cheap)
76[im, look, for, a, cheap, restaur, in, the, we...(r_cuisin, west, cheap)[(r_price, cheap), (r_locat, west)](r_cuisin, west, cheap)
82[im, look, for, a, cheap, restaur, in, the, so...(r_cuisin, south, cheap)[(r_price, cheap), (r_locat, south)](r_cuisin, south, cheap)
83[im, look, for, a, cheap, restaur, in, the, ea...(r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
84[cheap, restaur, serv, spanish, food](spanish, r_locat, cheap)[(r_price, cheap), (r_cuisin, spanish)](spanish, r_locat, cheap)
86[im, look, for, a, cheap, restaur, in, the, no...(r_cuisin, north, cheap)[(r_price, cheap), (r_locat, north)](r_cuisin, north, cheap)
89[im, look, for, a, cheap, restaur, in, the, we...(r_cuisin, west, cheap)[(r_price, cheap), (r_locat, west)](r_cuisin, west, cheap)
92[im, look, for, a, cheap, restaur, in, the, ea...(r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
96[i, need, a, cheap, restaur, in, the, west, pa...(r_cuisin, west, cheap)[(r_price, cheap), (r_locat, west)](r_cuisin, west, cheap)
98[im, look, for, a, cheap, restaur, in, the, no...(r_cuisin, north, cheap)[(r_price, cheap), (r_locat, north)](r_cuisin, north, cheap)
99[im, look, for, a, cheap, restaur, in, the, no...(r_cuisin, north, cheap)[(r_price, cheap), (r_locat, north)](r_cuisin, north, cheap)
...............
337[look, for, a, cheap, restaur, in, the, east, ...(r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
340[cheap, restaur, west, part, of, town](r_cuisin, west, cheap)[(r_price, cheap), (r_locat, west)](r_cuisin, west, cheap)
343[im, look, for, a, cheap, restaur, and, it, sh...(r_cuisin, north, cheap)[(r_price, cheap), (r_locat, north)](r_cuisin, north, cheap)
346[im, look, for, a, cheap, restaur, in, the, we...(r_cuisin, west, cheap)[(r_price, cheap), (r_locat, west)](r_cuisin, west, cheap)
350[cheap, restaur, south, part, of, town](r_cuisin, south, cheap)[(r_price, cheap), (r_locat, south)](r_cuisin, south, cheap)
351[iam, look, for, a, cheap, restaur, and, it, s...(r_cuisin, north, cheap)[(r_price, cheap), (r_locat, north)](r_cuisin, north, cheap)
352[uh, i, want, a, cheap, restaur, and, it, shou...(r_cuisin, north, cheap)[(r_price, cheap), (r_locat, north)](r_cuisin, north, cheap)
353[im, look, for, a, cheap, restaur, in, the, so...(r_cuisin, south, cheap)[(r_price, cheap), (r_locat, south)](r_cuisin, south, cheap)
354[i, would, like, a, cheap, restaur, in, the, s...(r_cuisin, r_locat, cheap)[(r_price, cheap), (r_locat, south)](r_cuisin, south, cheap)
356[can, i, have, a, cheap, restaur, in, the, wes...(r_cuisin, west, cheap)[(r_price, cheap), (r_locat, west)](r_cuisin, west, cheap)
358[i, am, look, for, a, cheap, restaur, in, the,...(r_cuisin, west, cheap)[(r_price, cheap), (r_locat, west)](r_cuisin, west, cheap)
360[a, want, a, cheap, restaur, in, the, north, p...(r_cuisin, north, cheap)[(r_price, cheap), (r_locat, north)](r_cuisin, north, cheap)
365[cheap, restaur, in, th, east, part, of, town](r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
366[im, look, for, a, cheap, restaur, and, it, sh...(r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
369[cheap, restaur, in, th, east, part, of, town](r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
371[cheap, restaur, in, the, east, part, of, town](r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
372[im, look, for, a, cheap, restaur, in, the, ea...(r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
374[look, for, someth, cheap, on, the, east, part...(r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
377[im, look, for, a, cheap, restaur, serv, medit...(mediterranean, r_locat, cheap)[(r_price, cheap), (r_cuisin, mediterranean)](mediterranean, r_locat, cheap)
378[im, look, for, a, cheap, restaur, in, the, so...(r_cuisin, south, cheap)[(r_price, cheap), (r_locat, south)](r_cuisin, south, cheap)
384[im, look, for, a, cheap, restaur, that, serv,...(vietnames, r_locat, cheap)[(r_price, cheap), (r_cuisin, vietnames)](vietnames, r_locat, cheap)
389[im, look, for, a, cheap, restaur, in, the, ea...(r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
393[im, look, for, a, cheap, restaur, in, the, ea...(r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
396[cheap, restaur, east, part, of, town](r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
398[look, for, a, cheap, restaur, in, the, east, ...(r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
400[cheap, restaur, on, the, east, part, of, town](r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
406[cheap, restaur, in, the, east, part, of, town](r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
410[i, need, a, cheap, restaur, in, the, east, pa...(r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
412[cheap, restaur, east, part, of, town](r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
413[im, look, for, a, cheap, restaur, in, the, ea...(r_cuisin, east, cheap)[(r_price, cheap), (r_locat, east)](r_cuisin, east, cheap)
\n", 1472 | "

145 rows × 4 columns

\n", 1473 | "
" 1474 | ], 1475 | "text/plain": [ 1476 | " text \\\n", 1477 | "1 [cheap, restaur, in, the, north, part, of, town] \n", 1478 | "2 [cheap, restaur, in, the, south, part, of, town] \n", 1479 | "3 [cheap, restaur, serv, indian, food] \n", 1480 | "7 [im, look, for, a, cheap, restaur, in, the, no... \n", 1481 | "10 [cheap, restaur] \n", 1482 | "12 [i, want, a, cheap, restaur, in, the, west, pa... \n", 1483 | "14 [i, am, look, for, a, cheap, restaur, in, the,... \n", 1484 | "15 [im, look, for, a, cheap, restaur, serv, inter... \n", 1485 | "17 [look, for, a, cheap, restaur, in, the, south,... \n", 1486 | "20 [look, for, someth, cheap, in, the, north, sid... \n", 1487 | "33 [im, look, for, a, cheap, restaur, in, the, so... \n", 1488 | "40 [i, want, a, cheap, restaur, in, the, south, p... \n", 1489 | "44 [i, need, a, cheap, restaur, in, the, south, p... \n", 1490 | "46 [i, want, a, cheap, restaur, in, the, east, pa... \n", 1491 | "57 [breath, id, like, a, cheap, restaur, in, the,... \n", 1492 | "61 [im, look, for, a, cheap, restaur, in, the, we... \n", 1493 | "62 [i, would, like, to, find, a, cheap, restaur, ... \n", 1494 | "68 [cheap, restaur, in, the, north, part, of, town] \n", 1495 | "71 [i, would, like, a, cheap, restaur, in, the, n... \n", 1496 | "72 [i, would, like, a, cheap, restaur, in, the, w... \n", 1497 | "76 [im, look, for, a, cheap, restaur, in, the, we... \n", 1498 | "82 [im, look, for, a, cheap, restaur, in, the, so... \n", 1499 | "83 [im, look, for, a, cheap, restaur, in, the, ea... \n", 1500 | "84 [cheap, restaur, serv, spanish, food] \n", 1501 | "86 [im, look, for, a, cheap, restaur, in, the, no... \n", 1502 | "89 [im, look, for, a, cheap, restaur, in, the, we... \n", 1503 | "92 [im, look, for, a, cheap, restaur, in, the, ea... \n", 1504 | "96 [i, need, a, cheap, restaur, in, the, west, pa... \n", 1505 | "98 [im, look, for, a, cheap, restaur, in, the, no... \n", 1506 | "99 [im, look, for, a, cheap, restaur, in, the, no... \n", 1507 | ".. ... 
\n", 1508 | "337 [look, for, a, cheap, restaur, in, the, east, ... \n", 1509 | "340 [cheap, restaur, west, part, of, town] \n", 1510 | "343 [im, look, for, a, cheap, restaur, and, it, sh... \n", 1511 | "346 [im, look, for, a, cheap, restaur, in, the, we... \n", 1512 | "350 [cheap, restaur, south, part, of, town] \n", 1513 | "351 [iam, look, for, a, cheap, restaur, and, it, s... \n", 1514 | "352 [uh, i, want, a, cheap, restaur, and, it, shou... \n", 1515 | "353 [im, look, for, a, cheap, restaur, in, the, so... \n", 1516 | "354 [i, would, like, a, cheap, restaur, in, the, s... \n", 1517 | "356 [can, i, have, a, cheap, restaur, in, the, wes... \n", 1518 | "358 [i, am, look, for, a, cheap, restaur, in, the,... \n", 1519 | "360 [a, want, a, cheap, restaur, in, the, north, p... \n", 1520 | "365 [cheap, restaur, in, th, east, part, of, town] \n", 1521 | "366 [im, look, for, a, cheap, restaur, and, it, sh... \n", 1522 | "369 [cheap, restaur, in, th, east, part, of, town] \n", 1523 | "371 [cheap, restaur, in, the, east, part, of, town] \n", 1524 | "372 [im, look, for, a, cheap, restaur, in, the, ea... \n", 1525 | "374 [look, for, someth, cheap, on, the, east, part... \n", 1526 | "377 [im, look, for, a, cheap, restaur, serv, medit... \n", 1527 | "378 [im, look, for, a, cheap, restaur, in, the, so... \n", 1528 | "384 [im, look, for, a, cheap, restaur, that, serv,... \n", 1529 | "389 [im, look, for, a, cheap, restaur, in, the, ea... \n", 1530 | "393 [im, look, for, a, cheap, restaur, in, the, ea... \n", 1531 | "396 [cheap, restaur, east, part, of, town] \n", 1532 | "398 [look, for, a, cheap, restaur, in, the, east, ... \n", 1533 | "400 [cheap, restaur, on, the, east, part, of, town] \n", 1534 | "406 [cheap, restaur, in, the, east, part, of, town] \n", 1535 | "410 [i, need, a, cheap, restaur, in, the, east, pa... \n", 1536 | "412 [cheap, restaur, east, part, of, town] \n", 1537 | "413 [im, look, for, a, cheap, restaur, in, the, ea... 
\n", 1538 | "\n", 1539 | " frame \\\n", 1540 | "1 (r_cuisin, north, cheap) \n", 1541 | "2 (r_cuisin, south, cheap) \n", 1542 | "3 (indian, r_locat, cheap) \n", 1543 | "7 (r_cuisin, north, cheap) \n", 1544 | "10 (r_cuisin, r_locat, cheap) \n", 1545 | "12 (r_cuisin, west, cheap) \n", 1546 | "14 (r_cuisin, east, cheap) \n", 1547 | "15 (intern, r_locat, cheap) \n", 1548 | "17 (r_cuisin, south, cheap) \n", 1549 | "20 (r_cuisin, north, cheap) \n", 1550 | "33 (r_cuisin, r_locat, cheap) \n", 1551 | "40 (r_cuisin, r_locat, cheap) \n", 1552 | "44 (r_cuisin, south, cheap) \n", 1553 | "46 (r_cuisin, east, cheap) \n", 1554 | "57 (r_cuisin, south, cheap) \n", 1555 | "61 (r_cuisin, west, cheap) \n", 1556 | "62 (r_cuisin, south, cheap) \n", 1557 | "68 (r_cuisin, north, cheap) \n", 1558 | "71 (r_cuisin, north, cheap) \n", 1559 | "72 (r_cuisin, west, cheap) \n", 1560 | "76 (r_cuisin, west, cheap) \n", 1561 | "82 (r_cuisin, south, cheap) \n", 1562 | "83 (r_cuisin, east, cheap) \n", 1563 | "84 (spanish, r_locat, cheap) \n", 1564 | "86 (r_cuisin, north, cheap) \n", 1565 | "89 (r_cuisin, west, cheap) \n", 1566 | "92 (r_cuisin, east, cheap) \n", 1567 | "96 (r_cuisin, west, cheap) \n", 1568 | "98 (r_cuisin, north, cheap) \n", 1569 | "99 (r_cuisin, north, cheap) \n", 1570 | ".. ... 
\n", 1571 | "337 (r_cuisin, east, cheap) \n", 1572 | "340 (r_cuisin, west, cheap) \n", 1573 | "343 (r_cuisin, north, cheap) \n", 1574 | "346 (r_cuisin, west, cheap) \n", 1575 | "350 (r_cuisin, south, cheap) \n", 1576 | "351 (r_cuisin, north, cheap) \n", 1577 | "352 (r_cuisin, north, cheap) \n", 1578 | "353 (r_cuisin, south, cheap) \n", 1579 | "354 (r_cuisin, r_locat, cheap) \n", 1580 | "356 (r_cuisin, west, cheap) \n", 1581 | "358 (r_cuisin, west, cheap) \n", 1582 | "360 (r_cuisin, north, cheap) \n", 1583 | "365 (r_cuisin, east, cheap) \n", 1584 | "366 (r_cuisin, east, cheap) \n", 1585 | "369 (r_cuisin, east, cheap) \n", 1586 | "371 (r_cuisin, east, cheap) \n", 1587 | "372 (r_cuisin, east, cheap) \n", 1588 | "374 (r_cuisin, east, cheap) \n", 1589 | "377 (mediterranean, r_locat, cheap) \n", 1590 | "378 (r_cuisin, south, cheap) \n", 1591 | "384 (vietnames, r_locat, cheap) \n", 1592 | "389 (r_cuisin, east, cheap) \n", 1593 | "393 (r_cuisin, east, cheap) \n", 1594 | "396 (r_cuisin, east, cheap) \n", 1595 | "398 (r_cuisin, east, cheap) \n", 1596 | "400 (r_cuisin, east, cheap) \n", 1597 | "406 (r_cuisin, east, cheap) \n", 1598 | "410 (r_cuisin, east, cheap) \n", 1599 | "412 (r_cuisin, east, cheap) \n", 1600 | "413 (r_cuisin, east, cheap) \n", 1601 | "\n", 1602 | " slots \\\n", 1603 | "1 [(r_price, cheap), (r_locat, north)] \n", 1604 | "2 [(r_price, cheap), (r_locat, south)] \n", 1605 | "3 [(r_price, cheap), (r_cuisin, indian)] \n", 1606 | "7 [(r_price, cheap), (r_locat, north)] \n", 1607 | "10 [(r_price, cheap)] \n", 1608 | "12 [(r_price, cheap), (r_locat, west)] \n", 1609 | "14 [(r_price, cheap), (r_locat, east)] \n", 1610 | "15 [(r_price, cheap), (r_cuisin, intern)] \n", 1611 | "17 [(r_price, cheap), (r_locat, south)] \n", 1612 | "20 [(r_price, cheap), (r_locat, north)] \n", 1613 | "33 [(r_price, cheap), (r_locat, south)] \n", 1614 | "40 [(r_price, cheap), (r_locat, south)] \n", 1615 | "44 [(r_price, cheap), (r_locat, south)] \n", 1616 | "46 [(r_price, cheap), 
(r_locat, east)] \n", 1617 | "57 [(r_price, cheap), (r_locat, south)] \n", 1618 | "61 [(r_price, cheap), (r_locat, west)] \n", 1619 | "62 [(r_price, cheap), (r_locat, south)] \n", 1620 | "68 [(r_price, cheap), (r_locat, north)] \n", 1621 | "71 [(r_price, cheap), (r_locat, north)] \n", 1622 | "72 [(r_price, cheap), (r_locat, west)] \n", 1623 | "76 [(r_price, cheap), (r_locat, west)] \n", 1624 | "82 [(r_price, cheap), (r_locat, south)] \n", 1625 | "83 [(r_price, cheap), (r_locat, east)] \n", 1626 | "84 [(r_price, cheap), (r_cuisin, spanish)] \n", 1627 | "86 [(r_price, cheap), (r_locat, north)] \n", 1628 | "89 [(r_price, cheap), (r_locat, west)] \n", 1629 | "92 [(r_price, cheap), (r_locat, east)] \n", 1630 | "96 [(r_price, cheap), (r_locat, west)] \n", 1631 | "98 [(r_price, cheap), (r_locat, north)] \n", 1632 | "99 [(r_price, cheap), (r_locat, north)] \n", 1633 | ".. ... \n", 1634 | "337 [(r_price, cheap), (r_locat, east)] \n", 1635 | "340 [(r_price, cheap), (r_locat, west)] \n", 1636 | "343 [(r_price, cheap), (r_locat, north)] \n", 1637 | "346 [(r_price, cheap), (r_locat, west)] \n", 1638 | "350 [(r_price, cheap), (r_locat, south)] \n", 1639 | "351 [(r_price, cheap), (r_locat, north)] \n", 1640 | "352 [(r_price, cheap), (r_locat, north)] \n", 1641 | "353 [(r_price, cheap), (r_locat, south)] \n", 1642 | "354 [(r_price, cheap), (r_locat, south)] \n", 1643 | "356 [(r_price, cheap), (r_locat, west)] \n", 1644 | "358 [(r_price, cheap), (r_locat, west)] \n", 1645 | "360 [(r_price, cheap), (r_locat, north)] \n", 1646 | "365 [(r_price, cheap), (r_locat, east)] \n", 1647 | "366 [(r_price, cheap), (r_locat, east)] \n", 1648 | "369 [(r_price, cheap), (r_locat, east)] \n", 1649 | "371 [(r_price, cheap), (r_locat, east)] \n", 1650 | "372 [(r_price, cheap), (r_locat, east)] \n", 1651 | "374 [(r_price, cheap), (r_locat, east)] \n", 1652 | "377 [(r_price, cheap), (r_cuisin, mediterranean)] \n", 1653 | "378 [(r_price, cheap), (r_locat, south)] \n", 1654 | "384 [(r_price, cheap), 
(r_cuisin, vietnames)] \n", 1655 | "389 [(r_price, cheap), (r_locat, east)] \n", 1656 | "393 [(r_price, cheap), (r_locat, east)] \n", 1657 | "396 [(r_price, cheap), (r_locat, east)] \n", 1658 | "398 [(r_price, cheap), (r_locat, east)] \n", 1659 | "400 [(r_price, cheap), (r_locat, east)] \n", 1660 | "406 [(r_price, cheap), (r_locat, east)] \n", 1661 | "410 [(r_price, cheap), (r_locat, east)] \n", 1662 | "412 [(r_price, cheap), (r_locat, east)] \n", 1663 | "413 [(r_price, cheap), (r_locat, east)] \n", 1664 | "\n", 1665 | " predicted \n", 1666 | "1 (r_cuisin, north, cheap) \n", 1667 | "2 (r_cuisin, south, cheap) \n", 1668 | "3 (indian, r_locat, cheap) \n", 1669 | "7 (r_cuisin, north, cheap) \n", 1670 | "10 (r_cuisin, r_locat, cheap) \n", 1671 | "12 (r_cuisin, west, cheap) \n", 1672 | "14 (r_cuisin, east, cheap) \n", 1673 | "15 (intern, r_locat, cheap) \n", 1674 | "17 (r_cuisin, south, cheap) \n", 1675 | "20 (r_cuisin, north, cheap) \n", 1676 | "33 (r_cuisin, south, cheap) \n", 1677 | "40 (r_cuisin, south, cheap) \n", 1678 | "44 (r_cuisin, south, cheap) \n", 1679 | "46 (r_cuisin, east, cheap) \n", 1680 | "57 (r_cuisin, south, cheap) \n", 1681 | "61 (r_cuisin, west, cheap) \n", 1682 | "62 (r_cuisin, south, cheap) \n", 1683 | "68 (r_cuisin, north, cheap) \n", 1684 | "71 (r_cuisin, north, cheap) \n", 1685 | "72 (r_cuisin, west, cheap) \n", 1686 | "76 (r_cuisin, west, cheap) \n", 1687 | "82 (r_cuisin, south, cheap) \n", 1688 | "83 (r_cuisin, east, cheap) \n", 1689 | "84 (spanish, r_locat, cheap) \n", 1690 | "86 (r_cuisin, north, cheap) \n", 1691 | "89 (r_cuisin, west, cheap) \n", 1692 | "92 (r_cuisin, east, cheap) \n", 1693 | "96 (r_cuisin, west, cheap) \n", 1694 | "98 (r_cuisin, north, cheap) \n", 1695 | "99 (r_cuisin, north, cheap) \n", 1696 | ".. ... 
\n", 1697 | "337 (r_cuisin, east, cheap) \n", 1698 | "340 (r_cuisin, west, cheap) \n", 1699 | "343 (r_cuisin, north, cheap) \n", 1700 | "346 (r_cuisin, west, cheap) \n", 1701 | "350 (r_cuisin, south, cheap) \n", 1702 | "351 (r_cuisin, north, cheap) \n", 1703 | "352 (r_cuisin, north, cheap) \n", 1704 | "353 (r_cuisin, south, cheap) \n", 1705 | "354 (r_cuisin, south, cheap) \n", 1706 | "356 (r_cuisin, west, cheap) \n", 1707 | "358 (r_cuisin, west, cheap) \n", 1708 | "360 (r_cuisin, north, cheap) \n", 1709 | "365 (r_cuisin, east, cheap) \n", 1710 | "366 (r_cuisin, east, cheap) \n", 1711 | "369 (r_cuisin, east, cheap) \n", 1712 | "371 (r_cuisin, east, cheap) \n", 1713 | "372 (r_cuisin, east, cheap) \n", 1714 | "374 (r_cuisin, east, cheap) \n", 1715 | "377 (mediterranean, r_locat, cheap) \n", 1716 | "378 (r_cuisin, south, cheap) \n", 1717 | "384 (vietnames, r_locat, cheap) \n", 1718 | "389 (r_cuisin, east, cheap) \n", 1719 | "393 (r_cuisin, east, cheap) \n", 1720 | "396 (r_cuisin, east, cheap) \n", 1721 | "398 (r_cuisin, east, cheap) \n", 1722 | "400 (r_cuisin, east, cheap) \n", 1723 | "406 (r_cuisin, east, cheap) \n", 1724 | "410 (r_cuisin, east, cheap) \n", 1725 | "412 (r_cuisin, east, cheap) \n", 1726 | "413 (r_cuisin, east, cheap) \n", 1727 | "\n", 1728 | "[145 rows x 4 columns]" 1729 | ] 1730 | }, 1731 | "execution_count": 207, 1732 | "metadata": {}, 1733 | "output_type": "execute_result" 1734 | } 1735 | ], 1736 | "source": [ 1737 | "test[test.text.map(lambda s: 'cheap' in s)]" 1738 | ] 1739 | }, 1740 | { 1741 | "cell_type": "code", 1742 | "execution_count": 202, 1743 | "metadata": { 1744 | "collapsed": false 1745 | }, 1746 | "outputs": [ 1747 | { 1748 | "data": { 1749 | "text/plain": [ 1750 | "['id',\n", 1751 | " 'like',\n", 1752 | " 'a',\n", 1753 | " 'cheap',\n", 1754 | " 'restaur',\n", 1755 | " 'in',\n", 1756 | " 'the',\n", 1757 | " 'south',\n", 1758 | " 'part',\n", 1759 | " 'of',\n", 1760 | " 'town']" 1761 | ] 1762 | }, 1763 | "execution_count": 202, 1764 | 
"metadata": {}, 1765 | "output_type": "execute_result" 1766 | } 1767 | ], 1768 | "source": [ 1769 | "test[test.text.map(lambda s: 'south' in s)]['text'][284]" 1770 | ] 1771 | }, 1772 | { 1773 | "cell_type": "code", 1774 | "execution_count": null, 1775 | "metadata": { 1776 | "collapsed": true 1777 | }, 1778 | "outputs": [], 1779 | "source": [] 1780 | } 1781 | ], 1782 | "metadata": { 1783 | "kernelspec": { 1784 | "display_name": "Python 3", 1785 | "language": "python", 1786 | "name": "python3" 1787 | }, 1788 | "language_info": { 1789 | "codemirror_mode": { 1790 | "name": "ipython", 1791 | "version": 3 1792 | }, 1793 | "file_extension": ".py", 1794 | "mimetype": "text/x-python", 1795 | "name": "python", 1796 | "nbconvert_exporter": "python", 1797 | "pygments_lexer": "ipython3", 1798 | "version": "3.6.0" 1799 | } 1800 | }, 1801 | "nbformat": 4, 1802 | "nbformat_minor": 2 1803 | } 1804 | -------------------------------------------------------------------------------- /notebooks/resources/restaurants_props.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hyperparticle/graph-nlu/2aa7ef3ce67e4dadd5d1b89b9d7bf40d3d53d9fc/notebooks/resources/restaurants_props.pkl -------------------------------------------------------------------------------- /notebooks/resources/utts_refs.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hyperparticle/graph-nlu/2aa7ef3ce67e4dadd5d1b89b9d7bf40d3d53d9fc/notebooks/resources/utts_refs.pkl -------------------------------------------------------------------------------- /notebooks/screenshots/dialog-system.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hyperparticle/graph-nlu/2aa7ef3ce67e4dadd5d1b89b9d7bf40d3d53d9fc/notebooks/screenshots/dialog-system.png -------------------------------------------------------------------------------- 
/notebooks/screenshots/global-and-local-list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hyperparticle/graph-nlu/2aa7ef3ce67e4dadd5d1b89b9d7bf40d3d53d9fc/notebooks/screenshots/global-and-local-list.png -------------------------------------------------------------------------------- /notebooks/screenshots/local-list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hyperparticle/graph-nlu/2aa7ef3ce67e4dadd5d1b89b9d7bf40d3d53d9fc/notebooks/screenshots/local-list.png -------------------------------------------------------------------------------- /notebooks/screenshots/mary-john-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hyperparticle/graph-nlu/2aa7ef3ce67e4dadd5d1b89b9d7bf40d3d53d9fc/notebooks/screenshots/mary-john-example.png -------------------------------------------------------------------------------- /notebooks/screenshots/prezzo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hyperparticle/graph-nlu/2aa7ef3ce67e4dadd5d1b89b9d7bf40d3d53d9fc/notebooks/screenshots/prezzo.png -------------------------------------------------------------------------------- /notebooks/screenshots/qa2-multiple-list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hyperparticle/graph-nlu/2aa7ef3ce67e4dadd5d1b89b9d7bf40d3d53d9fc/notebooks/screenshots/qa2-multiple-list.png -------------------------------------------------------------------------------- /notebooks/screenshots/simple-relation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hyperparticle/graph-nlu/2aa7ef3ce67e4dadd5d1b89b9d7bf40d3d53d9fc/notebooks/screenshots/simple-relation.png 
-------------------------------------------------------------------------------- /notebooks/screenshots/state-graph-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hyperparticle/graph-nlu/2aa7ef3ce67e4dadd5d1b89b9d7bf40d3d53d9fc/notebooks/screenshots/state-graph-1.png -------------------------------------------------------------------------------- /notebooks/screenshots/state-graph-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hyperparticle/graph-nlu/2aa7ef3ce67e4dadd5d1b89b9d7bf40d3d53d9fc/notebooks/screenshots/state-graph-2.png -------------------------------------------------------------------------------- /notebooks/screenshots/v4-mary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hyperparticle/graph-nlu/2aa7ef3ce67e4dadd5d1b89b9d7bf40d3d53d9fc/notebooks/screenshots/v4-mary.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ipython 2 | jupyter 3 | neo4j-driver 4 | numpy 5 | nltk 6 | pandas 7 | scipy 8 | scikit-learn --------------------------------------------------------------------------------