├── src ├── __init__.py ├── util.py ├── glove.py ├── data_objects_turked.py ├── trained_factors.py ├── data_turked.py ├── main.py ├── settings.py └── data.py ├── docs ├── _config.yml ├── max_thumb.jpeg ├── yejin_thumb.jpg ├── thumb-all-resized.png ├── css │ └── default.css ├── data │ └── config │ │ └── default.json ├── todo.md ├── index.md └── factorgraph-viz.js ├── lib └── ngramdb │ ├── ngramdb │ ├── __init__.py │ ├── util.py │ ├── ngramtoken.py │ ├── constants.py │ └── ngramdb.py │ └── setup.py ├── factorgraph-viz.png ├── data └── verbphysics │ ├── action-frames │ ├── train-5 │ │ ├── train.txt │ │ ├── dev.txt │ │ └── test.txt │ └── train-20 │ │ ├── train.txt │ │ ├── dev.txt │ │ └── test.txt │ └── objects │ ├── train-5 │ └── train.csv │ └── train-20 │ └── train.csv ├── requirements.txt ├── .travis.yml ├── .gitignore ├── LICENSE.txt ├── scripts └── data.sh └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /lib/ngramdb/ngramdb/__init__.py: -------------------------------------------------------------------------------- 1 | from ngramdb import NgramDb 2 | -------------------------------------------------------------------------------- /docs/max_thumb.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwnlp/verbphysics/HEAD/docs/max_thumb.jpeg -------------------------------------------------------------------------------- /docs/yejin_thumb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwnlp/verbphysics/HEAD/docs/yejin_thumb.jpg -------------------------------------------------------------------------------- /factorgraph-viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwnlp/verbphysics/HEAD/factorgraph-viz.png -------------------------------------------------------------------------------- /data/verbphysics/action-frames/train-5/train.txt: -------------------------------------------------------------------------------- 1 | took 2 | grew 3 | washed 4 | trimmed 5 | made 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | factorgraph 2 | numpy 3 | pandas 4 | nltk 5 | tqdm 6 | myria-python 7 | tabulate 8 | -------------------------------------------------------------------------------- /docs/thumb-all-resized.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwnlp/verbphysics/HEAD/docs/thumb-all-resized.png -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | language: python 3 | python: 4 | - "2.7" 5 | install: 6 | - pip install -r requirements.txt 7 | - pip install lib/ngramdb/ 8 | - ./scripts/data.sh 9 | script: 10 | - python -m src.main 11 | -------------------------------------------------------------------------------- /data/verbphysics/action-frames/train-20/train.txt: 
-------------------------------------------------------------------------------- 1 | took 2 | grew 3 | washed 4 | trimmed 5 | made 6 | got 7 | looked 8 | wrote 9 | entered 10 | kept 11 | lived 12 | played 13 | placed 14 | served 15 | arrived 16 | stopped 17 | changed 18 | accepted 19 | cast 20 | developed 21 | -------------------------------------------------------------------------------- /data/verbphysics/action-frames/train-20/dev.txt: -------------------------------------------------------------------------------- 1 | threw 2 | drank 3 | swung 4 | conquered 5 | towed 6 | snipped 7 | saw 8 | put 9 | received 10 | turned 11 | stood 12 | opened 13 | passed 14 | set 15 | wore 16 | raised 17 | died 18 | caught 19 | worked 20 | led 21 | formed 22 | moved 23 | obtained 24 | added 25 | lifted 26 | contained 27 | gained 28 | drove 29 | covered 30 | touched 31 | -------------------------------------------------------------------------------- /src/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | This ended up being smaller than I expected. 3 | 4 | author: mbforbes 5 | """ 6 | 7 | import os 8 | 9 | 10 | def ensure_dir(directory): 11 | ''' 12 | Makes directory and all needed parent dirs if it doesn't exist. 13 | 14 | Args: 15 | directory (str) 16 | ''' 17 | if not os.path.isdir(directory): 18 | os.makedirs(directory) 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generic python crap 2 | *.pyc 3 | 4 | # Data that should be retrieved when setting up repository. 5 | # See scripts/data.sh. 6 | data/ngramdb/ 7 | data/glove/ 8 | data/emb/ 9 | 10 | # Every time the system is run it logs files and produces diagnostic output of 11 | # its decisions. 12 | log/ 13 | output/ 14 | 15 | # Visualization data gets dumped to this directory. 
16 | viz/ 17 | -------------------------------------------------------------------------------- /lib/ngramdb/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='ngramdb', 4 | version='0.1.1', 5 | description='Provides access to the Myria DB of syntactic n-grams.', 6 | author='lzilles', 7 | author_email='lzilles@cs.washington.edu', 8 | packages=['ngramdb'], 9 | dependency_links=[ 10 | 'https://github.com/uwescience/myria-python/archive/master.zip'], 11 | # install_requires=['myria-python'], 12 | zip_safe=False) 13 | -------------------------------------------------------------------------------- /data/verbphysics/action-frames/train-5/dev.txt: -------------------------------------------------------------------------------- 1 | got 2 | looked 3 | wrote 4 | entered 5 | kept 6 | lived 7 | played 8 | placed 9 | served 10 | arrived 11 | stopped 12 | changed 13 | accepted 14 | cast 15 | developed 16 | threw 17 | drank 18 | swung 19 | conquered 20 | towed 21 | snipped 22 | saw 23 | put 24 | received 25 | turned 26 | stood 27 | opened 28 | passed 29 | set 30 | wore 31 | raised 32 | died 33 | caught 34 | worked 35 | led 36 | formed 37 | moved 38 | obtained 39 | added 40 | lifted 41 | contained 42 | gained 43 | drove 44 | covered 45 | touched 46 | -------------------------------------------------------------------------------- /data/verbphysics/action-frames/train-5/test.txt: -------------------------------------------------------------------------------- 1 | walked 2 | dropped 3 | cut 4 | gazed 5 | jumped 6 | dug 7 | scaled 8 | toppled 9 | hiked 10 | squashed 11 | gave 12 | found 13 | went 14 | came 15 | heard 16 | held 17 | reached 18 | sat 19 | began 20 | sent 21 | showed 22 | fell 23 | used 24 | shook 25 | drew 26 | carried 27 | called 28 | ran 29 | laid 30 | followed 31 | remained 32 | returned 33 | appeared 34 | rose 35 | pulled 36 | broke 37 | produced 38 | bought 39 | expressed 40 | crossed 41 | struck 42 | picked 43 | won 44 | filled 45 | built 46 | pushed 47 | ordered 48 | poured 49 | waited 50 | ate 51 | -------------------------------------------------------------------------------- /data/verbphysics/action-frames/train-20/test.txt: -------------------------------------------------------------------------------- 1 | walked 2 | dropped 3 | cut 4 | gazed 5 | jumped 6 | dug 7 | scaled 8 | toppled 9 | hiked 10 | squashed 11 | gave 12 | found 13 | went 14 | came 15 | heard 16 | held 17 | reached 18 | sat 19 | began 20 | sent 21 | showed 22 | fell 23 | used 24 | shook 25 | drew 26 | carried 27 | called 28 | ran 29 | laid 30 | followed 31 | remained 32 | returned 33 | appeared 34 | rose 35 | pulled 36 | broke 37 | produced 38 | bought 39 | expressed 40 | crossed 41 | struck 42 | picked 43 | won 44 | filled 45 | built 46 | pushed 47 | ordered 48 | poured 49 | waited 50 | ate 51 | -------------------------------------------------------------------------------- /docs/css/default.css: -------------------------------------------------------------------------------- 1 | .links line { 2 | /*stroke: #999;*/ 3 | stroke-opacity: 0.6; 4 | } 5 | 6 | .nodes circle { 7 | stroke: #aaa; 8 | stroke-width: 1px; 9 | } 10 | 11 | .facs rect { 12 | stroke: #aaa; 13 | stroke-width: 1px; 14 | } 15 | 16 | .rvtext { 17 | font-size: 1em; 18 | } 19 | 20 | .factext { 21 | font-size: 0.7em; 22 | } 23 | 24 | #suggestionNotice { 25 | font-weight: bold; 26 | visibility: hidden; 27 | } 28 | 29 | button.suggestion { 30 | background-color: 
lightgray; 31 | margin: 5px; 32 | padding: 5px; 33 | border: none; 34 | border-radius: 8px; 35 | } 36 | 37 | button.suggestion:hover { 38 | background-color: #159957; 39 | color: white; 40 | cursor: pointer; 41 | } 42 | 43 | svg { 44 | border: 1px solid #159957; 45 | } 46 | 47 | 48 | p.limited { 49 | color: slategray; 50 | font-style: italic; 51 | display: inline-block; 52 | padding: 4px; 53 | } -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Maxwell Forbes 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/data/config/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "https://cdn.rawgit.com/mbforbes/verbphysics-examples/f30d8eb1387208aaefff3c0b06f50a3cd8b20800/action-frames/", 3 | "data_filenames": "data/options/example-list.json", 4 | "startup_filename": "size-threw_dp_into", 5 | "autocomplete_limit": 50, 6 | "display_prefix": "Action frame: ", 7 | "size": { 8 | "rv": 8, 9 | "factor": 8 10 | }, 11 | "position": { 12 | "leftScale": 0.0, 13 | "leftSubtype": "frame", 14 | "leftStrength": 0.7, 15 | "centerScale": 0.33, 16 | "rightScale": 0.84, 17 | "rightSubtype": "noun", 18 | "rightStrength": 0.7, 19 | "upScale": 0.0, 20 | "upSubtype": "seed", 21 | "upStrength": 0.7, 22 | "downScale": 1.0, 23 | "downSubtype": "xfactor", 24 | "downStrength": 0.7, 25 | "middleStrength": 0.1 26 | }, 27 | "color": { 28 | "none": "whitesmoke", 29 | "unsureColor": "lightslategray", 30 | "unsureCutoff": 0.4, 31 | "values": [ 32 | "tomato", 33 | "royalblue", 34 | "lightslategray" 35 | ] 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /docs/todo.md: -------------------------------------------------------------------------------- 1 | # todo 2 | 3 | ## other repos 4 | - [x] factor graph 5 | - [x] actual code (cleanup) 6 | - [x] store data somewhere programmatically accessible 7 | 8 | ## github.io page 9 | - [x] brief description 10 | - [ ] demo 11 | - [x] live loading of new graphs 12 | - [x] user input w/ autocomplete suggestions 13 | - [x] clickable suggestions 14 | - [ ] generate data (FGs and options file) 15 | - [ ] ensure demo works on project (github.io) page 16 | - [ ] preload one example 17 
| - [ ] suggestions above input box of things to type 18 | - [ ] table below of what the different frame types mean (prefix, example 19 | name, frame type, example frame) 20 | - [x] abstract 21 | - [x] paper vis -> link to paper 22 | - [x] author pics + links 23 | - [x] bibtex 24 | - [x] data (instructions, download links) 25 | - [x] code link / notice for detailed instructions 26 | - [x] acknowledgements (incl stanford vision lab for project page inspiration) 27 | 28 | ## readme 29 | 30 | - [x] badges 31 | - [x] overview 32 | - [x] link to github.io page 33 | - [x] installation (code, aux data) 34 | - [x] running 35 | - [x] data 36 | - [x] viz (desc + picture) 37 | - [x] see also for py-factorgraph 38 | -------------------------------------------------------------------------------- /scripts/data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # verbphysics 5 | # 6 | # Data retrieval script. 7 | # 8 | # author: mbforbes 9 | # 10 | 11 | # Get and extract ngramdb (cached query and pmi) data 12 | mkdir data/ngramdb/ 13 | cd data/ngramdb/ 14 | curl https://storage.googleapis.com/ai2-mosaic-public/projects/verb-physics/ngramdb-cache.tar.gz > ngramdb-cache.tar.gz 15 | tar -xzf ngramdb-cache.tar.gz 16 | rm ngramdb-cache.tar.gz 17 | cd ../.. 18 | 19 | # Get and convert GloVe (word embedding) data 20 | mkdir data/glove/ 21 | curl https://nlp.stanford.edu/data/wordvecs/glove.6B.zip > data/glove/glove.6B.zip 22 | unzip data/glove/glove.6B.zip -d data/glove/ 23 | python src/glove.py 24 | cd data/glove/ 25 | rm glove.6B.100d.txt 26 | rm glove.6B.200d.txt 27 | rm glove.6B.300d.txt 28 | rm glove.6B.50d.txt 29 | rm glove.6B.zip 30 | cd ../.. 31 | 32 | # Get embedding-trained unary factor weights 33 | mkdir data/emb/ 34 | cd data/emb/ 35 | curl https://storage.googleapis.com/ai2-mosaic-public/projects/verb-physics/emb-trained-weights.tar.gz > emb-trained-weights.tar.gz 36 | tar -xzf emb-trained-weights.tar.gz 37 | rm emb-trained-weights.tar.gz 38 | cd ../.. 
39 | 
40 | # Get wordnet data for NLTK
41 | python -m nltk.downloader wordnet
--------------------------------------------------------------------------------
/lib/ngramdb/ngramdb/util.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from collections import Counter
3 | 
4 | 
5 | def aliased_relation(relation, alias):
6 |     return ' '.join((relation, alias))
7 | 
8 | 
9 | def make_predicate(relation_alias, attribute, value, relationship='='):
10 |     return "{}.{}{}{}".format(relation_alias, attribute, relationship, value)
11 | 
12 | 
13 | def collapse_ngram_surface(ngrams):
14 |     # Ngram.surface is a property, not a method, so it is accessed without
15 |     # parentheses.
16 |     return sorted((
17 |         (k, sum(x[1] for x in g))
18 |         for k, g in itertools.groupby((
19 |             (n.surface, n.freq)
20 |             for n in sorted(
21 |                 ngrams,
22 |                 key=lambda x: x.surface)),
23 |             lambda x: x[0])
24 |     ), key=lambda x: x[1], reverse=True)
25 | 
26 | 
27 | def pprint_ngram_list(ngram_list):
28 |     surface_width = max(
29 |         max(len(t.surface) for n in ngram_list for t in n) + 1, 6)
30 | 
31 |     ngram_format_str = "ID: {}\tFreq: {}\tHeight: {}"
32 |     token_format_str = \
33 |         "{0:>2}\t{1:<" + str(surface_width) + "}\t{2:<5}\t{3}{4}"
34 | 
35 |     all_lines = []
36 | 
37 |     for n in ngram_list:
38 |         try:
39 |             all_lines.append(
40 |                 ngram_format_str.format(n.nid, n.freq, n.height)
41 |             )
42 |         except AttributeError:
43 |             all_lines.append(
44 |                 ngram_format_str.format(n.nid, n.freq, "")
45 |             )
46 | 
47 |         all_lines += [
48 |             token_format_str.format(
49 |                 t.position,
50 |                 t.surface,
51 |                 t.postag,
52 |                 t.deprel,
53 |                 '-' + str(t.headposition) if t.headposition > -1 else '')
54 |             for t in n]
55 | 
56 |         all_lines.append('')
57 | 
58 |     # return '\n'.join(all_lines)
59 |     print('\n'.join(all_lines))
60 | 
61 | 
62 | def collapsed_histogram(kv_list):
63 |     c = Counter()
64 |     for k, v in kv_list:
65 |         c[k] += v
66 |     return c
67 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # verbphysics
2 | 
3 | [![Build Status](https://travis-ci.org/uwnlp/verbphysics.svg?branch=master)](https://travis-ci.org/uwnlp/verbphysics)
4 | [![license MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/uwnlp/verbphysics/blob/master/LICENSE.txt)
5 | 
6 | ## About
7 | 
8 | This repository contains the data and reference implementation for the paper
9 | 
10 | **Verb Physics: Relative Physical Knowledge of Actions and Objects**
11 | Maxwell Forbes and Yejin Choi
12 | _ACL 2017_
13 | 
14 | See the [Verb Physics project page](https://uwnlp.github.io/verbphysics/) for
15 | more details (model visualization, paper link, bibtex citation).
16 | 
17 | ## Installation
18 | 
19 | The code is written in Python 2.7. We recommend a fresh virtualenv.
20 | 
21 | ```sh
22 | # Install the required python libraries
23 | pip install -r requirements.txt
24 | 
25 | # Install the locally-packaged `ngramdb` library (written by Li Zilles).
26 | pip install lib/ngramdb/
27 | 
28 | # Download the data (cached ngramdb data; GloVe embeddings; trained factor
29 | # weights; NLTK data).
30 | ./scripts/data.sh
31 | ```
32 | 
33 | Our [Travis-CI
34 | script](https://github.com/uwnlp/verbphysics/blob/master/.travis.yml) validates
35 | the above installation instructions by running them on a fresh machine after
36 | every code modification.
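As a quick sanity check, the short sketch below (not part of the original
setup; the paths come from `scripts/data.sh` and the repository itself)
confirms that the downloaded data landed where the code expects it:

```py
import os

# Directories created by scripts/data.sh, plus the data shipped in the repo.
for d in ['data/ngramdb', 'data/glove', 'data/emb', 'data/verbphysics']:
    print('%-20s %s' % (d, 'ok' if os.path.isdir(d) else 'MISSING'))
```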
37 | 
38 | ## Running
39 | 
40 | By default, the code is set up to run a particular model from the paper (**our
41 | model (A)**):
42 | 
43 | ```sh
44 | python -m src.main
45 | ```
46 | 
47 | You can view all of the default configurations by running with `--help`:
48 | 
49 | ```
50 | python -m src.main --help
51 | usage: main.py [-h] [--config CONFIG] [--poly POLY] [--viz]
52 | 
53 | verbphysics reference implementation
54 | 
55 | optional arguments:
56 |   -h, --help       show this help message and exit
57 |   --config CONFIG  hyperparameter configuration to use; options: model_a |
58 |                    playing | model_b_objpairs | model_b_frames (default:
59 |                    model_a
60 |   --poly POLY      Whether to try polynomially-many hyperparameter config
61 |                    combinations (True, default) or vary config dimension
62 |                    sequentially (False).
63 |   --viz            Whether to dump model / data to JSON for visualization
64 |                    (default False).
65 | ```
66 | 
67 | Settings (hyperparameter) configurations are found in `src/settings.py`. You
68 | can modify the `playing` dictionary found in `src/main.py` with your own
69 | configuration and run the custom model using `--config=playing`.
70 | 
71 | ## Data
72 | 
73 | The `verbphysics` data is found under `data/verbphysics/`.
74 | 
75 | ### Task setup as in the ACL 2017 paper
76 | 
77 | When predicting action frames, only the 5% action frame data (the `train-5`
78 | split) should be used. Either 5% (our model A) or 20% object pair data (our
79 | model B) may be used to assist in action frame prediction.
80 | 
81 | When predicting object pairs, only the 5% object pair data should be used.
82 | Either 5% (our model A) or 20% action frame data (our model B) may be used to
83 | assist in object pair prediction.
84 | 
85 | ### Attribute names in code
86 | 
87 | For legacy reasons, the code has different names for some attributes. The actual
88 | data (i.e., the questions asked to Mechanical Turk workers) uses the attribute
89 | names reported in the paper.
90 | 
91 | attribute | name in code
92 | --- | ---
93 | `size` | `size`
94 | `weight` | `weight`
95 | `strength` | `hardness`
96 | `rigidness` | `rigidness`
97 | `speed` | `verb-speed`
98 | 
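Here is a minimal sketch (not from the original README; it assumes you run it
from the repository root after installation) of loading the 5% object-pair
training annotations with the loader in `src/data_objects_turked.py`. Majority
labels are `1` (`>`), `0` (`=`), and `-1` (`<`); rows whose majority answer
was "unknown" are dropped when `remove_unk=True`:

```py
from src.data_objects_turked import DataObjectsTurked, DIR_5

# [[obj1, obj2, majority label]], keeping only pairs where all 3 annotators
# agreed on the 'size' attribute. The loader maps the lowercased 'person'
# token in the data back to the original 'PERSON' token.
pairs = DataObjectsTurked.load_raw(
    'train', 'size', agreement_needed=3, remove_unk=True, directory=DIR_5)
print(pairs[0])  # e.g. ['PERSON', 'step', 1]
```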
99 | ## Visualization
100 | 
101 | You can use [`factorgraph-viz`](https://github.com/mbforbes/factorgraph-viz) to
102 | visualize `verbphysics` factor graph models interactively in your web browser.
103 | To produce visualization data, add the command line argument `--viz`.
104 | 
105 | The [Verb Physics project page](https://uwnlp.github.io/verbphysics/) has a
106 | live demo of this running.
107 | 
108 | ![An example rendering of a factor graph using the factorgraph-viz library](factorgraph-viz.png)
109 | 
110 | ## See also
111 | 
112 | The [`py-factorgraph`](https://github.com/mbforbes/py-factorgraph) library
113 | provides the underlying factor graph implementation.
114 | 
--------------------------------------------------------------------------------
/src/glove.py:
--------------------------------------------------------------------------------
1 | '''
2 | Get GloVe distance between words.
3 | 
4 | This code is adapted from
5 | https://github.com/stanfordnlp/GloVe/blob/master/eval/python/distance.py
6 | '''
7 | 
8 | # IMPORTS
9 | # -----------------------------------------------------------------------------
10 | 
11 | # Builtins
12 | import cPickle as pickle
13 | 
14 | # 3rd party
15 | import numpy as np
16 | 
17 | 
18 | # CONSTANTS
19 | # -----------------------------------------------------------------------------
20 | 
21 | # 300d constants
22 | MODEL_FILE_300D = 'data/glove/glove.6B.300d.txt'
23 | MODEL_FILE_300D_NORM = 'data/glove/glove.6B.300d-weights-norm.npy'
24 | VOCAB_FILE = 'data/glove/glove.6B.vocab.pickle'
25 | 
26 | 
27 | # CLASSES
28 | # -----------------------------------------------------------------------------
29 | 
30 | class Glove(object):
31 |     '''
32 |     Once loaded (g = Glove()), can get word vector using
33 | 
34 |         g.w[g.vocab[word], :]
35 | 
36 |     For example,
37 | 
38 |         g.w[g.vocab['fish'], :]
39 | 
40 |     This vector will have unit length (l2 norm).
41 |     '''
42 | 
43 |     def __init__(self, model_file_norm=MODEL_FILE_300D_NORM, vocab_file=VOCAB_FILE):
44 |         '''
45 |         Load normalized vectors and vocab from a cache. Use Glove.convert(...)
46 |         to create the cache files.
47 | 
48 |         Args:
49 |             model_file_norm (str)
50 |             vocab_file (str)
51 |         '''
52 | 
53 |         w_norm, vocab = self.load_npy(model_file_norm, vocab_file)
54 | 
55 |         # use
56 |         self.w = w_norm
57 |         self.vocab = vocab
58 | 
59 |     def load_npy(self, model_file_norm, vocab_file):
60 |         w_norm = np.load(model_file_norm)
61 |         with open(vocab_file, 'r') as f:
62 |             vocab = pickle.load(f)
63 |         return w_norm, vocab
64 | 
65 |     @staticmethod
66 |     def convert(in_model_file=MODEL_FILE_300D,
67 |                 out_model_file_norm=MODEL_FILE_300D_NORM,
68 |                 out_vocab_file=VOCAB_FILE):
69 |         '''
70 |         Takes a raw model file and saves to disk (a) a normalized model file
71 |         (the np.ndarray weight matrix w) and (b) a vocab file ({str: int})
72 |         that indexes the rows of w.
73 | 
74 |         Args:
75 |             in_model_file (str): Path to original (downloaded) GloVe file.
76 |             out_model_file_norm (str): Path to write normalized weights to.
77 |             out_vocab_file (str): Path to write vocab to.
78 |         '''
79 |         vocab, vectors = {}, {}
80 |         with open(in_model_file, 'r') as f:
81 |             i = 0
82 |             for line in f:
83 |                 vals = line.rstrip().split(' ')
84 |                 word = vals[0]
85 |                 vocab[word] = i
86 |                 vectors[word] = [float(x) for x in vals[1:]]
87 |                 i += 1
88 |         vocab_size = len(vocab)
89 |         vector_dim = len(vectors['the'])  # yay
90 |         w = np.zeros((vocab_size, vector_dim))
91 |         for word, v in vectors.iteritems():
92 |             if word == '':
93 |                 continue
94 |             w[vocab[word], :] = v
95 | 
96 |         # normalize each word vector to unit length (l2 norm)
97 |         w_norm = np.zeros(w.shape)
98 |         d = (np.sum(w ** 2, 1) ** (0.5))
99 |         w_norm = (w.T / d).T
100 | 
101 |         # save
102 |         np.save(out_model_file_norm, w_norm)
103 |         with open(out_vocab_file, 'w') as f:
104 |             pickle.dump(vocab, f)
105 | 
106 |         # NOTE: This left here for your convenience if you decide to adapt this
107 |         # code.
108 |         # return w_norm, vocab
109 | 
110 |     def distance(self, target, queries):
111 |         '''
112 |         Args:
113 |             target str
114 |             queries [str]
115 | 
116 |         Returns:
117 |             np.array of length len(queries): cosine similarity (1 = closest)
118 |             of each query word to target according to w; missing words score 0.0
119 |         '''
120 |         res = np.zeros(len(queries))
121 |         if target not in self.vocab:
122 |             return res
123 |         vec_result = self.w[self.vocab[target], :]  # indexes self.w; don't mutate!
124 | vec_norm = np.zeros(vec_result.shape) 125 | d = (np.sum(vec_result ** 2,) ** (0.5)) 126 | vec_norm = (vec_result.T / d).T 127 | dist = np.dot(self.w, vec_norm.T) 128 | 129 | # compute dist for each query 130 | for i, q in enumerate(queries): 131 | res[i] = dist[self.vocab[q]] if q in self.vocab else 0.0 132 | return res 133 | 134 | 135 | if __name__ == '__main__': 136 | Glove.convert() 137 | -------------------------------------------------------------------------------- /src/data_objects_turked.py: -------------------------------------------------------------------------------- 1 | """ 2 | Loading up the (processed) turked *OBJECT PAIR* data. 3 | 4 | author: mbforbes 5 | """ 6 | 7 | # IMPORTS 8 | # ----------------------------------------------------------------------------- 9 | 10 | # stdlib 11 | import logging 12 | import sys 13 | 14 | # 3rd party 15 | import pandas as pd 16 | 17 | 18 | # CONSTANTS 19 | # ----------------------------------------------------------------------------- 20 | 21 | DIR_5 = 'data/verbphysics/objects/train-5/' 22 | DIR_20 = 'data/verbphysics/objects/train-20/' 23 | 24 | # The attr names are inconsistent in my code. This translates from external 25 | # (e.g. AttrGraph) names to internal (turked object pairs) names. 26 | ATTR_TRANSLATION = { 27 | 'hardness': 'strength', 28 | 'verb-speed': 'speed', 29 | } 30 | 31 | # For internal sanity checking: the complete list of attributes. 32 | OUR_ATTRS = ['size', 'weight', 'strength', 'rigidness', 'speed'] 33 | 34 | # obj1 vs obj2, where vs is one of: 35 | LABEL_GREATER = 1 36 | LABEL_EQ = 0 37 | LABEL_LESSER = -1 38 | LABEL_UNK = -42 39 | 40 | PERSON_DATA = 'person' 41 | PERSON_TOKEN = 'PERSON' 42 | 43 | logger = logging.getLogger(__name__) 44 | 45 | 46 | # CLASSES 47 | # ----------------------------------------------------------------------------- 48 | 49 | class DataObjectsTurked(object): 50 | 51 | @staticmethod 52 | def load_raw(partition, attr_raw, agreement_needed, remove_unk=True, directory=DIR_5): 53 | """ 54 | Loads up partition, filtering out those with agreement < 55 | agreement_needed and those with majority of 'unk'. 56 | 57 | Args: 58 | partition (str): 'train'/'dev'/'test' 59 | attr_raw (str): 'size', 'weight', 'hardness' (mapped to 'strength'), 60 | 'rigidness', 'verb-speed' (mapped to 'speed') 61 | agreement_needed (int): 2 or 3 62 | directory (str): directory to load data from. use DIR_20 to use 20% 63 | of data, DIR_5 to use 5% 64 | 65 | Returns: 66 | [[str, str, int]]: [[obj1, obj2, majority label]] 67 | """ 68 | # translation, if needed 69 | attr = attr_raw if attr_raw not in ATTR_TRANSLATION else ATTR_TRANSLATION[attr_raw] 70 | if attr not in OUR_ATTRS: 71 | logger.error('Unknown attribute: "%s"' % (attr)) 72 | sys.exit(1) 73 | 74 | # load, filter, and transform to list 75 | fn = directory + partition + '.csv' 76 | df = pd.read_csv(fn) 77 | filtered = df[(df[attr + '-agree'] >= agreement_needed)] 78 | if remove_unk: 79 | filtered = filtered[(filtered[attr + '-maj'] != LABEL_UNK)] 80 | data = filtered[['obj1', 'obj2', attr + '-maj']] 81 | lst = data.values.tolist() 82 | 83 | # switch our lowercased person token to the original 84 | for l in lst: 85 | for i in [0, 1]: 86 | if l[i] == PERSON_DATA: 87 | l[i] = PERSON_TOKEN 88 | 89 | return lst 90 | 91 | @staticmethod 92 | def load(partition, attr_raw, agreement_needed, greater_pot, eq_pot, lesser_pot, split): 93 | """ 94 | Loads up partition, filtering out those with agreement < 95 | agreement_needed and those with majority of 'unk'. 
Then changes gold
96 |         labels to the provided potentials.
97 | 
98 |         Args:
99 |             partition (str): 'train'/'dev'/'test'
100 |             attr_raw (str): 'size', 'weight', 'hardness' (mapped to 'strength'),
101 |                 'rigidness', 'verb-speed' (mapped to 'speed')
102 |             agreement_needed (int): 2 or 3
103 |             greater_pot (np.ndarray: 1 x 3)
104 |             eq_pot (np.ndarray: 1 x 3)
105 |             lesser_pot (np.ndarray: 1 x 3)
106 |             split (int): 5 or 20 (which object pair data split to use)
107 | 
108 |         Returns:
109 |             [[str, str, np.ndarray]]: [[obj1, obj2, potential]]
110 |         """
111 |         if split == 5:
112 |             directory = DIR_5
113 |         elif split == 20:
114 |             directory = DIR_20
115 |         else:
116 |             logger.error('Unimplemented split: %r', split)
117 |             sys.exit(1)
118 |         lst = DataObjectsTurked.load_raw(partition, attr_raw, agreement_needed, True, directory)
119 | 
120 |         # create our own mini mapping for assigning potentials below
121 |         potmap = {
122 |             LABEL_GREATER: greater_pot,
123 |             LABEL_EQ: eq_pot,
124 |             LABEL_LESSER: lesser_pot,
125 |         }
126 | 
127 |         # replace each list's label with the corresponding passed potential
128 |         for l in lst:
129 |             l[-1] = potmap[l[-1]]
130 | 
131 |         return lst
132 | 
--------------------------------------------------------------------------------
/lib/ngramdb/ngramdb/ngramtoken.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from collections import defaultdict, Counter
3 | 
4 | 
5 | class Ngram(object):
6 |     def __init__(self, tokens,
7 |                  nid=None, freq=None):
8 |         self.tokens = tokens
9 |         self.headpositions = [t.headposition for t in self.tokens]
10 | 
11 |         try:
12 |             for t in self.tokens:
13 |                 if t.headposition > -1:
14 |                     t.head = self.tokens[t.headposition]
15 |                     t.head.children.append(t)
16 | 
17 |                 else:
18 |                     t.depth = 0
19 | 
20 |             for t in self.tokens:
21 |                 if t.depth is None:
22 |                     depth = 0
23 |                     current = t
24 |                     while current.depth != 0 and depth < len(self.tokens):
25 |                         depth += 1
26 |                         current = current.head
27 | 
28 |                     if depth >= len(self.tokens):
29 |                         raise IndexError
30 | 
31 |                     t.depth = depth
32 | 
33 |             self.height = max(t.depth for t in self.tokens if t is not None)
34 | 
35 |         except IndexError:
36 |             pass
37 | 
38 |         self.nid = nid
39 |         self.freq = freq
40 | 
41 |     @property
42 |     def postags(self):
43 |         return [t.postag for t in self.tokens]
44 | 
45 |     @property
46 |     def deprels(self):
47 |         return [t.deprel for t in self.tokens]
48 | 
49 |     @property
50 |     def words(self):
51 |         return [t.surface for t in self.tokens]
52 | 
53 |     def __repr__(self):
54 |         kwargs = ["=".join((k, repr(v))) for k, v in self.__dict__.items()
55 |                   if v is not None and k != "tokens"]
56 |         return "Ngram({}, {})".format(self.tokens, ', '.join(kwargs))
57 | 
58 |     def __str__(self):
59 |         return self.rawstring + " (freq: {})".format(self.freq)
60 | 
61 |     def __iter__(self):
62 |         return iter(self.tokens)
63 | 
64 |     def __len__(self):
65 |         return len(self.tokens)
66 | 
67 |     def __getitem__(self, key):
68 |         return self.tokens[key]
69 | 
70 |     def __setitem__(self, key, value):
71 |         raise TypeError("Can't replace Token in Ngram!")
72 | 
73 |     @property
74 |     def surface(self):
75 |         return ' '.join(w for w in self.words)
76 | 
77 |     @property
78 |     def rawstring(self):
79 |         return ' '.join(t.rawprint for t in self.tokens)
80 | 
81 | 
82 | class Token(object):
83 |     def __init__(self, surface,
84 |                  position=None, postag=None, deprel=None, headposition=None,
85 |                  freq=None):
86 | 
87 |         self.surface = surface
88 | 
89 |         self.position = position
90 | 
91 |         self.postag = postag
92 |         self.deprel = deprel
93 | 
94 |         self.headposition = headposition if headposition != -1 else None
95 | 
self.head = None 96 | 97 | self.children = [] 98 | 99 | self.depth = None 100 | 101 | self.freq = freq 102 | 103 | def __repr__(self): 104 | kwargs = ["{}={}".format(k, v) for k, v in self.__dict__.items() 105 | if v is not None and k not in ("surface", "head")] 106 | return "Token({}, {})".format(repr(self.surface), ', '.join(kwargs)) 107 | 108 | def __str__(self): 109 | return self.rawprint 110 | 111 | @property 112 | def rawprint(self): 113 | try: 114 | return '{}/{}/{}/{}'.format( 115 | self.surface, self.postag, self.deprel, self.headposition) 116 | except: 117 | return '{}/{}/{}/{}'.format( 118 | self.surface, self.postag, self.deprel, 0) 119 | 120 | 121 | def ngrams_from_tupledict(tuples): 122 | def keyfunc(x): 123 | return x['nid'] 124 | results = [] 125 | 126 | for key, group in itertools.groupby(tuples, keyfunc): 127 | group = list(group) 128 | 129 | tokens = [Token(t['surface'], t['position']-1, t['postag'], 130 | t['deprel'], headposition=t['headposition']-1) 131 | for t in group] 132 | 133 | ngram_freq = None if 'freq' not in group[0] else group[0]['freq'] 134 | 135 | ngram = Ngram(tokens, key, freq=ngram_freq) 136 | 137 | # filter out the TRASH :( 138 | # TODO: why is there TRASH 139 | # if [t.position for t in ngram] == list(range(len(ngram))) \ 140 | # and all(t.headposition < len(ngram) for t in ngram): 141 | results.append(ngram) 142 | 143 | return results 144 | 145 | 146 | def tokens_from_tupledict(tuples, collapse=True): 147 | def keyfunc(x): 148 | return x['surface'] 149 | 150 | results = [] 151 | if collapse: 152 | tuples.sort(key=lambda x: (x['surface'], -x['freq'] 153 | if 'freq' in x else 0)) 154 | for key, group in itertools.groupby(tuples, keyfunc): 155 | g = list(group) 156 | if 'freq' in g[0]: 157 | freq = sum(t['freq'] for t in g) 158 | results.append(Token(key, freq=freq)) 159 | 160 | results.sort(key=lambda t: t.freq if t.freq is not None else 0, 161 | reverse=True) 162 | 163 | else: 164 | for t in tuples: 165 | results.append(Token(t['surface'], t['tid'], t['postag'], 166 | t['deprel'], freq=t['freq'])) 167 | 168 | return results 169 | -------------------------------------------------------------------------------- /lib/ngramdb/ngramdb/constants.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # myria server and port 4 | REST_URL = "rest.myria.cs.washington.edu" 5 | REST_PORT = 1776 6 | 7 | # relation names in Myria 8 | NGRAM_RELATION = "\"lzilles:ngrams:ngram\"" 9 | TOKEN_RELATION = "\"lzilles:ngrams:token\"" 10 | DEP_RELATION = "\"lzilles:ngrams:dependency\"" 11 | POS_RELATION = "\"lzilles:ngrams:partofspeech\"" 12 | TT_RELATION = "\"lzilles:ngrams:ngram_token_token\"" 13 | 14 | # query limitations 15 | MIN_WORD_LEN = 3 16 | MIN_WORD_COUNT = 1 17 | 18 | # top-level sql query templates 19 | SQL_CONTEXT_TEMPLATE = """SELECT 20 | t.nid, nginfo.freq, t.position, t.surface, pos.postag, dep.deprel, 21 | t.headposition 22 | FROM 23 | "lzilles:ngrams:token" t, 24 | "lzilles:ngrams:dependency" dep, 25 | "lzilles:ngrams:partofspeech" pos, 26 | ({subquery}) AS nginfo 27 | 28 | WHERE 29 | nginfo.nid=t.nid 30 | AND pos.posid=t.posid 31 | AND dep.depid=t.depid 32 | 33 | ORDER BY 34 | nginfo.freq DESC, 35 | (t.nid, t.position) ASC; 36 | """ 37 | 38 | SQL_COUNT_TEMPLATE = """SELECT SUM(nginfo.freq) FROM ({subquery}) AS nginfo;""" 39 | 40 | # json query plan templates 41 | JSON_COUNT_TEMPLATE = json.loads(""" 42 | { 43 | "fragments": [ 44 | { 45 | "operators": [ 46 | { 47 | "opId": 0, 48 | "opType": 
"DbQueryScan", 49 | 50 | "schema": { 51 | "columnNames": [ 52 | "sum" 53 | ], 54 | "columnTypes": [ 55 | "LONG_TYPE" 56 | ] 57 | } 58 | 59 | }, 60 | { 61 | "opId": 1, 62 | "argChild": 0, 63 | "opType": "CollectProducer" 64 | } 65 | ] 66 | }, 67 | { 68 | "operators": [ 69 | { 70 | "opId": 2, 71 | "opType": "CollectConsumer", 72 | "argOperatorId": 1 73 | }, 74 | { 75 | "opId": 3, 76 | "opType": "Aggregate", 77 | "argChild": 2, 78 | "aggregators": [ 79 | { 80 | "type": "SingleColumn", 81 | "column": 0, 82 | "aggOps": ["SUM"] 83 | } 84 | ] 85 | }, 86 | { 87 | "opId": 4, 88 | "argChild": 3, 89 | "opType": "DbInsert", 90 | "argOverwriteTable": true, 91 | "relationKey": { 92 | "programName": "ngramoutput", 93 | "relationName": "TEMPOUTCOUNT", 94 | "userName": "lzilles" 95 | } 96 | } 97 | ] 98 | } 99 | ], 100 | 101 | "logicalRa": "", 102 | "rawQuery": "[ ngram count test ]", 103 | "language": "sql" 104 | } 105 | """) 106 | 107 | JSON_CONTEXT_TEMPLATE = json.loads(""" 108 | { 109 | "fragments": [ 110 | { 111 | "operators": [ 112 | { 113 | "opId": 0, 114 | "opType": "DbQueryScan", 115 | 116 | "schema": { 117 | "columnNames": [ 118 | "nid", 119 | "freq", 120 | "position", 121 | "surface", 122 | "postag", 123 | "deprel", 124 | "headposition" 125 | ], 126 | "columnTypes": [ 127 | "LONG_TYPE", 128 | "INT_TYPE", 129 | "INT_TYPE", 130 | "STRING_TYPE", 131 | "STRING_TYPE", 132 | "STRING_TYPE", 133 | "INT_TYPE" 134 | ] 135 | } 136 | 137 | }, 138 | { 139 | "opId": 1, 140 | "argChild": 0, 141 | "opType": "CollectProducer" 142 | } 143 | ] 144 | }, 145 | { 146 | "operators": [ 147 | { 148 | "opId": 2, 149 | "opType": "CollectConsumer", 150 | "argOperatorId": 1 151 | }, 152 | { 153 | "opId": 3, 154 | "opType": "InMemoryOrderBy", 155 | "opName": "InMemSort(results)", 156 | "argChild": 2, 157 | "argSortColumns": [ 158 | 1, 159 | 0, 160 | 2 161 | ], 162 | "argAscending": [ 163 | false, 164 | true, 165 | true 166 | ] 167 | }, 168 | { 169 | "opId": 4, 170 | "argChild": 3, 171 | "opType": "DbInsert", 172 | "argOverwriteTable": true, 173 | "relationKey": { 174 | "programName": "ngramoutput", 175 | "relationName": "TEMPOUT", 176 | "userName": "lzilles" 177 | } 178 | } 179 | ] 180 | } 181 | ], 182 | 183 | "logicalRa": "", 184 | "rawQuery": "[ ngram test ]", 185 | "language": "sql" 186 | } 187 | """) 188 | -------------------------------------------------------------------------------- /src/trained_factors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Factors whose potentials are trained (on the training data) (duh). 
3 | 4 | author: mbforbes 5 | """ 6 | 7 | # IMPORTS 8 | # ----------------------------------------------------------------------------- 9 | 10 | # stdlib 11 | import code # code.interact(local=dict(globals(), **locals())) 12 | from collections import Counter 13 | import sys 14 | 15 | # 3rd party 16 | import numpy as np 17 | import pandas as pd 18 | 19 | # local 20 | import data_turked as td 21 | import data_objects_turked as dot 22 | 23 | 24 | # CLASSES 25 | # ----------------------------------------------------------------------------- 26 | 27 | class UnaryFrameEmbedding(object): 28 | 29 | def __init__(self, framesplit): 30 | """ 31 | Args: 32 | framesplit (int) 33 | """ 34 | if framesplit == 5: 35 | filename = 'data/emb/frames-train5.csv' 36 | elif framesplit == 20: 37 | filename = 'data/emb/frames-train20.csv' 38 | else: 39 | print 'ERROR: Unknown frame split %r' % (framesplit) 40 | sys.exit(1) 41 | 42 | self.df = pd.read_csv(filename) 43 | 44 | def get(self, attr, framestr): 45 | """ 46 | Args: 47 | attr (str) 48 | framestr (str) 49 | 50 | Returns: 51 | np.ndarray of shape (3,) representing 52 | [p(>), p(<), p(=)] 53 | """ 54 | row = self.df[(self.df['attr'] == attr) & (self.df['framestr'] == framestr)] 55 | return row[['prob_greater', 'prob_lesser', 'prob_eq']].get_values().flatten() 56 | 57 | 58 | class UnaryObjpairEmbedding(object): 59 | 60 | def __init__(self, objpairsplit): 61 | """ 62 | Args: 63 | objpairsplit (int) 64 | """ 65 | if objpairsplit == 5: 66 | filename = 'data/emb/objpairs-train5.csv' 67 | elif objpairsplit == 20: 68 | filename = 'data/emb/objpairs-train20.csv' 69 | else: 70 | print 'ERROR: Unknown objpair split %r' % (objpairsplit) 71 | sys.exit(1) 72 | 73 | self.df = pd.read_csv(filename) 74 | 75 | def get(self, attr, obj1, obj2): 76 | """ 77 | Args: 78 | attr (str) 79 | obj1 (str) 80 | obj2 (str) 81 | 82 | Returns: 83 | np.ndarray of shape (3,) representing 84 | [p(>), p(<), p(=)] 85 | """ 86 | row = self.df[ 87 | (self.df['attr'] == attr) & 88 | (self.df['obj1'] == obj1) & 89 | (self.df['obj2'] == obj2)] 90 | return row[['prob_greater', 'prob_lesser', 'prob_eq']].get_values().flatten() 91 | 92 | 93 | class SelPrefEmbedding(object): 94 | 95 | def __init__(self, filename): 96 | self.df = pd.read_csv(filename) 97 | 98 | def get(self, attr, frame, obj1, obj2): 99 | """ 100 | Args: 101 | attr (str) 102 | frame (str) 103 | obj1 (str) 104 | obj2 (str) 105 | 106 | Returns: 107 | np.ndarray of shape (3,3) representing 108 | objp > objp = objp < 109 | frame > [[ p p p] 110 | frame = [ p p p] 111 | frame < [ p p p]] 112 | """ 113 | row = self.df[ 114 | (self.df['attr'] == attr) & 115 | (self.df['frame'] == frame) & 116 | (self.df['obj1'] == obj1) & 117 | (self.df['obj2'] == obj2)] 118 | try: 119 | return row[['gg', 'ge', 'gl', 'eg', 'ee', 'el', 'lg', 'le', 'll']].get_values().flatten().reshape((3,3)) 120 | except: 121 | code.interact(local=dict(globals(), **locals())) 122 | 123 | 124 | class SelPrefMLE(object): 125 | 126 | def __init__(self, pmi): 127 | """ 128 | Args: 129 | pmi (ngramdb.PMI) 130 | """ 131 | self.pmi = pmi 132 | 133 | def get(self, attr, frame_agreement_needed, objpair_agreement_needed, 134 | pmi_cutoff, objsplit): 135 | """ 136 | Args: 137 | attr (str) 138 | frame_agreement_needed (int) 139 | objpair_agreement_needed (int) 140 | pmi_cutoff (float) 141 | objsplit (int) 142 | 143 | Returns: 144 | np.ndarray (3,3) selectional preference potential (frame, objpair) 145 | for attr 146 | """ 147 | if objsplit == 5: 148 | objdir = dot.DIR_5 149 | elif objsplit == 
20:
150 |             objdir = dot.DIR_20
151 |         else:
152 |             print 'ERROR: Unimplemented split: %r' % (objsplit)
153 |             sys.exit(1)
154 | 
155 |         frames_expanded = td.TurkedData.load_raw(
156 |             'train', attr, frame_agreement_needed)
157 |         # pull off just v_s_p str and gold label
158 |         frames = [(fe[4], fe[2]) for fe in frames_expanded]
159 |         objpairs = dot.DataObjectsTurked.load_raw(
160 |             'train', attr, objpair_agreement_needed, True, objdir)
161 | 
162 |         # counts maps frame gold -> objpair gold. init'ing now rather than
163 |         # checking for missing later.
164 |         counts = {
165 |             td.LABEL_GREATER: Counter(),
166 |             td.LABEL_LESSER: Counter(),
167 |             td.LABEL_EQ: Counter(),
168 |         }
169 |         for f in frames:
170 |             framestr, frame_gold = f
171 |             for o in objpairs:
172 |                 obj1, obj2, objpair_gold = o
173 | 
174 |                 # get PMI. only count if >= cutoff
175 |                 pmi_score = self.pmi.query(framestr, (obj1, obj2))
176 |                 if pmi_score >= pmi_cutoff:
177 |                     counts[frame_gold][objpair_gold] += 1
178 | 
179 |         flat = np.array([
180 |             float(counts[td.LABEL_GREATER][dot.LABEL_GREATER]),
181 |             float(counts[td.LABEL_GREATER][dot.LABEL_LESSER]),
182 |             float(counts[td.LABEL_GREATER][dot.LABEL_EQ]),
183 |             float(counts[td.LABEL_LESSER][dot.LABEL_GREATER]),
184 |             float(counts[td.LABEL_LESSER][dot.LABEL_LESSER]),
185 |             float(counts[td.LABEL_LESSER][dot.LABEL_EQ]),
186 |             float(counts[td.LABEL_EQ][dot.LABEL_GREATER]),
187 |             float(counts[td.LABEL_EQ][dot.LABEL_LESSER]),
188 |             float(counts[td.LABEL_EQ][dot.LABEL_EQ]),
189 |         ])
190 | 
191 |         # per-row norm (i.e. marginal)
192 |         res = flat.reshape((3,3))
193 |         for i in range(res.shape[0]):
194 |             res[i, :] /= sum(res[i, :])
195 | 
196 |         return res
197 | 
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: verbphysics
3 | tagline: Maxwell Forbes & Yejin Choi — ACL 2017
4 | ---
5 | 
6 | # About
7 | 
8 | The **Verb Physics** project explores how our choice of verbs entails relations
9 | between the physical properties of the objects we talk about.
10 | 
11 | > Mary threw _____.
12 | 
13 | _Whatever Mary threw (a ball? a rock?) is probably smaller and weighs less than
14 | her._
15 | 
16 | > Ricardo walked into _____.
17 | 
18 | _Whatever Ricardo walked into (the library? his office?) is probably larger
19 | than him._
20 | 
21 | # Demo
22 | 
23 | Explore an interactive visualization of our factor graph model on the Verb
24 | Physics dataset. Click and drag on components of the factor graph to move them
25 | around.
26 | 
27 | 
[Interactive demo widget: a factor graph drawing canvas plus a text input with
live, clickable autocomplete suggestions.]

Type below to select an action frame to visualize. All action frame names
start with one of the five attributes: "size," "weight," "strength,"
"rigidness," or "speed."

Completions (live) (clickable):
55 | 
56 | ## Explanation
57 | 
58 | The interactive diagram draws a small piece of the factor graph that is focused
59 | on the selected action frame. The colors correspond to the model's decisions
60 | about each random variable. Red indicates a decision
61 | that a random variable should take the value `>`, blue represents `<`, and grey represents
63 | `=`. (Grey is uncommon.)
64 | 
65 | These decisions have different meanings depending on what the random variable
66 | represents. There are two different types of random variables:
67 | 
68 | 1. **Object pairs** - If a random variable represents two objects—for example,
69 |    `person_vs_house`—then the decision for that random variable represents the
70 |    model's choice about the relation of those two objects along the given
71 |    attribute. For example, if we are looking at an action frame for `size`,
72 |    then we would expect `person_vs_house` to take the value `<`, because people
73 |    are generally smaller than houses.
74 | 
75 | 2. **Action frames** — If a random variable represents an action frame—for
76 |    example, `threw_d`—then the decision for that random variable represents
77 |    the model's choice about the relation of two objects that would fit in that
78 |    action frame. For example, if we are looking at an action frame for `size`,
79 |    then we would expect `threw_d` (which represents `<person> threw <object>`; see
80 |    below for more details) to take the value `>`, because people are generally
81 |    larger in size than the objects that they throw.
82 | 
83 | ## Action frame names
84 | 
85 | The format for the action frame names is:
86 | 
87 | ```
88 | <attribute>-<verb>_<construction>[_<preposition>]
89 | ```
90 | 
91 | For example, `size-threw_dp_into` names the `size` frame for the verb `threw` in the `dp` construction with the preposition `into`. The possible attributes are: `size`, `weight`, `strength`, `rigidness`, `speed`.
92 | 
93 | There are five possible action frame constructions. Each corresponds to a
94 | syntactic template.
95 | 
96 | Construction | Syntax template | Example | Example sentence
97 | --- | --- | --- | ---
98 | **`d`** | `<person> <verb> <object>` | `threw_d` | "I threw the rock."
99 | **`od`** | `<object> <verb> <object>` | `hit_od` | "The tape hit the ground."
100 | **`p`** | `<person> <verb> <preposition> <object>` | `threw_p_out` | "I threw out the trash."
101 | **`op`** | `<object> <verb> <preposition> <object>` | `landed_op_in` | "The trash landed in the bin."
102 | **`dp`** | `<person> <verb> <object> <preposition> <object>` | `threw_dp_into` | "I threw the trash into the bin."
103 | 
104 | # Abstract
105 | 
106 | Learning commonsense knowledge from natural language text is nontrivial due to
107 | reporting bias: people rarely state the obvious, e.g., "My house is bigger than
108 | me." However, while rarely stated explicitly, this trivial everyday knowledge
109 | does influence the way people talk about the world, which provides indirect
110 | clues to reason about the world. For example, a statement like, "Tyler entered
111 | his house" implies that his house is bigger than Tyler.
112 | 
113 | In this paper, we present an approach to infer relative physical knowledge of
114 | actions and objects along five dimensions (e.g., size, weight, and strength)
115 | from unstructured natural language text. We frame knowledge acquisition as joint
116 | inference over two closely related problems: learning (1) relative physical
117 | knowledge of object pairs and (2) physical implications of actions when applied
118 | to those object pairs. Empirical results demonstrate that it is possible to
119 | extract knowledge of actions and objects from language and that joint inference
120 | over different types of knowledge improves performance.
121 | 
122 | # Authors
123 | 
125 | 126 | A picture of Maxwell Forbes 127 | 128 |

Maxwell Forbes

129 |
130 | 131 |
132 | 133 | A picture of Yejin Choi 134 | 135 |

Yejin Choi

136 |
137 | 138 | # Paper 139 | 140 | The paper is available on [arXiv](https://arxiv.org/abs/1706.03799). 141 | 142 | [![a thumbnail rendering of the ACL 2017 verb physics paper](thumb-all-resized.png)](https://arxiv.org/abs/1706.03799) 143 | 144 | # Bibtex 145 | 146 | ``` 147 | @inproceedings{forbes2017verb, 148 | title = {Verb Physics: Relative Physical Knowledge of Actions and Objects}, 149 | author = {Maxwell Forbes and Yejin Choi}, 150 | booktitle = {ACL}, 151 | year = {2017} 152 | } 153 | ``` 154 | 155 | # Data 156 | 157 | The data is available in the [`verbphysics` GitHub repository under 158 | `data/`](https://github.com/uwnlp/verbphysics/tree/master/data). 159 | 160 | See the repository [README](https://github.com/uwnlp/verbphysics#data) for more 161 | information on the data splits and task setup. 162 | 163 | # Code 164 | 165 | Visit the [`verbphysics` GitHub 166 | repository](https://github.com/uwnlp/verbphysics) for our reference 167 | implementation and instructions for running our code. 168 | 169 | It is released under the permissive MIT license. 170 | 171 | ## Thanks 172 | 173 | - to [Hannah Rashkin](https://homes.cs.washington.edu/~hrashkin/) for 174 | inspiration with her [connotation frames 175 | visualizer](https://homes.cs.washington.edu/~hrashkin/connframe_vis.php) 176 | 177 | - to the [Stanford Vision Lab](http://vision.stanford.edu/) for inspiration 178 | with good project webpage designs ([example](http://cs.stanford.edu/people/ranjaykrishna/im2p/index.html)) 179 | -------------------------------------------------------------------------------- /data/verbphysics/objects/train-5/train.csv: -------------------------------------------------------------------------------- 1 | ,obj1,obj2,size-agree,size-maj,weight-agree,weight-maj,strength-agree,strength-maj,rigidness-agree,rigidness-maj,speed-agree,speed-maj 2 | 0,person,dress,1,-42,3,1,3,1,1,-42,1,-42 3 | 1,person,step,3,1,3,1,3,-1,2,-1,3,1 4 | 2,body,mouth,3,1,3,1,3,1,2,1,2,-1 5 | 3,sun,coal,3,1,3,1,2,1,1,-42,2,1 6 | 4,vessel,something,3,-42,3,-42,3,-42,3,-42,3,-42 7 | 5,place,farm,3,-42,3,-42,2,-42,3,-42,2,0 8 | 6,master,dress,3,1,3,1,3,1,3,1,3,1 9 | 7,ground,body,3,1,3,1,3,1,3,1,3,-1 10 | 8,ash,mouth,3,-1,3,-1,3,-1,2,-1,2,-1 11 | 9,gentleman,knife,3,1,3,1,2,-1,2,-1,2,1 12 | 10,train,face,3,1,3,1,3,1,3,1,3,1 13 | 11,friend,mouth,3,1,3,1,3,1,2,1,2,0 14 | 12,energy,sun,2,-42,2,-1,1,-42,2,-42,2,-42 15 | 13,father,basin,3,1,2,1,2,1,2,-1,3,1 16 | 14,bag,gate,3,-1,3,-1,3,-1,3,-1,2,-42 17 | 15,brother,book,3,1,3,1,3,1,3,-1,3,1 18 | 16,way,road,2,-42,3,-42,2,0,2,0,2,0 19 | 17,back,something,2,-42,2,-42,2,-42,2,-42,2,-42 20 | 18,lady,car,3,-1,3,-1,3,-1,3,-1,3,-1 21 | 19,dinner,daughter,3,-1,3,-1,3,-1,1,-42,3,-1 22 | 20,person,lad,3,0,3,0,3,0,3,0,3,0 23 | 21,fist,hand,2,-1,3,0,2,0,2,0,2,0 24 | 22,ground,room,1,-42,2,1,2,1,1,-42,2,-42 25 | 23,child,doorway,3,-1,2,-1,2,-1,3,-1,3,1 26 | 24,victim,face,3,1,3,1,2,1,3,0,2,0 27 | 25,rain,light,1,-42,1,-42,2,-42,1,-42,3,-1 28 | 26,horse,coal,3,1,3,1,2,1,2,-1,3,1 29 | 27,poet,door,2,-1,2,1,1,-42,2,-1,2,-42 30 | 28,brother,ball,3,1,3,1,3,1,3,-1,2,-1 31 | 29,lady,direction,3,-42,2,-42,2,-42,2,-42,2,-42 32 | 30,house,sea,3,-1,2,-1,3,-42,2,-42,2,-1 33 | 31,coach,arm,3,1,3,1,3,1,2,0,2,0 34 | 32,lady,object,3,-42,3,-42,3,-42,3,-42,3,-42 35 | 33,something,hand,2,1,2,1,2,-42,2,-42,2,-42 36 | 34,father,seal,1,-42,1,-42,1,-42,2,-42,2,-42 37 | 35,edition,place,3,-42,3,-42,3,-42,3,-42,3,-42 38 | 36,room,wife,2,1,2,1,2,1,2,1,2,-42 39 | 37,messenger,camp,2,-1,2,-1,2,-42,2,-1,2,1 40 | 
38,window,floor,3,-1,3,-1,3,-1,2,0,3,0 41 | 39,place,hand,3,1,2,1,2,1,2,1,3,-1 42 | 40,door,floor,3,-1,3,-1,2,0,3,0,2,1 43 | 41,bay,boat,3,1,2,-42,1,-42,1,-42,3,-1 44 | 42,food,way,3,-42,3,-42,3,-42,3,-42,3,-42 45 | 43,hat,back,3,-1,3,-1,3,-1,3,-1,2,-1 46 | 44,someone,dinner,3,1,3,1,2,1,1,-42,3,1 47 | 45,someone,fool,2,-42,1,-42,2,-42,2,-42,2,-42 48 | 46,stone,hand,1,-42,2,1,3,1,3,1,2,-1 49 | 47,ice,head,2,-42,1,-42,2,1,2,1,2,-42 50 | 48,coach,hat,3,1,3,1,3,1,3,1,3,1 51 | 49,ear,something,3,-42,3,-42,3,-42,3,-42,3,-42 52 | 50,someone,boy,2,-42,2,-42,2,-42,2,0,1,-42 53 | 51,stone,bed,2,-1,2,-1,2,1,3,1,1,-42 54 | 52,person,daughter,2,1,2,1,2,1,1,-42,2,1 55 | 53,person,barn,3,-1,3,-1,3,-1,3,-1,3,1 56 | 54,sun,tree,3,1,3,1,2,1,2,-42,2,1 57 | 55,door,light,2,1,3,1,3,1,2,1,2,-1 58 | 56,ball,mouth,1,-42,1,-42,1,-42,2,1,1,-42 59 | 57,child,picture,3,1,3,1,3,1,3,-1,3,1 60 | 58,brother,hand,3,1,3,1,3,1,2,0,3,0 61 | 59,back,air,2,-42,2,1,2,1,2,1,2,-42 62 | 60,gentleman,ball,2,1,2,1,2,1,2,1,1,-42 63 | 61,window,end,2,-42,2,-42,2,-42,2,-42,2,-42 64 | 62,step,road,3,-1,3,-1,3,-1,2,-1,2,1 65 | 63,result,element,2,-42,1,-42,1,-42,2,-42,2,-42 66 | 64,parent,child,3,1,3,1,3,1,2,0,2,0 67 | 65,sun,sail,3,1,3,1,2,1,2,-42,2,1 68 | 66,river,breath,2,1,3,1,3,1,2,1,3,1 69 | 67,vessel,anchor,3,1,3,1,2,1,2,0,2,1 70 | 68,friend,newspaper,3,1,3,1,3,1,3,1,3,1 71 | 69,everything,master,3,1,3,1,2,1,2,-42,2,-42 72 | 70,coast,place,2,1,2,1,2,-42,2,-42,1,-42 73 | 71,state,way,2,-42,2,-42,2,-42,2,-42,3,-42 74 | 72,anchor,mouth,2,1,2,1,2,1,2,1,3,-42 75 | 73,hair,room,3,-1,3,-1,3,-1,3,-1,1,-42 76 | 74,sea,sail,3,1,3,1,2,1,2,-1,2,1 77 | 75,temple,something,3,-42,3,-42,3,-42,3,-42,3,-42 78 | 76,system,end,3,-42,3,-42,3,-42,3,-42,3,-42 79 | 77,stone,way,2,-42,2,-42,2,-42,2,-42,2,-42 80 | 78,sun,ear,3,1,3,1,3,1,2,1,2,1 81 | 79,anything,end,2,-42,3,-42,3,-42,3,-42,2,-42 82 | 80,father,truck,3,-1,3,-1,3,-1,3,-1,3,-1 83 | 81,head,ball,2,0,3,1,2,1,1,-42,3,-1 84 | 82,hip,hand,3,1,3,1,2,1,1,-42,2,-1 85 | 83,body,direction,2,-42,2,-42,2,-42,2,-42,2,-42 86 | 84,king,camp,3,-1,3,-1,2,-1,1,-42,3,1 87 | 85,bag,way,3,-42,3,-42,3,-42,3,-42,3,-42 88 | 86,person,wife,3,0,3,0,3,0,3,0,3,0 89 | 87,hair,floor,3,-1,3,-1,3,-1,3,-1,2,1 90 | 88,ball,light,1,-42,2,1,2,1,1,-42,3,-1 91 | 89,heaven,face,2,1,2,-42,2,-42,2,-42,2,-1 92 | 90,knife,throat,1,-42,2,-1,2,1,2,1,2,-42 93 | 91,someone,light,1,-42,3,1,2,1,2,-1,2,-1 94 | 92,chair,window,2,0,2,1,3,1,2,0,3,0 95 | 93,person,fox,3,1,3,1,2,1,2,0,3,-1 96 | 94,sea,middle,2,-42,3,-42,3,-42,3,-42,3,-42 97 | 95,messenger,master,1,-42,1,-42,1,-42,2,0,1,-42 98 | 96,system,something,3,-42,3,-42,3,-42,3,-42,3,-42 99 | 97,shirt,hand,3,1,2,-1,2,-1,1,-42,1,-42 100 | 98,person,ice,3,1,3,1,3,1,2,-1,2,1 101 | 99,step,flood,2,-1,2,-42,2,-42,2,-42,2,-42 102 | 100,daughter,call,2,-42,2,1,2,-42,2,-42,3,-42 103 | 101,eye,fist,3,-1,3,-1,3,-1,2,-1,2,-42 104 | 102,house,hill,3,-1,3,-1,3,-1,2,-1,2,0 105 | 103,stream,hand,3,1,2,1,2,-42,2,-1,3,1 106 | 104,current,shore,2,1,1,-42,2,1,2,-1,3,1 107 | 105,sea,call,2,-42,2,-42,2,-42,2,-42,2,-42 108 | 106,ship,hand,3,1,3,1,3,1,3,1,1,-42 109 | 107,child,glass,2,1,2,1,1,-42,2,-1,2,1 110 | 108,way,end,3,-42,3,-42,3,-42,3,-42,3,-42 111 | 109,lady,eye,2,1,2,1,2,1,2,1,1,-42 112 | 110,house,back,1,-42,1,-42,1,-42,1,-42,2,-1 113 | 111,fist,mouth,2,0,1,-42,2,1,2,1,1,-42 114 | 112,door,wife,2,1,1,-42,1,-42,2,1,2,-1 115 | 113,bay,way,3,-42,3,-42,3,-42,3,-42,3,-42 116 | 114,object,hand,3,-42,3,-42,3,-42,3,-42,3,-42 117 | 115,flood,end,3,-42,3,-42,3,-42,3,-42,3,-42 118 | 
116,eye,direction,3,-42,3,-42,3,-42,2,-42,2,-42 119 | 117,river,boat,3,1,3,1,2,1,2,-1,2,1 120 | 118,brother,coal,3,1,2,1,3,1,2,-1,3,1 121 | 119,victim,house,3,-1,3,-1,2,-1,2,-1,3,1 122 | 120,brother,clothes,2,1,3,1,3,1,3,1,2,0 123 | 121,child,purse,3,1,3,1,3,1,2,-1,3,1 124 | 122,bank,flood,3,-42,3,-42,2,-42,2,-42,2,-1 125 | 123,house,farm,3,-1,3,-1,2,-1,1,-42,3,0 126 | 124,side,current,3,-42,3,-42,3,-42,2,-42,2,-42 127 | 125,gentleman,book,3,1,3,1,3,1,3,-1,3,1 128 | 126,ground,king,3,1,3,1,3,1,3,1,3,-1 129 | 127,father,world,3,-1,3,-1,3,-1,3,-1,2,1 130 | 128,wall,hand,3,1,3,1,3,1,3,1,3,-1 131 | 129,grass,hand,3,-1,3,-1,3,-1,2,1,3,-1 132 | 130,bank,suit,3,1,3,1,3,1,3,1,1,-42 133 | 131,patient,glass,2,1,3,1,1,-42,2,-1,2,1 134 | 132,gentleman,train,3,-1,3,-1,3,-1,3,-1,3,-1 135 | 133,meal,piece,3,1,3,1,2,-42,2,-42,2,-42 136 | 134,sun,breath,3,1,3,1,3,1,1,-42,1,-42 137 | 135,everything,child,3,1,3,1,3,1,2,1,2,1 138 | 136,hat,response,2,-42,2,1,2,-42,1,-42,3,-1 139 | 137,torrent,mountain,2,-1,2,-1,1,-42,2,-42,3,1 140 | 138,boy,farm,3,-1,2,-1,3,-1,2,-1,3,1 141 | 139,office,picture,3,1,3,1,1,-42,1,-42,3,0 142 | 140,gentleman,stream,3,-1,3,-1,1,-42,3,1,2,-1 143 | 141,house,barn,1,-42,1,-42,2,0,2,0,2,0 144 | 142,bag,everything,2,-42,2,-42,2,-42,2,-42,2,-42 145 | 143,coach,bank,2,-1,2,-1,2,-1,2,-1,2,1 146 | 144,gentleman,eye,3,1,3,1,3,1,2,-1,2,1 147 | 145,person,ship,3,-1,3,-1,3,-1,3,-1,2,-1 148 | 146,someone,eye,2,1,2,1,2,1,1,-42,1,-42 149 | 147,father,light,2,-42,3,1,2,1,3,1,2,-1 150 | 148,river,sun,3,-1,3,-1,3,-1,2,-1,2,-42 151 | 149,sun,head,3,1,3,1,3,1,2,1,2,1 152 | 150,someone,piece,2,-42,2,-42,3,-42,3,-42,3,-42 153 | 151,gentleman,room,3,-1,3,-1,3,-1,3,-1,3,1 154 | 152,lady,stream,3,-1,2,-1,2,1,2,1,3,-1 155 | 153,foot,wall,3,-1,3,-1,3,-1,3,-1,3,1 156 | 154,breath,soul,2,-42,2,0,1,-42,2,0,1,-42 157 | 155,daughter,anything,2,-42,2,-42,3,-42,2,-42,3,-42 158 | 156,back,room,2,-42,2,-42,2,-42,2,-42,2,-42 159 | 157,scene,room,2,-42,2,-42,2,-42,2,-42,2,-42 160 | 158,hair,effect,3,-42,2,-42,2,-42,3,-42,2,-42 161 | 159,king,effect,1,-42,2,-42,2,-42,2,-42,2,-42 162 | 160,car,hand,3,1,3,1,3,1,3,1,3,1 163 | 161,town,picture,3,1,3,1,3,1,2,1,2,-42 164 | 162,lady,picture,3,1,3,1,3,1,2,-1,3,1 165 | 163,window,air,2,-1,3,1,2,1,3,1,3,-1 166 | 164,piano,suit,3,1,3,1,3,1,2,1,2,-42 167 | 165,father,bag,3,1,3,1,3,1,2,1,3,1 168 | 166,exile,end,2,-42,2,-42,2,-42,2,-42,2,-42 169 | 167,house,picture,3,1,3,1,3,1,3,1,2,-1 170 | 168,office,air,2,-1,2,1,2,-42,1,-42,2,-1 171 | 169,skirt,knee,1,-42,2,-1,1,-42,2,-1,2,-1 172 | 170,body,room,3,-1,2,-1,2,-1,3,-1,3,1 173 | 171,someone,child,3,1,3,1,3,1,2,0,1,-42 174 | 172,lady,hand,2,1,2,1,2,0,1,-42,1,-42 175 | 173,person,elbow,3,1,3,1,2,1,2,0,2,0 176 | 174,river,current,3,1,2,1,2,-1,2,0,2,-1 177 | 175,head,light,2,-1,3,1,3,1,3,1,2,-1 178 | 176,fox,goose,2,1,2,1,2,1,3,0,2,1 179 | 177,person,deck,3,-1,3,-1,3,-1,3,-1,3,1 180 | 178,boy,something,3,-42,3,-42,3,-42,3,-42,3,-42 181 | 179,phone,room,3,-1,3,-1,2,-1,2,-1,1,-42 182 | 180,call,way,2,-42,2,-42,2,-42,2,-42,2,-42 183 | 181,boy,face,3,1,3,1,3,1,1,-42,2,0 184 | 182,energy,hand,2,-42,2,-1,1,-42,2,-1,2,1 185 | -------------------------------------------------------------------------------- /src/data_turked.py: -------------------------------------------------------------------------------- 1 | """ 2 | data_turked is for turked gold annotations of *FRAMES*. Often imported as 'td' 3 | as in 'TurkedData'. 
4 | 5 | author: mbforbes 6 | """ 7 | 8 | # IMPORTS 9 | # ----------------------------------------------------------------------------- 10 | 11 | # builtins 12 | import logging 13 | import os 14 | 15 | # 3rd party 16 | import pandas as pd 17 | 18 | 19 | # CONSTANTS 20 | # ----------------------------------------------------------------------------- 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | LABEL_GREATER = 1 25 | LABEL_EQ = 0 26 | LABEL_LESSER = -1 27 | LABEL_UNK = -42 28 | 29 | # The processed file with frame data. 30 | PROCESSED_FILE = 'data/verbphysics/action-frames/action-frames.csv' 31 | 32 | # Directories where we saved train/dev/test splits. 33 | SPLIT_DIR_5 = 'data/verbphysics/action-frames/train-5/' 34 | SPLIT_DIR_20 = 'data/verbphysics/action-frames/train-20/' 35 | 36 | 37 | # CLASSES 38 | # ----------------------------------------------------------------------------- 39 | 40 | class TurkedData(object): 41 | """ 42 | Methods for loading / working with the turked data format, which is a dict 43 | of the form (henceforth known as TurkedDict): 44 | 45 | { 46 | 'size': [ 47 | (verb_sub[_prep]_1, np.ndarray), 48 | (verb_sub[_prep]_2, np.ndarray), 49 | ... 50 | ], 51 | 'weight': [ 52 | ... 53 | ], 54 | ... 55 | } 56 | """ 57 | 58 | @staticmethod 59 | def load(fn, agreement_needed, bigger_pot, smaller_pot, eq_pot): 60 | """ 61 | Loads Turked data using specified settings. 62 | 63 | NOTE(mbforbes): Could do different potentials for stronger agreement. 64 | 65 | Args: 66 | fn (str): Pandas converted CSV file. To generate this, run 67 | `notebooks/verb_process.ipynb`. 68 | 69 | agreement_needed (int): number out of 3 of agreement needed before 70 | using a turked data point 71 | 72 | bigger_pot (np.ndarray of 1x3) Potentials for data with "bigger" GT. 73 | 74 | smaller_pot (np.ndarray of 1x3) Potentials for data with "smaller" 75 | GT. 76 | 77 | eq_pot (np.ndarray of 1x3) Potentials for data with "equal" GT. 78 | 79 | Returns: 80 | TurkedDict 81 | 82 | """ 83 | label_pot_map = { 84 | LABEL_GREATER: bigger_pot, 85 | LABEL_EQ: eq_pot, 86 | LABEL_LESSER: smaller_pot, 87 | } 88 | 89 | # select attributes to load 90 | attrs = ['size', 'weight', 'verb-speed', 'hardness', 'rigidness'] 91 | 92 | # load up 93 | df = pd.read_csv(fn) 94 | 95 | res = {} 96 | for attr in attrs: 97 | col_ag = attr + '-agree' 98 | col_maj = attr + '-maj' 99 | tuples = [] 100 | 101 | # Pick only rows that agree on a non-UNK result. 102 | data = df[(df[col_ag] >= agreement_needed) & (df[col_maj] != LABEL_UNK)] 103 | 104 | # NOTE(mbforbes): We could have two variants of the potentials, one 105 | # for unanimous agreement, and a less strong one for 2/3 agreement. 106 | # This uses one for all. 107 | for _, row in data.iterrows(): 108 | v, s, p = row['verb'], row['sub'], row['prep'] 109 | name = TurkedData.vsp_to_str(v, s, p) 110 | pot = label_pot_map[row[col_maj]] 111 | tuples += [(name, pot)] 112 | 113 | # Save this attribute's tuples 114 | res[attr] = tuples 115 | return res 116 | 117 | @staticmethod 118 | def load_raw(partition, attr, agreement_needed, fn=PROCESSED_FILE, split_dir=SPLIT_DIR_5): 119 | """ 120 | Loads up `attr` data of `partition` of csv file `fn`, filtering out 121 | those with agreement < agreement_needed and those with majority of 'unk'. 
122 | 123 | Returns list where each item is: 124 | 125 | (verb, preposition|None, gold_label, one-hot vector of frame type, v_s_p) 126 | 127 | Where gold_label is one of: 128 | 129 | LABEL_GREATER (1) 130 | LABEL_EQ (0) 131 | LABEL_LESSER (-1) 132 | 133 | Frame type is a 5-d one-hot vector of frame type: 134 | 135 | [_d, _p, _od, _op, _dp] 136 | 137 | And v_s_p is the string representation of the full frame, i.e., 138 | 139 | verb_sub_preposition 140 | 141 | Args: 142 | partition (str) one of {'train', 'dev', 'test'} 143 | attr (str) 144 | agreement_needed (int) 145 | fn (str) location of the processed csv data file 146 | d (str, default=SPLIT_DIR_5) directory where we find train / dev / 147 | test verb splits 148 | 149 | Returns: 150 | [(str, str|None, int, [int], str)] 151 | """ 152 | # load up verbs in partition list 153 | with open(os.path.join(split_dir, partition + '.txt')) as f: 154 | verbs = set([line.strip() for line in f.readlines()]) 155 | 156 | # load full data 157 | df = pd.read_csv(fn) 158 | 159 | # filter agreement_needed for attr (and out maj unks). Wanted to filter 160 | # verbs here but can't broadcast the 'in set' operation, I guess. 161 | filtered = df[ 162 | (df[attr + '-agree'] >= agreement_needed) & 163 | (df[attr + '-maj'] != LABEL_UNK)] 164 | 165 | # sub -> one hot vector. This could be represented more concisely (e.g., 166 | # the index to one-hot), but this is clearer to look at. 167 | sub_to_onehot = { 168 | '_d': [1, 0, 0, 0, 0], 169 | '_p': [0, 1, 0, 0, 0], 170 | '_od': [0, 0, 1, 0, 0], 171 | '_op': [0, 0, 0, 1, 0], 172 | '_dp': [0, 0, 0, 0, 1], 173 | } 174 | 175 | # create results, filtering out verbs not in partition 176 | res = [] 177 | for _, row in filtered.iterrows(): 178 | if row['verb'] not in verbs: 179 | continue 180 | res.append(( 181 | row['verb'], 182 | row['prep'] if not pd.isnull(row['prep']) else None, 183 | row[attr + '-maj'], 184 | sub_to_onehot[row['sub']], 185 | TurkedData.vsp_to_str(row['verb'], row['sub'], row['prep']), 186 | )) 187 | return res 188 | 189 | @staticmethod 190 | def n_verbs(data): 191 | """ 192 | Counts the number of unique verbs in data. 193 | 194 | Args: 195 | data (single attribute list (element) of TurkedDict) 196 | 197 | Returns: 198 | int 199 | """ 200 | verbs = [] 201 | for d in data: 202 | node = d[0] 203 | v, _, _ = TurkedData.str_to_vsp(node) 204 | verbs += [v] 205 | return len(set(verbs)) 206 | 207 | @staticmethod 208 | def train_dev_test_split(data, d): 209 | """ 210 | Splits data into train, dev, test sections. 211 | 212 | Args: 213 | data (single attribute list (element) of TurkedDict) 214 | 215 | d (str) Directory in which we find train.txt, dev.txt, test.txt 216 | files, which are just one verb per line verb list files. 217 | 218 | Returns: 219 | 3x tuple, each is a list just like the input data arg. 
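        Example (hypothetical; `fn` and the potential arguments are
        placeholders for values defined elsewhere):

            turked = TurkedData.load(fn, 2, bigger_pot, smaller_pot, eq_pot)
            train, dev, test = TurkedData.train_dev_test_split(
                turked['size'], SPLIT_DIR_5)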
220 | """ 221 | # Load up verb lists 222 | verb_splits = ['train', 'dev', 'test'] 223 | verb_map = {} 224 | for s in verb_splits: 225 | with open(os.path.join(d, s + '.txt')) as f: 226 | for v in f.readlines(): 227 | verb_map[v.strip()] = s 228 | 229 | # Split data 230 | data_splits = { 231 | 'train': [], 232 | 'dev': [], 233 | 'test': [], 234 | } 235 | for datum in data: 236 | v, _, _ = TurkedData.str_to_vsp(datum[0]) 237 | split = verb_map[v] 238 | data_splits[split] += [datum] 239 | 240 | # Log info 241 | total_frames = sum([len(frames) for _, frames in data_splits.iteritems()]) 242 | logger.debug('Data splits:') 243 | for split in verb_splits: 244 | frames = data_splits[split] 245 | logger.debug('\t%s: %d frames (%0.2f%%) (%d verbs)', split, len(frames), float(len(frames) * 100) / total_frames, TurkedData.n_verbs(frames)) 246 | 247 | return data_splits['train'], data_splits['dev'], data_splits['test'] 248 | 249 | @staticmethod 250 | def str_to_vsp(node): 251 | """ 252 | Args: 253 | str 254 | 255 | Returns: 256 | str (verb), 257 | str (_sub) (yes, includes the '_'), 258 | str|None (prep) (or None if sub has no prep) 259 | """ 260 | pieces = node.split('_') 261 | if len(pieces) == 2: 262 | # No prep 263 | return pieces[0], '_' + pieces[1], None 264 | elif len(pieces) == 3: 265 | # Has a prep 266 | return pieces[0], '_' + pieces[1], pieces[2] 267 | else: 268 | assert False, 'Malformed node string: %r' % (node) 269 | 270 | @staticmethod 271 | def vsp_to_str(v, s, p): 272 | """ 273 | Args: 274 | v (str) verb 275 | s (str) sub (_p, _d, etc.) 276 | p (str) preposition 277 | 278 | Returns: 279 | str 280 | """ 281 | res = v + s 282 | if not pd.isnull(p): 283 | res += '_' + p 284 | return res 285 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Entry point to verbphysics system. 3 | 4 | author: mbforbes 5 | """ 6 | 7 | # IMPORTS 8 | # ----------------------------------------------------------------------------- 9 | 10 | # Logging first this was a fun bug. 11 | import logging 12 | import util 13 | util.ensure_dir('log/') 14 | logging.basicConfig( 15 | level=logging.DEBUG, 16 | format='%(asctime)s %(name)-16s %(levelname)-8s %(message)s', 17 | datefmt='%m-%d %H:%M:%S', 18 | filename='log/latest.log', 19 | filemode='w') 20 | base_logger = logging.getLogger(__name__) 21 | 22 | # builtins 23 | import argparse 24 | import sys 25 | import time 26 | 27 | # 3rd party 28 | import factorgraph as fg 29 | 30 | # local 31 | import attrgraph 32 | import data as ng 33 | # If I really have to do this then it's a gross oversight of python's. 34 | from data import SizeQueryN 35 | import glove 36 | import data_turked as td 37 | from settings import Settings 38 | 39 | 40 | # GLOBALS (SORRY) 41 | # ----------------------------------------------------------------------------- 42 | 43 | CONSOLE_LOG_LEVEL = logging.DEBUG 44 | VIZ_OUTPUT_DIR = 'viz/' 45 | FRAMES_FILENAME = 'data/verbphysics/action-frames/action-frames.csv' 46 | FRAMES_TRAIN_5_DIR = 'data/verbphysics/action-frames/train-5/' 47 | FRAMES_TRAIN_20_DIR = 'data/verbphysics/action-frames/train-20/' 48 | 49 | # Setting configurations follow. 50 | 51 | playing = { 52 | # Set your desired config here. Default values are defined in settings.py 53 | # in Settings._get_default_map(). 54 | } 55 | 56 | # Archival configurations. 
57 | 58 | model_a = { 59 | Settings.Eval: [Settings.EVAL_DEV, Settings.EVAL_TEST], 60 | Settings.GloveVerbSimThresh: [0.4], 61 | Settings.GloveNounSimThresh: [0.4], 62 | Settings.SelPrefPMICutoff: [5.0], 63 | Settings.IncludeSelPrefFactors: [True], 64 | Settings.IncludeXgraph: [False], 65 | Settings.IncludeVerbSimFactors: [True], 66 | Settings.IncludeNounSimFactors: [True], 67 | Settings.IncludeInfWithinverbSimframeFactors: [False], 68 | Settings.ObjpairSplit: [5], 69 | Settings.FrameSplit: [5], 70 | } 71 | 72 | model_b_frames = { 73 | Settings.Eval: [Settings.EVAL_DEV, Settings.EVAL_TEST], 74 | Settings.GloveVerbSimThresh: [0.4], 75 | Settings.GloveNounSimThresh: [0.4], 76 | Settings.SelPrefPMICutoff: [4.0], 77 | Settings.IncludeSelPrefFactors: [True], 78 | Settings.IncludeXgraph: [False], 79 | Settings.IncludeVerbSimFactors: [False], 80 | Settings.IncludeNounSimFactors: [True], 81 | Settings.IncludeInfWithinverbSimframeFactors: [True], 82 | Settings.ObjpairSplit: [20], 83 | Settings.FrameSplit: [5], 84 | } 85 | 86 | model_b_objpairs = { 87 | Settings.Eval: [Settings.EVAL_DEV, Settings.EVAL_TEST], 88 | Settings.GloveVerbSimThresh: [0.5], 89 | Settings.GloveNounSimThresh: [0.45], 90 | Settings.SelPrefPMICutoff: [4.0], 91 | Settings.IncludeSelPrefFactors: [True], 92 | Settings.IncludeXgraph: [True], 93 | Settings.IncludeVerbSimFactors: [True], 94 | Settings.IncludeNounSimFactors: [True], 95 | Settings.IncludeInfWithinverbSimframeFactors: [True], 96 | Settings.ObjpairSplit: [5], 97 | Settings.FrameSplit: [20], 98 | } 99 | 100 | 101 | # FUNCTIONS 102 | # ----------------------------------------------------------------------------- 103 | 104 | def _setup_logging(backup=False): 105 | util.ensure_dir('log/') 106 | 107 | # Also log to backup file with date. 108 | if backup: 109 | fh = logging.FileHandler('log/' + time.strftime('%y-%m-%d_%H-%M-%S') + 110 | '.log') 111 | fh.setLevel(logging.DEBUG) 112 | f_formatter = logging.Formatter( 113 | fmt='%(asctime)s %(name)-16s %(levelname)-8s %(message)s', 114 | datefmt='%H:%M:%S' 115 | ) 116 | fh.setFormatter(f_formatter) 117 | logging.getLogger('').addHandler(fh) 118 | 119 | # Also log to console. 120 | console = logging.StreamHandler() 121 | console.setLevel(CONSOLE_LOG_LEVEL) 122 | c_formatter = logging.Formatter( 123 | fmt='%(asctime)s %(name)-16s %(levelname)-8s %(message)s', 124 | datefmt='%H:%M:%S' 125 | ) 126 | console.setFormatter(c_formatter) 127 | logging.getLogger('').addHandler(console) 128 | 129 | 130 | def _build_xgraph(graphs, tuples, pot): 131 | """ 132 | Makes the interconnected (across knowledge dimension) graph. 133 | 134 | Args: 135 | graphs ([AttrGraph]) 136 | tuples ([(str, str)]) Attr pairs to add cxns between frame RVs 137 | pot (np.ndarray of shape 3x3) 138 | 139 | Returns 140 | fg.Graph: the xgraph 141 | """ 142 | xgraph = fg.Graph(debug=False) 143 | base_logger.debug('Adding xgraph xfactors...') 144 | total = 0 145 | for t in tuples: 146 | attr1, attr2 = t 147 | g1 = [g for g in graphs if g.name == attr1] 148 | g2 = [g for g in graphs if g.name == attr2] 149 | 150 | # Might not have one or both of the graphs because of the current 151 | # settings. 152 | if len(g1) != 1 or len(g2) != 1: 153 | base_logger.debug( 154 | '\t skipping links between missing graphs %s and %s', attr1, 155 | attr2) 156 | continue 157 | 158 | g1 = g1[0] 159 | g2 = g2[0] 160 | 161 | # Find RVs that match across both graphs. 
Pruned RVs won't be returned
 162 |         # by get_rvs() (as they are actually deleted from the graph's underlying
 163 |         # dict), but we do want to make sure we're only linking frames.
 164 |         matches = []
 165 |         for rv_name, rv1 in g1.graph.get_rvs().iteritems():
 166 |             if rv1.meta['type'] != 'frame':
 167 |                 continue
 168 |             if g2.graph.has_rv(rv_name):
 169 |                 rv2 = g2.graph.get_rvs()[rv_name]
 170 |                 matches.append([rv1, rv2])
 171 | 
 172 |         # add factors to our linking graph
 173 |         for match in matches:
 174 |             xgraph.factor(match, 'xfactor', pot, {'type': 'xfactor'})
 175 | 
 176 |         # reporting
 177 |         base_logger.debug(
 178 |             '\t added %d links between frame RVs between %s and %s',
 179 |             len(matches), attr1, attr2)
 180 |         total += len(matches)
 181 |     base_logger.debug('Added %d xgraph xfactors in total', total)
 182 |     return xgraph
 183 | 
 184 | 
 185 | def _overall_stats(label, tuples):
 186 |     """
 187 |     Computes overall accuracy; returns in 'Settings'-friendly format.
 188 | 
 189 |     Args:
 190 |         label (str): What to call this.
 191 |         tuples ([(int, int)]): Each entry is (# correct, # total).
 192 |     Returns:
 193 |         (str, str) key, val of settings column to add
 194 |     """
 195 |     n_correct = sum(tp[0] for tp in tuples)
 196 |     n_total = sum(tp[1] for tp in tuples)
 197 |     return 'OVERALL %s acc' % (label), '%d/%d (%0.2f%%)' % (
 198 |         n_correct, n_total, (n_correct*100.0)/n_total)
 199 | 
 200 | def main(config, product, viz):
 201 |     """
 202 |     Runs the verbphysics system using combinations of configurations specified
 203 |     by config.
 204 | 
 205 |     Args:
 206 |         config (dict): The configuration dictionary to use. Keys should be
 207 |             Settings.XXX string constants; vals should be lists of values to
 208 |             try.
 209 | 
 210 |         product (bool): Whether to try all (polynomially many)
 211 |             combinations of settings specified in config (True), or whether to
 212 |             try varying along each config setting individually (linearly many)
 213 |             (False).
 214 | 
 215 |         viz (bool): Whether to dump visualization data of the built model.
 216 |     """
 217 |     # load up stuff needed
 218 |     base_logger.debug('Loading ngramdb cached data...')
 219 |     d = ng.Data()
 220 |     base_logger.debug('Loading PMI...')
 221 |     pmi = ng.PMI()
 222 |     base_logger.debug('Loading GloVe...')
 223 |     glv = glove.Glove()
 224 | 
 225 |     # Init settings.
 226 |     settings = Settings()
 227 |     if product:
 228 |         settings.trial_product(config)
 229 |     else:
 230 |         settings.trial_sequence(config)
 231 | 
 232 |     # Keep cycling through experiments.
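    # settings.next() advances to the next trial's parameter combination and
    # returns False once every combination has been tried.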
233 | base_logger.debug('Beginning experiments...') 234 | while(settings.next()): 235 | # Load data and init graphs 236 | base_logger.debug('Loading turked data...') 237 | verb_data = td.TurkedData.load( 238 | FRAMES_FILENAME, 239 | settings.get(Settings.AgreementNeeded), 240 | settings.get(Settings.GTBiggerPot), 241 | settings.get(Settings.GTSmallerPot), 242 | settings.get(Settings.GTEqPot)) 243 | eval_mode = settings.get(Settings.Eval) 244 | frame_split = settings.get(Settings.FrameSplit) 245 | if frame_split == 5: 246 | framesplitdir = FRAMES_TRAIN_5_DIR 247 | elif frame_split == 20: 248 | framesplitdir = FRAMES_TRAIN_20_DIR 249 | else: 250 | base_logger.error('Unknown frame split: %r', frame_split) 251 | sys.exit(1) 252 | graphs = [attrgraph.AttrGraph(glv, d, pmi, verb_data, a, eval_mode, 253 | framesplitdir) for a in settings.get(Settings.Attrs)] 254 | 255 | # Build attr graphs 256 | for g in graphs: 257 | g.build(settings) 258 | 259 | # Run LBP 260 | normalize = settings.get(Settings.NormalizeLBP) 261 | maxiters = settings.get(Settings.LBPMaxIters) 262 | if not settings.get(Settings.IncludeXgraph): 263 | # no connections between graphs; run each independently 264 | for g in graphs: 265 | g.run(True, normalize, maxiters, True) 266 | else: 267 | # build special graph that has connections between graphs 268 | xgraph = _build_xgraph(graphs, settings.get(Settings.XgraphTuples), 269 | settings.get(Settings.XgraphPot)) 270 | 271 | # init all the graphs 272 | xgraph.init_messages() 273 | for g in graphs: 274 | g.graph.init_messages() 275 | 276 | # Run LBP piecewise across all graphs (including xgraph) 277 | for i in range(1, maxiters + 1): 278 | base_logger.debug('Running LBP iter %d on all graphs...', i) 279 | convg = True 280 | 281 | # run for the attr graphs 282 | for g in graphs: 283 | convg &= g.run(False, normalize, 1, False) 284 | 285 | # run for the xgraph 286 | xconvg, _ = xgraph.lbp(False, normalize, 1, False) 287 | convg &= xconvg 288 | 289 | # check convergence 290 | if convg: 291 | base_logger.debug('All graphs converged! 
Stopping LBP.')
 292 |                     break
 293 | 
 294 |         # Decide what to eval (5 splits only)
 295 |         objpair_split = settings.get(Settings.ObjpairSplit)
 296 |         eval_frames = frame_split == 5
 297 |         eval_objpairs = objpair_split == 5
 298 | 
 299 |         # Eval and pre-viz
 300 |         verb_res_list, np_res_list = [], []
 301 |         for g in graphs:
 302 |             verb_res, np_res = g.eval(settings, eval_frames, eval_objpairs,
 303 |                                       True)
 304 |             verb_res_list.append(verb_res)
 305 |             np_res_list.append(np_res)
 306 |             g.save_marginals()
 307 | 
 308 |         # Compute & save overall statistics
 309 |         if eval_frames:
 310 |             settings.add_result(*_overall_stats('frame', verb_res_list))
 311 |         if eval_objpairs:
 312 |             settings.add_result(*_overall_stats('np', np_res_list))
 313 | 
 314 |         # Viz
 315 |         if viz:
 316 |             for g in graphs:
 317 |                 g.viz(VIZ_OUTPUT_DIR)
 318 | 
 319 |     settings.log_results()
 320 | 
 321 | 
 322 | if __name__ == '__main__':
 323 |     # Logic we don't want to worry about throughout
 324 |     _setup_logging(backup=True)
 325 | 
 326 |     # these are the possible configs to choose from
 327 |     config_options = {
 328 |         'playing': playing,
 329 |         'model_a': model_a,
 330 |         'model_b_frames': model_b_frames,
 331 |         'model_b_objpairs': model_b_objpairs,
 332 |     }
 333 |     config_opt_str = ' | '.join(config_options.keys())
 334 | 
 335 |     # cmd line
 336 |     parser = argparse.ArgumentParser(
 337 |         description='verbphysics reference implementation')
 338 |     parser.add_argument(
 339 |         '--config', metavar='CONFIG', default='model_a',
 340 |         help='hyperparameter configuration to use; options: ' +
 341 |         config_opt_str + ' (default: model_a)')
 342 |     parser.add_argument(
 343 |         # NOTE: argparse's type=bool treats any non-empty string (even
 344 |         # 'False') as True, so parse the flag's value explicitly.
 345 |         '--poly', default=True,
 346 |         type=lambda s: s.lower() in ('true', '1', 'yes'),
 347 |         help='Whether to try '
 348 |         'polynomially-many hyperparameter config combinations (True, default) '
 349 |         'or vary config dimension sequentially (False). '
 350 |     )
 351 |     parser.add_argument(
 352 |         '--viz', action='store_true', help='Whether to dump model / data to '
 353 |         'JSON for visualization (default False).'
 354 |     )
 355 |     args = parser.parse_args()
 356 | 
 357 |     # checking
 358 |     if args.config not in config_options:
 359 |         print 'Error: "%s" unknown config. Options are %s' % (args.config,
 360 |                                                               config_opt_str)
 361 |         sys.exit(1)
 362 | 
 363 |     main(config_options[args.config], args.poly, args.viz)
 364 | 
--------------------------------------------------------------------------------
/docs/factorgraph-viz.js:
--------------------------------------------------------------------------------
 1 | //
 2 | // factorgraph-viz
 3 | //
 4 | // Visualizing factor graphs using d3-force.
 5 | //
 6 | // author: mbforbes
 7 | //
 8 | //
 9 | // factorgraph-viz
 10 | //
 11 | // Visualizing factor graphs using d3-force.
 12 | //
 13 | // author: mbforbes
 14 | //
 15 | /**
 16 |  * nodetype returns a function that will take FGNodes as arguments and return
 17 |  * whether they match the desired type.
 18 |  * @param desired
 19 |  */
 20 | function nodetype(desired) {
 21 |     return function (node) {
 22 |         return node.type === desired;
 23 |     };
 24 | }
 25 | /**
 26 |  * nodesubtype returns a function that will take FGNodes as arguments and return
 27 |  * whether they match the desired subtype.
 28 |  * @param desired
 29 |  */
 30 | function nodesubtype(desired) {
 31 |     return function (node) {
 32 |         // TODO: do we want to check the node's focus?
 33 |         // let focus = node.focus || false;
 34 |         let focus = false;
 35 |         return (!focus) && node.subtype === desired;
 36 |     };
 37 | }
 38 | /**
 39 |  * nodefocus returns whether a node is the node to focus on visually.
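 * (A node is the focus iff its optional `focus` field is truthy.)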
 * @param node
 41 |  */
 42 | function nodefocus(node) {
 43 |     return node.focus || false;
 44 | }
 45 | /**
 46 |  * textclass returns the class that should be applied to the text surrounding
 47 |  * the provided node.
 48 |  * @param node
 49 |  */
 50 | function textclass(node) {
 51 |     return node.type === 'rv' ? 'rvtext' : 'factext';
 52 | }
 53 | /**
 54 |  * nodename determines the text that is rendered next to a node.
 55 |  * @param node
 56 |  */
 57 | function nodename(node) {
 58 |     if (node.type === 'fac') {
 59 |         // maybe add extra info (e.g. sel pref fac is reversed)
 60 |         let specific = '';
 61 |         if (node.specific != null) {
 62 |             specific = ' [' + node.specific + ']';
 63 |         }
 64 |         return node.subtype + specific;
 65 |     }
 66 |     else {
 67 |         // rv
 68 |         return node.id;
 69 |     }
 70 | }
 71 | //
 72 | // factorgraph-viz
 73 | //
 74 | // Visualizing factor graphs using d3-force.
 75 | //
 76 | // author: mbforbes
 77 | //
 78 | //
 79 | // util.ts has a few helper functions, mostly regarding colorizing.
 80 | //
 81 | ///
 82 | function argmax(arr) {
 83 |     if (arr.length < 1) {
 84 |         return -1;
 85 |     }
 86 |     let max_val = arr[0], max_idx = 0;
 87 |     for (let i = 1; i < arr.length; i++) {
 88 |         if (arr[i] > max_val) {
 89 |             max_val = arr[i];
 90 |             max_idx = i;
 91 |         }
 92 |     }
 93 |     return max_idx;
 94 | }
 95 | function color(none, unsureColor, unsureCutoff, values, d) {
 96 |     if (d.weights == null) {
 97 |         return d3.color(none);
 98 |     }
 99 |     let max_idx = argmax(d.weights);
 100 |     let max_val = d.weights[max_idx];
 101 |     // clamp unsure ones to final value (hopefully something like grey)
 102 |     if (max_val < unsureCutoff) {
 103 |         return d3.color(unsureColor);
 104 |     }
 105 |     return d3.color(values[max_idx]);
 106 | }
 107 | //
 108 | // factorgraph-viz
 109 | //
 110 | // Visualizing factor graphs using d3-force.
 111 | //
 112 | // author: mbforbes
 113 | //
 114 | //
 115 | // graph.ts defines the monster build(...) function for constructing the factor
 116 | // graph. It's full of closures as an excuse for accessing what are basically
 117 | // globals. I blame d3.
 118 | //
 119 | ///
 120 | ///
 121 | ///
 122 | function appendText(svg) {
 123 |     let count = 1;
 124 |     return function (label, d) {
 125 |         if (d) {
 126 |             svg.append('g').append('text')
 127 |                 .attr('transform', 'translate(20,' + count * 20 + ')')
 128 |                 .text(label + ': ' + d);
 129 |             count += 1;
 130 |         }
 131 |     };
 132 | }
 133 | /**
 134 |  *
 135 |  * build is the central function of this codebase. It parses the factor graph
 136 |  * data and constructs it.
 137 |  *
 138 |  * Note: the nodes here are technically FGNodes, but the horrendous type
 139 |  * massaging needed to make this work with d3's type hairiness is not worth the
 140 |  * effort.
 141 |  * @param config
 142 |  * @param data
 143 |  */
 144 | function build(config, data) {
 145 |     let svg = d3.select("svg"), width = +svg.attr("width"), height = +svg.attr("height");
 146 |     // Debug logging. Can be nice as Chrome's console lets you interactively
 147 |     // explore the objects you're getting.
 148 |     console.log('Factor graph data:');
 149 |     console.log(data);
 150 |     function isolate(force, filter) {
 151 |         let initialize = force.initialize;
 152 |         force.initialize = function () { initialize.call(force, data.nodes.filter(filter)); };
 153 |         return force;
 154 |     }
 155 |     // TODO: We can actually extract most of this information. Stats should only
 156 |     // be used to provide additional info that can't be extracted from the graph
 157 |     // structure.
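    // If present, data.stats is assumed to carry the four fields read below;
    // e.g. (hypothetical values):
    //   { n_rvs: 120, n_facs: 85, focus: 'took_d', correct: '3/4' }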
 158 |     let appender = appendText(svg);
 159 |     if (data.stats) {
 160 |         appender('random variables', data.stats.n_rvs);
 161 |         appender('factors', data.stats.n_facs);
 162 |         appender('focus', data.stats.focus);
 163 |         appender('correct', data.stats.correct);
 164 |     }
 165 |     let leftScale = config.position.leftScale;
 166 |     let rightScale = config.position.rightScale;
 167 |     let centerScale = config.position.centerScale;
 168 |     let sim = d3.forceSimulation(data.nodes)
 169 |         .force('charge', d3.forceManyBody().strength(-500))
 170 |         .force('link', d3.forceLink(data.links).id(function (d) { return d.id; }))
 171 |         .force('center', isolate(d3.forceCenter(width * centerScale, height / 2), nodefocus))
 172 |         .force('left', isolate(d3.forceX(width * leftScale).strength(config.position.leftStrength), nodesubtype(config.position.leftSubtype)))
 173 |         .force('right', isolate(d3.forceX(width * rightScale).strength(config.position.rightStrength), nodesubtype(config.position.rightSubtype)))
 174 |         .force('up', isolate(d3.forceY(config.position.upScale * height).strength(config.position.upStrength), nodesubtype(config.position.upSubtype)))
 175 |         .force('down', isolate(d3.forceY(config.position.downScale * height).strength(config.position.downStrength), nodesubtype(config.position.downSubtype)))
 176 |         .force('middle', d3.forceY(height / 2).strength(config.position.middleStrength))
 177 |         .on('tick', ticked);
 178 |     // use color config we've received to partially bind coloring function
 179 |     let colorize = color.bind(null, config.color.none, config.color.unsureColor, config.color.unsureCutoff, config.color.values);
 180 |     // new for svg --- create the objects directly; then ticked just modifies
 181 |     // their positions rather than drawing them.
 182 |     let link = svg.append("g")
 183 |         .attr("class", "links")
 184 |         .selectAll("line")
 185 |         .data(data.links)
 186 |         .enter().append("line")
 187 |         .attr("stroke", colorize);
 188 |     let text = svg.append('g')
 189 |         .selectAll('text')
 190 |         .data(data.nodes)
 191 |         .enter().append('text')
 192 |         .attr('class', textclass)
 193 |         .text(nodename);
 194 |     let node = svg.append("g")
 195 |         .attr("class", "nodes")
 196 |         .selectAll("circle")
 197 |         .data(data.nodes.filter(nodetype('rv')))
 198 |         .enter().append("circle")
 199 |         .attr("r", config.size.rv)
 200 |         .attr("fill", colorize)
 201 |         .call(d3.drag()
 202 |             .on("start", dragstarted)
 203 |             .on("drag", dragged)
 204 |             .on("end", dragended));
 205 |     let fac = svg.append("g")
 206 |         .attr("class", "facs")
 207 |         .selectAll("rect")
 208 |         .data(data.nodes.filter(nodetype('fac')))
 209 |         .enter().append("rect")
 210 |         .attr("fill", colorize)
 211 |         .attr("width", config.size.factor)
 212 |         .attr("height", config.size.factor)
 213 |         .call(d3.drag()
 214 |             .on("start", dragstarted)
 215 |             .on("drag", dragged)
 216 |             .on("end", dragended));
 217 |     // Assumes RVs and factor are roughly the same size.
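    // ('bigger' is used below in ticked() to offset each label so it clears
    // whichever shape, circle or square, is drawn at the node's position.)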
218 | let bigger = Math.max(config.size.rv, config.size.factor); 219 | function ticked() { 220 | link 221 | .attr("x1", function (d) { return d.source.x; }) 222 | .attr("y1", function (d) { return d.source.y; }) 223 | .attr("x2", function (d) { return d.target.x; }) 224 | .attr("y2", function (d) { return d.target.y; }); 225 | node 226 | .attr("cx", function (d) { return d.x; }) 227 | .attr("cy", function (d) { return d.y; }); 228 | fac 229 | .attr("x", function (d) { return d.x - config.size.factor / 2; }) 230 | .attr("y", function (d) { return d.y - config.size.factor / 2; }); 231 | text 232 | .attr("transform", function (d) { 233 | return "translate(" + (d.x + bigger) + "," + (d.y + 10) + ")"; 234 | }); 235 | } 236 | // The following functions allow for dragging interactivity. They're here 237 | // because they require access to variables defined in this function. (Well, 238 | // dragged() might not, but it fits with the others.) 239 | function dragsubject() { 240 | return sim.find(d3.event.x, d3.event.y); 241 | } 242 | function dragstarted() { 243 | if (!d3.event.active) { 244 | sim.alphaTarget(0.3).restart(); 245 | } 246 | d3.event.subject.fx = d3.event.subject.x; 247 | d3.event.subject.fy = d3.event.subject.y; 248 | } 249 | function dragged() { 250 | d3.event.subject.fx = d3.event.x; 251 | d3.event.subject.fy = d3.event.y; 252 | } 253 | function dragended() { 254 | if (!d3.event.active) { 255 | sim.alphaTarget(0); 256 | } 257 | d3.event.subject.fx = null; 258 | d3.event.subject.fy = null; 259 | } 260 | } 261 | ; 262 | // 263 | // factorgraph-viz 264 | // 265 | // Visualizing factor graphs using d3-force. 266 | // 267 | // author: mbforbes 268 | // 269 | // 270 | // main.ts is where the execution begins. 271 | // 272 | /// 273 | /// 274 | // Constants 275 | let FG_NAME_ELEMENT_ID = 'fg-title'; 276 | let SVG_ELEMENT_ID = 'fg-svg'; 277 | let USER_INPUT_ID = 'userInput'; 278 | let SUGGESTIONS_ELEMENT_ID = 'suggestions'; 279 | let SUGGESTION_NOTICE_ELEMENT_ID = 'suggestionNotice'; 280 | let AUTOCOMPLETE_LIMIT_DEFAULT = 50; 281 | let CONFIG_FILE = 'data/config/default.json'; 282 | // Globals (sorry). 283 | let cacheConfig; 284 | let cacheFactorgraphFns = []; 285 | /** 286 | * Extracts general config and list of factorgraph file names. Calls preload. 287 | * @param config 288 | */ 289 | function prepreload(config) { 290 | cacheConfig = config; 291 | d3.json(config.data_filenames, preload); 292 | } 293 | /** 294 | * Saves the list of factor graph file names. 295 | * @param factorgraphFns 296 | */ 297 | function preload(factorgraphFns) { 298 | cacheFactorgraphFns = factorgraphFns; 299 | maybeLoad(cacheConfig.startup_filename); 300 | } 301 | /** 302 | * Helper to clear all children of a DOM node. 303 | * @param el 304 | */ 305 | function clearChildren(el) { 306 | while (el.firstChild) { 307 | el.removeChild(el.firstChild); 308 | } 309 | } 310 | /** 311 | * Removes everything from within the svg. 312 | */ 313 | function destroy() { 314 | clearChildren(document.getElementById(SVG_ELEMENT_ID)); 315 | } 316 | /** 317 | * Loads factor graph found in `fn`. 318 | * @param fn 319 | */ 320 | function load(fn) { 321 | destroy(); 322 | d3.json(fn, build.bind(null, cacheConfig)); 323 | } 324 | /** 325 | * Loads factor graph found in `fn` if it's in our list of valid factor graph 326 | * names. 
327 | * @param name 328 | */ 329 | function maybeLoad(name) { 330 | if (cacheFactorgraphFns.indexOf(name) != -1) { 331 | let prefix = cacheConfig.display_prefix || ''; 332 | document.getElementById(FG_NAME_ELEMENT_ID).innerText = prefix + name; 333 | load(cacheConfig.data_dir + name + '.json'); 334 | } 335 | } 336 | /** 337 | * Called every time the user text box changes its content. 338 | */ 339 | function userTypes() { 340 | let inp = document.getElementById(USER_INPUT_ID).value; 341 | // Prefix filter. Don't show anything with blank input 342 | let opts = []; 343 | if (inp.length > 0) { 344 | opts = cacheFactorgraphFns.filter(fn => fn.startsWith(inp)); 345 | } 346 | // Clear any existing suggestions. 347 | let sug = document.getElementById(SUGGESTIONS_ELEMENT_ID); 348 | clearChildren(sug); 349 | // Display suggestions notice only if we have at least 1 suggestion. 350 | let sugNotice = document.getElementById(SUGGESTION_NOTICE_ELEMENT_ID); 351 | sugNotice.style.visibility = opts.length > 0 ? 'visible' : 'hidden'; 352 | // Add suggestions. 353 | let autocomplete_limit = cacheConfig.autocomplete_limit || AUTOCOMPLETE_LIMIT_DEFAULT; 354 | for (let opt of opts.slice(0, autocomplete_limit)) { 355 | let el = document.createElement('button'); 356 | el.className = 'suggestion'; 357 | el.innerText = opt; 358 | el.setAttribute('onclick', 'maybeLoad("' + opt + '");'); 359 | sug.appendChild(el); 360 | } 361 | // Display note if they were truncated. 362 | if (opts.length > autocomplete_limit) { 363 | let el = document.createElement('p'); 364 | el.className = 'limited'; 365 | el.innerText = '(only first ' + autocomplete_limit + ' of ' + 366 | opts.length + ' shown)'; 367 | sug.appendChild(el); 368 | } 369 | } 370 | /** 371 | * Called when the user submits the text box (presses enter or clicks button). 372 | * Always returns false so we don't do a post. 373 | */ 374 | function userSubmits() { 375 | maybeLoad(document.getElementById(USER_INPUT_ID).value); 376 | return false; 377 | } 378 | // execution starts here 379 | d3.json(CONFIG_FILE, prepreload); 380 | -------------------------------------------------------------------------------- /src/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Settings is for running experiments with different parameters. Supports 3 | stuff like auto grid search and logging (yes, logging!). 4 | 5 | TODO: 6 | - [ ] sanity check passed experiments to be of type 'list'. If passing a 7 | single setting that happens to be iterable it will happily iterate 8 | through, e.g., all characters of a string. 9 | 10 | author: mbforbes 11 | """ 12 | 13 | # IMPORTS 14 | # ----------------------------------------------------------------------------- 15 | 16 | # builtins 17 | import code # code.interact(local=dict(globals(), **locals())) 18 | from itertools import product 19 | import logging 20 | 21 | # 3rd party 22 | import numpy as np 23 | from tabulate import tabulate 24 | 25 | 26 | # TOP-LEVEL FUNCTIONS 27 | # ----------------------------------------------------------------------------- 28 | 29 | def cell_massage(val): 30 | """ 31 | Preprocessing values to ensure that they can fit well in the cell of a 32 | printed table. 33 | 34 | Args: 35 | val 36 | 37 | Returns: 38 | val (or something) 39 | """ 40 | # tabulate appears to sometimes work for bools and sometimes not. So I'm 41 | # doing this so that it always works. 
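    # E.g. (hypothetical): cell_massage(True) -> 'True', and
    # cell_massage(np.array([0.7, 0.2, 0.1])) -> '0.7, 0.2, 0.1'.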
42 | if type(val) is bool: 43 | return 'True' if val else 'False' 44 | # tabulate TOTALLY doesn't handle numpy arrays as cell entries. 45 | if type(val) is np.ndarray: 46 | return ', '.join([str(row) for row in val]) 47 | # default 48 | return val 49 | 50 | 51 | # CLASSES 52 | # ----------------------------------------------------------------------------- 53 | 54 | class Settings(object): 55 | """ 56 | Class for trying all (exponentially many) combinations of all parameter 57 | settings. Must call next() before each trial run. 58 | 59 | New features: 60 | 61 | - [x] np.ndarray aligned printing 62 | 63 | - [x] Print settings that aren't changing at the top. If they're default, 64 | note them as so. 65 | 66 | Each iteration, note only the thing that is changing. 67 | 68 | Integrate with results. Output in a table format with the stuff that 69 | is changing. 70 | 71 | Example: 72 | 73 | Settings that aren't changing: 74 | 75 | foo: 0.5 (default) 76 | barbar: 0.7 (default) 77 | baz: 0.9 78 | 79 | (.. experiments run here ...) 80 | 81 | la -> | 0.5 | 0.7 | 0.9 82 | ------+-------+-------+------ 83 | | 98% | 30% | 40% 84 | 85 | 2D for 2 varied. TODO: For > 2, multiple tables? 86 | 87 | TODO: Use pandas for this? 88 | """ 89 | # Class vars as constants for keys 90 | 91 | # Used with iterators to tell when to stop. 92 | NothingLeft = object() 93 | 94 | Eval = 'eval' 95 | GloveVerbSimThresh = 'glove-verb-sim-thresh' 96 | GloveNounSimThresh = 'glove-noun-sim-thresh' 97 | Attrs = 'attrs' 98 | VerbSimPot = 'verb-sim-pot' 99 | NounEqPot = 'noun-eq-pot' 100 | NounSimPot = 'noun-sim-pot' 101 | NounSimRevPot = 'noun-sim-rev-pot' 102 | MaxNounsPerFrame = 'max-nouns-per-frame' 103 | FilterAbstract = 'filter-abstract' 104 | GTBiggerPot = 'gt-bigger-pot' 105 | GTSmallerPot = 'gt-smaller-pot' 106 | GTEqPot = 'gt-eq-pot' 107 | AgreementNeeded = 'agreement-needed' 108 | SelPrefMethod = 'sel-pref-method' 109 | SelPrefFreqCutoff = 'sel-pref-freq-cutoff' 110 | SelPrefPMICutoff = 'sel-pref-pmi-cutoff' 111 | SelPrefPot = 'sel-pref-pot' 112 | NormalizeLBP = 'normalize-lbp' 113 | LBPMaxIters = 'lbp-max-iters' 114 | IncludeVerbSimFactors = 'include-verb-sim-factors' 115 | IncludeNounSimFactors = 'include-noun-sim-factors' 116 | IncludeSelPrefFactors = 'include-sel-pref-factors' 117 | IncludeInfWithinverbSimframeFactors = 'include-inf-withinverb-simframe-factors' 118 | WithinverbSimframePot = 'withinverb-simframe-pot' 119 | IncludeXgraph = 'include-xgraph' 120 | XgraphTuples = 'xgraph-tuples' 121 | XgraphPot = 'xgraph-pot' 122 | MaxSeeds = 'max-seeds' 123 | RawNounsFilename = 'raw-nouns-filename' 124 | EvalNounsFilename = 'eval-nouns-filename' 125 | Lemmatize = 'lemmatize' 126 | SelPrefMinFreqForPMI = 'sel-pref-min-freq-for-pmi' 127 | IncludeNgramDBNouns = 'include-ngramdb-nouns' 128 | IncludeGoldNounpairs = 'include-gold-nounpairs' 129 | GoldNounpairAgreementNeeded = 'gold-nounpair-agreement-needed' 130 | GoldNounpairGreaterPot = 'gold-nounpair-greater-pot' 131 | GoldNounpairLesserPot = 'gold-nounpair-lesser-pot' 132 | GoldNounpairEqPot = 'gold-nounpair-eq-pot' 133 | AddRemainderAsNonseeds = 'add-remainder-as-nonseeds' 134 | FrameSeedMethod = 'frame-seed-method' 135 | NounpairSeedMethod = 'nounpair-seed-method' 136 | SelPrefPotMethod = 'selpref-pot-method' 137 | SelPrefEmbFilename = 'selpref-emb-filename' 138 | ObjpairSplit = 'objpair-split' 139 | FrameSplit = 'frame-split' 140 | 141 | # Class vars in all caps as constants for vals 142 | EVAL_DEV = 'dev' 143 | EVAL_TEST = 'test' 144 | 145 | SEL_PREF_FREQ = 'freq' 146 | 
SEL_PREF_PMI = 'pmi' 147 | 148 | POTENTIAL_METHOD_HARDCODED = 'hardcoded' 149 | POTENTIAL_METHOD_TRAINED = 'trained' 150 | POTENTIAL_METHOD_BOTH = 'both' 151 | 152 | # digging into more detail here for selpref 153 | SEL_PREF_HARDCODED = 'hardcoded' 154 | SEL_PREF_MLE = 'mle' 155 | SEL_PREF_EMB = 'emb' 156 | 157 | # unary potentials 158 | POT_UNARY_MEDIUM_BIGGER = np.array([0.7, 0.2, 0.1]) 159 | POT_UNARY_MEDIUM_SMALLER = np.array([0.2, 0.7, 0.1]) 160 | POT_UNARY_MEDIUM_EQ = np.array([0.15, 0.15, 0.7]) 161 | 162 | POT_UNARY_STRONG_BIGGER = np.array([0.9, 0.07, 0.03]) 163 | POT_UNARY_STRONG_SMALLER = np.array([0.07, 0.9, 0.03]) 164 | POT_UNARY_STRONG_EQ = np.array([0.05, 0.05, 0.9]) 165 | 166 | # binary potentials 167 | POT_BINARY_MEDIUM_SIM = np.array([ 168 | [0.7, 0.2, 0.1], 169 | [0.2, 0.7, 0.1], 170 | [0.15, 0.15, 0.7], 171 | ]) 172 | POT_BINARY_MEDIUM_REV = np.array([ 173 | [0.2, 0.7, 0.1], 174 | [0.7, 0.2, 0.1], 175 | [0.15, 0.15, 0.7], 176 | ]) 177 | 178 | POT_BINARY_STRONG_SIM = np.array([ 179 | [0.9, 0.07, 0.03], 180 | [0.07, 0.9, 0.03], 181 | [0.05, 0.05, 0.9], 182 | ]) 183 | POT_BINARY_STRONG_REV = np.array([ 184 | [0.07, 0.9, 0.03], 185 | [0.9, 0.07, 0.03], 186 | [0.05, 0.05, 0.9], 187 | ]) 188 | 189 | @staticmethod 190 | def _get_default_map(): 191 | return { 192 | Settings.Eval: Settings.EVAL_DEV, 193 | Settings.Attrs: ['size', 'weight', 'verb-speed', 'hardness', 'rigidness'], 194 | Settings.MaxSeeds: -1, # -1 means no limit 195 | Settings.GloveVerbSimThresh: 0.5, 196 | Settings.GloveNounSimThresh: 0.45, 197 | Settings.VerbSimPot: Settings.POT_BINARY_MEDIUM_SIM, 198 | Settings.NounEqPot: Settings.POT_UNARY_MEDIUM_EQ, 199 | Settings.NounSimPot: Settings.POT_BINARY_MEDIUM_SIM, 200 | Settings.NounSimRevPot: Settings.POT_BINARY_MEDIUM_REV, 201 | Settings.MaxNounsPerFrame: 1, 202 | Settings.FilterAbstract: True, 203 | Settings.GTBiggerPot: Settings.POT_UNARY_MEDIUM_BIGGER, 204 | Settings.GTSmallerPot: Settings.POT_UNARY_MEDIUM_SMALLER, 205 | Settings.GTEqPot: Settings.POT_UNARY_MEDIUM_EQ, 206 | Settings.AgreementNeeded: 2, 207 | Settings.SelPrefFreqCutoff: 1000, 208 | Settings.SelPrefMinFreqForPMI: 1, 209 | Settings.SelPrefPMICutoff: 4.0, 210 | Settings.SelPrefMethod: Settings.SEL_PREF_PMI, 211 | Settings.SelPrefPot: Settings.POT_BINARY_MEDIUM_SIM, 212 | Settings.NormalizeLBP: True, 213 | Settings.LBPMaxIters: 20, 214 | Settings.IncludeSelPrefFactors: True, 215 | Settings.IncludeXgraph: True, 216 | Settings.IncludeVerbSimFactors: True, 217 | Settings.IncludeNounSimFactors: True, 218 | Settings.IncludeInfWithinverbSimframeFactors: True, 219 | Settings.WithinverbSimframePot: Settings.POT_BINARY_MEDIUM_SIM, 220 | Settings.XgraphTuples: [ 221 | ('size', 'weight'), 222 | ('size', 'hardness'), 223 | ('weight', 'hardness'), 224 | ], 225 | Settings.XgraphPot: Settings.POT_BINARY_MEDIUM_SIM, 226 | Settings.RawNounsFilename: '', 227 | Settings.EvalNounsFilename: '', 228 | Settings.Lemmatize: True, 229 | Settings.IncludeNgramDBNouns: False, 230 | Settings.IncludeGoldNounpairs: True, 231 | Settings.GoldNounpairAgreementNeeded: 2, 232 | Settings.GoldNounpairGreaterPot: Settings.POT_UNARY_MEDIUM_BIGGER, 233 | Settings.GoldNounpairLesserPot: Settings.POT_UNARY_MEDIUM_SMALLER, 234 | Settings.GoldNounpairEqPot: Settings.POT_UNARY_MEDIUM_EQ, 235 | Settings.AddRemainderAsNonseeds: True, 236 | Settings.FrameSeedMethod: Settings.POTENTIAL_METHOD_BOTH, 237 | Settings.NounpairSeedMethod: Settings.POTENTIAL_METHOD_BOTH, 238 | Settings.SelPrefPotMethod: Settings.SEL_PREF_HARDCODED, 239 | 
Settings.SelPrefEmbFilename: '', 240 | Settings.ObjpairSplit: 20, 241 | Settings.FrameSplit: 5, 242 | } 243 | 244 | def __init__(self, logger=None): 245 | """ 246 | Sets dict with default settings. 247 | 248 | Settings to do: 249 | - [x] constants above 250 | - [x] number of nounsp 251 | - [x] Potentials (bigger, smaller, eq) 252 | - [x] Agreement needed (x/3) 253 | - [x] Verb sim fac pots 254 | - [x] Noun sim fac pots 255 | - [x] Sel pref pots 256 | - [x] Sel pref cutoff 257 | - [x] whether to normalize in lbp 258 | - [x] max n iterations to run lbp for 259 | - [x] which factors to add 260 | - [x] whether to filter abstract nouns 261 | - [x] check out data.py settings 262 | - [x] check rest of this file 263 | """ 264 | # Some admin 265 | if logger is None: 266 | logger = logging.getLogger(__name__) 267 | self.logger = logger 268 | 269 | # Default values 270 | self._params = Settings._get_default_map() 271 | self.param_keys = [] 272 | self.param_iterator = None 273 | 274 | def get(self, key): 275 | return self._params[key] 276 | 277 | def _setup_trial(self, trial_keys): 278 | """ 279 | Tracks which configs vary (are "trial" keys). 280 | 281 | Args: 282 | trial_keys ([str]) 283 | """ 284 | self.default_keys = set(self._get_default_map().keys()) - set(trial_keys) 285 | self.trial_keys = trial_keys 286 | self.trial_num = 0 287 | self.trial_log = {} 288 | self.trial_results = {} 289 | self.trial_results_all_keys = [] 290 | 291 | def trial_sequence(self, params): 292 | """ 293 | Sets up a trial to try the specified ranges of parameter values in 294 | sequence (holding all other parameters to their defaults and varying 295 | only one at a time). 296 | 297 | Args: 298 | params ({Settings.KEY: [list of values to try]}) 299 | """ 300 | self._setup_trial(params.keys()) 301 | 302 | # This implementation is kind of gross because it's bolted onto how the 303 | # trial_product was designed. We really want to iterate over both keys 304 | # and values and just set what we want. But I'm too lazy to learn about 305 | # how iterators work in python. So we just use all the keys. 306 | dm = self._get_default_map() 307 | keys = dm.keys() 308 | vals = [dm[k] for k in keys] 309 | trials = [] 310 | for k, v in params.iteritems(): 311 | kidx = keys.index(k) 312 | for val in v: 313 | trial = vals[:] 314 | trial[kidx] = val 315 | trials += [tuple(trial)] 316 | self.param_keys = keys 317 | self.param_iterator = iter(trials) 318 | 319 | def trial_product(self, params): 320 | """ 321 | Sets up a trial to try the product (all exponentially many 322 | combinations) of the specified ranges of parameter values. 323 | 324 | Args: params ({Settings.KEY: [list of values to try]}) 325 | """ 326 | self._setup_trial(params.keys()) 327 | 328 | param_keys = [] 329 | param_vals = [] 330 | for k,v in params.iteritems(): 331 | param_keys += [k] 332 | param_vals += [v] 333 | 334 | # self.current_indices = [-1 for _ in range(len(param_keys))] 335 | self.param_keys = param_keys 336 | self.param_iterator = product(*param_vals) 337 | 338 | def next(self): 339 | """ 340 | Move on to the next parameter setting combination. 
341 | 342 | Returns: 343 | bool Whether there's anything left 344 | """ 345 | next_params = next(self.param_iterator, Settings.NothingLeft) 346 | if next_params is Settings.NothingLeft: 347 | return False 348 | assert len(next_params) == len(self.param_keys) 349 | 350 | self.trial_num += 1 351 | self.trial_log[self.trial_num] = {} 352 | self.trial_results[self.trial_num] = {} 353 | for i, k in enumerate(self.param_keys): 354 | self._params[k] = next_params[i] 355 | self.trial_log[self.trial_num][k] = self._params[k] 356 | return True 357 | 358 | def add_result(self, key, val): 359 | """ 360 | Adds result in form of key: val *to currently running trial*. 361 | 362 | Args: 363 | key (any hashable) 364 | val (any) 365 | """ 366 | if key not in self.trial_results_all_keys: 367 | self.trial_results_all_keys.append(key) 368 | self.trial_results[self.trial_num][key] = val 369 | 370 | def log_results(self): 371 | """ 372 | Logs results. Call after trials have finished. 373 | 374 | First logs the config that didn't change. 375 | 376 | Then logs a table of the experiments run and any results that were 377 | added. 378 | """ 379 | self.logger.info('Static config (defaults):') 380 | full_dm = self._get_default_map() 381 | pure_dm = {k: cell_massage(v) for k,v in full_dm.iteritems() if k in self.default_keys} 382 | list_pure_dm = [list(item) for item in pure_dm.iteritems()] 383 | for line in tabulate(list_pure_dm, tablefmt="fancy_grid").split('\n'): 384 | self.logger.info(line) 385 | 386 | self.logger.info('Trial configs:') 387 | rows = [] 388 | for i in sorted(self.trial_log.keys()): 389 | row = {} 390 | # settings 391 | for tk in self.trial_keys: 392 | row[tk] = cell_massage(self.trial_log[i][tk]) 393 | # ... then results 394 | for rk in self.trial_results_all_keys: 395 | val = '---' 396 | if rk in self.trial_results[i]: 397 | val = cell_massage(self.trial_results[i][rk]) 398 | row[rk] = val 399 | rows.append(row) 400 | 401 | # TODO: use ordereddict and set key order so table headers go settings 402 | # and then results. 403 | # headers = self.trial_keys + self.trial_results_all_keys 404 | for line in tabulate(rows, headers="keys", tablefmt="fancy_grid").split('\n'): 405 | self.logger.info(line) 406 | 407 | def debug_log_config(self): 408 | """ 409 | Dumps full config to debug log. 
 410 |         """
 411 |         self.logger.debug('Settings:')
 412 |         for k,v in self._params.iteritems():
 413 |             self.logger.debug('%(key)25s: %(val)s' % {'key': k, 'val': v})
 414 | 
--------------------------------------------------------------------------------
/lib/ngramdb/ngramdb/ngramdb.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import uuid
 3 | 
 4 | from myria import MyriaConnection
 5 | 
 6 | import util
 7 | from constants import *
 8 | from ngramtoken import ngrams_from_tupledict
 9 | 
 10 | 
 11 | class NgramDb(object):
 12 | 
 13 |     def __init__(self, connection_id):
 14 |         self._connection = MyriaConnection(
 15 |             hostname=REST_URL,
 16 |             port=REST_PORT,
 17 |             ssl=True)
 18 | 
 19 |         connection_id = connection_id.replace(' ', '_')
 20 | 
 21 |         if not connection_id.replace('_', '').isalpha():
 22 |             raise ValueError("connection_id must be letters only, no "
 23 |                              "numbers or punctuation")
 24 | 
 25 |         self._connection_id = connection_id
 26 | 
 27 |         self.queries = []
 28 | 
 29 |     def create_query(
 30 |             self,
 31 |             words=None,
 32 |             postags=None,
 33 |             deprels=None,
 34 |             headids=None,
 35 |             ignore_position=False,
 36 |             absolute_position=False,
 37 |             limit=None,
 38 |             threshold=None,
 39 |             description=None,
 40 |             output=None):
 41 |         """Creates an NgramQuery object, which can be passed to the
 42 |         `run_query` method of this NgramDb object.
 43 | 
 44 |         Keyword arguments:
 45 |         words
 46 |             - a list of strings
 47 |         postags
 48 |             - a list of Penn-treebank style POS tags
 49 |         deprels
 50 |             - a list of Stanford-style dependency relations
 51 |         headids
 52 |             - a list of integers corresponding to the list position of
 53 |               this token's head
 54 |         ignore_position
 55 |             - do not pay attention to the ordering of the tokens in
 56 |               the lists
 57 |         absolute_position
 58 |             - the positions of tokens must match the positions in the
 59 |               ngram exactly
 60 |         limit
 61 |             - TODO: NOT COMPLETELY IMPLEMENTED
 62 |         threshold
 63 |             - only return ngrams with at least this frequency
 64 |         description
 65 |             - a plain-language description of this query
 66 |         output
 67 |             - name of the Myria table that will store this query's
 68 |               results; default is this NgramDb's connection_id
 69 | 
 70 |         Except for words, all arguments are optional.
 71 | 
 72 |         Any position in words, postags, deprels, or headids can be defined as
 73 |         `None` to denote that space as a "wildcard". For example,
 74 |         postags=["NNS", None] constrains only the first token's POS tag.
 75 | 
 76 |         Any string in words, postags, or deprels may use the "|" character to
 77 |         signify "or". For example, words=["cat|dog|mouse", "eats|runs"] will
 78 |         match "cat", "mouse", or "dog" in the first token, and "eats" or "runs"
 79 |         in the second token. """
 80 |         if not description or not isinstance(description, str):
 81 |             description = "[ ngramdb query #{} from {} ]".format(
 82 |                 len(self.queries), self._connection_id)
 83 | 
 84 |         if not output or not isinstance(output, str):
 85 |             output = self._connection_id
 86 | 
 87 |         return NgramDbQuery(
 88 |             words=words,
 89 |             postags=postags,
 90 |             deprels=deprels,
 91 |             headids=headids,
 92 |             ignore_position=ignore_position,
 93 |             absolute_position=absolute_position,
 94 |             limit=limit,
 95 |             threshold=threshold,
 96 |             description=description,
 97 |             output=output)
 98 | 
 99 |     def run_query(self, query):
 100 |         """Runs an NgramQuery and returns a list of Ngrams.
 101 | 
 102 |         See create_query for details on creating a query.
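        A hypothetical round trip (the token values are illustrative only):

            ndb = NgramDb('example_connection')
            query = ndb.create_query(words=['threw', None],
                                     postags=[None, 'NNS'],
                                     threshold=100)
            ngrams = ndb.run_query(query)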
103 | """ 104 | self.queries.append(query) 105 | 106 | q_plan = self._make_join_context_query_plan(query) 107 | 108 | relation_key = q_plan['fragments'][-1]['operators'][-1]['relationKey'] 109 | 110 | try: 111 | # answer = self._connection.execute_query(q_plan) 112 | myria_query = self._connection.submit_query(q_plan) 113 | query_id = myria_query['queryId'] 114 | 115 | full_status = self._connection.get_query_status(query_id) 116 | status = full_status['status'] 117 | 118 | while status not in ('UNKNOWN', 'SUCCESS', 'ERROR'): 119 | time.sleep(0.1) 120 | full_status = self._connection.get_query_status(query_id) 121 | status = full_status['status'] 122 | 123 | if status in ('UNKNOWN', 'ERROR'): 124 | raise RuntimeError( 125 | "Myria error: {}".format(full_status['message'])) 126 | 127 | else: 128 | raw_results = self._connection.download_dataset(relation_key) 129 | full_results = ngrams_from_tupledict(raw_results) 130 | return full_results 131 | 132 | except KeyboardInterrupt: 133 | raise KeyboardInterrupt 134 | 135 | def create_and_run_query( 136 | self, 137 | words=None, 138 | postags=None, 139 | deprels=None, 140 | headids=None, 141 | ignore_position=False, 142 | absolute_position=False, 143 | limit=None, 144 | threshold=None, 145 | description=None, 146 | output=None): 147 | """Creates and runs an NgramQuery and returns a list of Ngrams. 148 | 149 | See create_query for details on creating a query. 150 | """ 151 | 152 | query = self.create_query( 153 | words=words, 154 | postags=postags, 155 | deprels=deprels, 156 | headids=headids, 157 | ignore_position=ignore_position, 158 | absolute_position=absolute_position, 159 | limit=limit, 160 | threshold=threshold, 161 | description=description, 162 | output=output) 163 | 164 | return self.run_query(query) 165 | 166 | @classmethod 167 | def _make_join_context_query_plan(cls, query): 168 | subquery = cls._build_join_context_subquery(query) 169 | sql = ' '.join(SQL_CONTEXT_TEMPLATE.format(subquery=subquery).split()) 170 | q_plan = JSON_CONTEXT_TEMPLATE 171 | q_plan['fragments'][0]['operators'][0]['sql'] = sql 172 | q_plan['rawQuery'] = query.description 173 | # Ugh line length. 
 174 |         last_op = q_plan['fragments'][-1]['operators'][-1]
 175 |         last_op['relationKey']['relationName'] = query.output
 176 |         return q_plan
 177 | 
 178 |     @classmethod
 179 |     def _build_join_context_subquery(cls, query):
 180 |         sub_rel_str, sub_pred_str = cls._build_join_subquery_components(query)
 181 |         sub_template = "SELECT DISTINCT tt0.nid, tt0.freq FROM {} WHERE {}"
 182 | 
 183 |         if isinstance(query.threshold, int):
 184 |             sub_template = sub_template + \
 185 |                 " AND tt0.freq >= {}".format(query.threshold)
 186 | 
 187 |         sub_template = sub_template + " ORDER BY tt0.freq DESC, tt0.nid ASC"
 188 | 
 189 |         if isinstance(query.limit, int):
 190 |             sub_template = sub_template + " LIMIT {}".format(query.limit)
 191 | 
 192 |         subquery = sub_template.format(sub_rel_str, sub_pred_str)
 193 |         return subquery
 194 | 
 195 |     @classmethod
 196 |     def _build_join_subquery_components(cls, query):
 197 |         # cheating at refactoring is fun lol
 198 |         words = query.words
 199 |         postags = query.postags
 200 |         deprels = query.deprels
 201 |         headids = query.headids
 202 |         ignore_position = query.ignore_position
 203 |         absolute_position = query.absolute_position
 204 |         threshold = query.threshold
 205 |         ngram_length = query.ngram_length
 206 | 
 207 |         zipped = zip(words, postags, deprels, range(ngram_length), headids)
 208 | 
 209 |         relations = []
 210 |         predicates = []
 211 | 
 212 |         # get all appropriate pairs of tokens
 213 |         token_idx_pairs = [
 214 |             (i, j) for j in range(ngram_length) for i in range(j)]
 215 |         #if ignore_position else [(i, i+1) for i in range(ngram_length-1)]
 216 | 
 217 |         # create relations and predicates for each pair
 218 |         for i, pair in enumerate(token_idx_pairs):
 219 |             tka_idx, tkb_idx = pair
 220 |             token_pair = (zipped[tka_idx], zipped[tkb_idx])
 221 | 
 222 |             pair_id = "tt{}".format(i)
 223 | 
 224 |             if i > 0:
 225 |                 predicates.append(
 226 |                     util.make_predicate(pair_id, "nid", "tt0.nid"))
 227 | 
 228 |             tka_raw, tkb_raw = token_pair
 229 | 
 230 |             def build_token_kwargs(tka_raw, tkb_raw):
 231 |                 tka_kwargs = {}
 232 |                 tkb_kwargs = {}
 233 | 
 234 |                 tka_kwargs['word'] = tka_raw[0]
 235 |                 tkb_kwargs['word'] = tkb_raw[0]
 236 | 
 237 |                 tka_kwargs['postag'] = tka_raw[1]
 238 |                 tkb_kwargs['postag'] = tkb_raw[1]
 239 | 
 240 |                 tka_kwargs['deprel'] = tka_raw[2]
 241 |                 tkb_kwargs['deprel'] = tkb_raw[2]
 242 | 
 243 |                 if ignore_position:
 244 |                     tka_kwargs['offset'] = None
 245 |                     tkb_kwargs['offset'] = None
 246 |                 elif absolute_position:
 247 |                     tka_kwargs['offset'] = tka_raw[3]+1
 248 |                     tkb_kwargs['offset'] = tkb_raw[3]+1
 249 |                 else:
 250 |                     tka_kwargs['offset'] = None
 251 |                     tkb_kwargs['offset'] = tkb_raw[3] - tka_raw[3]
 252 | 
 253 |                 head = None
 254 |                 if tka_raw[4] is None and tkb_raw[4] is None:
 255 |                     pass
 256 |                 elif tka_raw[3] == tkb_raw[4]:  # tkb's head index points at tka
 257 |                     head = "tka"
 258 |                 elif tkb_raw[3] == tka_raw[4]:  # tka's head index points at tkb
 259 |                     head = "tkb"
 260 | 
 261 |                 return (tka_kwargs, tkb_kwargs, head)
 262 | 
 263 |             tka_kwargs, tkb_kwargs, head = build_token_kwargs(tka_raw, tkb_raw)
 264 | 
 265 |             subrelations, subpredicates = cls._build_pair_predicate(
 266 |                 pair_id, tka_kwargs, tkb_kwargs, head)
 267 | 
 268 |             relations.extend(subrelations)
 269 | 
 270 |             if ignore_position:
 271 |                 sr, sp = cls._build_pair_predicate(
 272 |                     pair_id, tkb_kwargs, tka_kwargs, head)
 273 |                 relations.extend(sr)
 274 | 
 275 |                 sp1 = "({})".format(" AND ".join(subpredicates))
 276 |                 sp2 = "({})".format(" AND ".join(sp))
 277 | 
 278 |                 predicates.append("({})".format(" OR ".join((sp1, sp2))))
 279 | 
 280 |             else:
 281 |                 subpredicate = "({})".format(" AND ".join(subpredicates))
 282 |                 predicates.append(subpredicate)
 283 | 
 284 |         # put 'em all together
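        # (Roughly: the deduplicated relations become the FROM clause and the
        # predicates the WHERE clause of the subquery template assembled in
        # _build_join_context_subquery.)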
285 | sub_rel_str = ", ".join(set(relations)) 286 | sub_pred_str = " AND ".join(set(predicates)) 287 | 288 | return (sub_rel_str, sub_pred_str) 289 | 290 | @classmethod 291 | def _build_pair_predicate( 292 | cls, this_pair, tka_kwargs, tkb_kwargs, head=None): 293 | # kwargs: word, postag, deprel, offset 294 | 295 | subrelations = [util.aliased_relation(TT_RELATION, this_pair)] 296 | subpredicates = [] 297 | 298 | i = int(this_pair[2:]) 299 | 300 | def make_word_pred(tk, word): 301 | if word is None: 302 | return None 303 | 304 | joined_tk_words = ','.join( 305 | "'{}'".format(w) for w in word.split('|')) 306 | return util.make_predicate( 307 | this_pair, 308 | "{}_surface".format(tk), 309 | "({})".format(joined_tk_words), 310 | ' IN ') 311 | 312 | subpredicates.append(make_word_pred('tka', tka_kwargs['word'])) 313 | subpredicates.append(make_word_pred('tkb', tkb_kwargs['word'])) 314 | 315 | def make_postag_pred(tk, postag): 316 | if postag is None: 317 | return None 318 | 319 | this_pos = 'pos{}_{}'.format(i, tk) 320 | subrelations.append(util.aliased_relation(POS_RELATION, this_pos)) 321 | 322 | these_postags = postag.split('|') 323 | 324 | pred1 = util.make_predicate( 325 | this_pair, 326 | "{}_posid".format(tk), 327 | "{}.posid".format(this_pos)) 328 | 329 | pred2 = util.make_predicate( 330 | this_pos, 331 | "postag", 332 | "({})".format(','.join("'{}'".format(p) for p in 333 | these_postags)), 334 | ' IN ') 335 | return " AND ".join((pred1, pred2)) 336 | 337 | subpredicates.append(make_postag_pred('tka', tka_kwargs['postag'])) 338 | subpredicates.append(make_postag_pred('tkb', tkb_kwargs['postag'])) 339 | 340 | def make_deprel_pred(tk, deprel): 341 | if deprel is None: 342 | return None 343 | 344 | this_deprel = 'deprel{}_{}'.format(i, tk) 345 | subrelations.append( 346 | util.aliased_relation(DEP_RELATION, this_deprel)) 347 | 348 | these_deprels = deprel.split('|') 349 | 350 | pred1 = util.make_predicate( 351 | this_pair, 352 | "{}_depid".format(tk), 353 | "{}.depid".format(this_deprel)) 354 | 355 | pred2 = util.make_predicate( 356 | this_deprel, 357 | "deprel", 358 | "({})".format(','.join("'{}'".format(p) for p in 359 | these_deprels)), 360 | ' IN ') 361 | 362 | return " AND ".join((pred1, pred2)) 363 | 364 | subpredicates.append(make_deprel_pred('tka', tka_kwargs['deprel'])) 365 | subpredicates.append(make_deprel_pred('tkb', tkb_kwargs['deprel'])) 366 | 367 | # if first token's offset is not none, then the position is absolute -- 368 | # (both should be set) 369 | if tka_kwargs['offset'] is not None: 370 | subpredicates.append(util.make_predicate( 371 | this_pair, "tka_position", tka_kwargs['offset'])) 372 | 373 | subpredicates.append(util.make_predicate( 374 | this_pair, "tkb_position", tkb_kwargs['offset'])) 375 | 376 | # otherwise, position is relative (but we still care) 377 | elif tkb_kwargs['offset'] is not None: 378 | subpredicates.append(util.make_predicate( 379 | this_pair, "tkb_position", 380 | "{}.tka_position".format(this_pair), 381 | '>')) 382 | 383 | if head is not None: 384 | if head == 'tka': 385 | subpredicates.append(util.make_predicate( 386 | this_pair, "tkb_headposition", 387 | "{}.{}".format(this_pair, "tka_position"))) 388 | elif head == 'tkb': 389 | subpredicates.append(util.make_predicate( 390 | this_pair, "tka_headposition", 391 | "{}.{}".format(this_pair, "tkb_position"))) 392 | 393 | return (subrelations, [s for s in subpredicates if s is not None]) 394 | 395 | 396 | class NgramDbQuery(object): 397 | 398 | def __init__( 399 | self, 400 | words=None, 401 | 
postags=None, 402 | deprels=None, 403 | headids=None, 404 | ignore_position=False, 405 | absolute_position=False, 406 | limit=None, 407 | threshold=None, 408 | description="[ ngramdb query ]", 409 | output="TEMPOUT"): 410 | 411 | # no conflicting args!! 412 | if ignore_position and absolute_position: 413 | raise ValueError("ignore_position and absolute_position cannot " 414 | "both be True") 415 | 416 | # make sure we have enough reasonable words, or the query could freeze 417 | # the db 418 | if words is None: 419 | raise ValueError("'words' keyword argument must have a list of " 420 | "at least " + str(MIN_WORD_COUNT) + " word(s)") 421 | 422 | if sum(1 for w in words if w is not None) < MIN_WORD_COUNT: 423 | raise ValueError("'words' keyword argument must have a list of " 424 | "at least " + str(MIN_WORD_COUNT) + " word(s)") 425 | 426 | if all(len(w) < MIN_WORD_LEN for w in words if w is not None): 427 | raise ValueError("'words' keyword argument must have at least 1 " 428 | "word that is " + str(MIN_WORD_LEN) + 429 | " or more letters long") 430 | 431 | # error check the rest of the arguments 432 | try: 433 | self.ngram_length, max_name = max([(len(x), y) for x, y in zip( 434 | (words, postags, deprels, headids), 435 | ("words", "postags", "deprels", "headids")) 436 | if x is not None], 437 | key=lambda x: x[0]) 438 | 439 | except TypeError as e: 440 | raise ValueError("Must provide at least one of the keyword args " 441 | "'words', 'postags', 'deprels', 'headids'") 442 | 443 | def check_arg(kw, arg): 444 | if arg is not None: 445 | if len(arg) != self.ngram_length: 446 | raise ValueError( 447 | "{} and {} must have same number of items" 448 | " (need {}, found {})".format( 449 | max_name, kw, self.ngram_length, len(arg))) 450 | 451 | else: 452 | return arg 453 | 454 | else: 455 | return [None for _ in range(self.ngram_length)] 456 | 457 | # normalize arguments 458 | self.words = [x.lower() if x is not None else x 459 | for x in check_arg('words', words)] 460 | self.postags = [x.upper() if x is not None else x 461 | for x in check_arg('postags', postags)] 462 | self.deprels = [x for x in check_arg('deprels', deprels)] 463 | self.headids = [int(x) if x is not None else x 464 | for x in check_arg('headids', headids)] 465 | 466 | # set all the other stuff 467 | self.ignore_position = ignore_position 468 | self.absolute_position = absolute_position 469 | self.limit = limit 470 | self.threshold = threshold 471 | self.description = description 472 | self.output = output 473 | 474 | def __eq__(self, other): 475 | ''' 476 | Implementing for caching with these as keys to a dictionary. 477 | 478 | Would be as simple as comparing self.__dict__.items() (as below in 479 | __str__) but there is some output info stored that we don't want to 480 | compare. 481 | ''' 482 | return (self.words == other.words and 483 | self.postags == other.postags and 484 | self.deprels == other.deprels and 485 | self.headids == other.headids and 486 | self.ignore_position == other.ignore_position and 487 | self.absolute_position == other.absolute_position and 488 | self.limit == other.limit and 489 | self.threshold == other.threshold) 490 | 491 | def __ne__(self, other): 492 | ''' 493 | Yep, this doesn't happen automatically. Thanks, python. 494 | ''' 495 | return not self == other 496 | 497 | def __hash__(self): 498 | ''' 499 | Implementing for caching with these as keys to a dictionary.
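A hypothetical usage sketch of the caching pattern this enables (example values; assumes MIN_WORD_COUNT and MIN_WORD_LEN accept a single 5-letter word):

    q1 = NgramDbQuery(words=['threw', None, None], description='run A')
    q2 = NgramDbQuery(words=['threw', None, None], description='run B')
    # description/output are excluded from __eq__ and __hash__
    assert q1 == q2 and hash(q1) == hash(q2)
    cache = {q1: 'some result'}
    assert q2 in cache  # an equal query re-finds the cached result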
500 | ''' 501 | # Can't use None in hashing because it can return inconsistent numbers 502 | # (as it just uses None's address in memory, which changes if the OS 503 | # has memory randomization turned on). 504 | words = tuple([w if w is not None else '' for w in self.words]) 505 | postags = tuple([p if p is not None else '' for p in self.postags]) 506 | deprels = tuple([d if d is not None else '' for d in self.deprels]) 507 | headids = tuple([h if h is not None else '' for h in self.headids]) 508 | limit = 0 if self.limit is None else self.limit 509 | threshold = 0 if self.threshold is None else self.threshold 510 | return hash(( 511 | words, 512 | postags, 513 | deprels, 514 | headids, 515 | self.ignore_position, 516 | self.absolute_position, 517 | limit, 518 | threshold)) 519 | 520 | def __str__(self): 521 | values = ["{}={}".format(k, repr(v)) 522 | for k, v in sorted(self.__dict__.items(), 523 | key=lambda x: type(x[1]))] 524 | return '\n'.join(values) 525 | -------------------------------------------------------------------------------- /data/verbphysics/objects/train-20/train.csv: -------------------------------------------------------------------------------- 1 | ,obj1,obj2,size-agree,size-maj,weight-agree,weight-maj,strength-agree,strength-maj,rigidness-agree,rigidness-maj,speed-agree,speed-maj 2 | 0,person,dress,1,-42,3,1,3,1,1,-42,1,-42 3 | 1,person,step,3,1,3,1,3,-1,2,-1,3,1 4 | 2,body,mouth,3,1,3,1,3,1,2,1,2,-1 5 | 3,sun,coal,3,1,3,1,2,1,1,-42,2,1 6 | 4,vessel,something,3,-42,3,-42,3,-42,3,-42,3,-42 7 | 5,place,farm,3,-42,3,-42,2,-42,3,-42,2,0 8 | 6,master,dress,3,1,3,1,3,1,3,1,3,1 9 | 7,ground,body,3,1,3,1,3,1,3,1,3,-1 10 | 8,ash,mouth,3,-1,3,-1,3,-1,2,-1,2,-1 11 | 9,gentleman,knife,3,1,3,1,2,-1,2,-1,2,1 12 | 10,train,face,3,1,3,1,3,1,3,1,3,1 13 | 11,friend,mouth,3,1,3,1,3,1,2,1,2,0 14 | 12,energy,sun,2,-42,2,-1,1,-42,2,-42,2,-42 15 | 13,father,basin,3,1,2,1,2,1,2,-1,3,1 16 | 14,bag,gate,3,-1,3,-1,3,-1,3,-1,2,-42 17 | 15,brother,book,3,1,3,1,3,1,3,-1,3,1 18 | 16,way,road,2,-42,3,-42,2,0,2,0,2,0 19 | 17,back,something,2,-42,2,-42,2,-42,2,-42,2,-42 20 | 18,lady,car,3,-1,3,-1,3,-1,3,-1,3,-1 21 | 19,dinner,daughter,3,-1,3,-1,3,-1,1,-42,3,-1 22 | 20,person,lad,3,0,3,0,3,0,3,0,3,0 23 | 21,fist,hand,2,-1,3,0,2,0,2,0,2,0 24 | 22,ground,room,1,-42,2,1,2,1,1,-42,2,-42 25 | 23,child,doorway,3,-1,2,-1,2,-1,3,-1,3,1 26 | 24,victim,face,3,1,3,1,2,1,3,0,2,0 27 | 25,rain,light,1,-42,1,-42,2,-42,1,-42,3,-1 28 | 26,horse,coal,3,1,3,1,2,1,2,-1,3,1 29 | 27,poet,door,2,-1,2,1,1,-42,2,-1,2,-42 30 | 28,brother,ball,3,1,3,1,3,1,3,-1,2,-1 31 | 29,lady,direction,3,-42,2,-42,2,-42,2,-42,2,-42 32 | 30,house,sea,3,-1,2,-1,3,-42,2,-42,2,-1 33 | 31,coach,arm,3,1,3,1,3,1,2,0,2,0 34 | 32,lady,object,3,-42,3,-42,3,-42,3,-42,3,-42 35 | 33,something,hand,2,1,2,1,2,-42,2,-42,2,-42 36 | 34,father,seal,1,-42,1,-42,1,-42,2,-42,2,-42 37 | 35,edition,place,3,-42,3,-42,3,-42,3,-42,3,-42 38 | 36,room,wife,2,1,2,1,2,1,2,1,2,-42 39 | 37,messenger,camp,2,-1,2,-1,2,-42,2,-1,2,1 40 | 38,window,floor,3,-1,3,-1,3,-1,2,0,3,0 41 | 39,place,hand,3,1,2,1,2,1,2,1,3,-1 42 | 40,door,floor,3,-1,3,-1,2,0,3,0,2,1 43 | 41,bay,boat,3,1,2,-42,1,-42,1,-42,3,-1 44 | 42,food,way,3,-42,3,-42,3,-42,3,-42,3,-42 45 | 43,hat,back,3,-1,3,-1,3,-1,3,-1,2,-1 46 | 44,someone,dinner,3,1,3,1,2,1,1,-42,3,1 47 | 45,someone,fool,2,-42,1,-42,2,-42,2,-42,2,-42 48 | 46,stone,hand,1,-42,2,1,3,1,3,1,2,-1 49 | 47,ice,head,2,-42,1,-42,2,1,2,1,2,-42 50 | 48,coach,hat,3,1,3,1,3,1,3,1,3,1 51 | 49,ear,something,3,-42,3,-42,3,-42,3,-42,3,-42 52 | 
50,someone,boy,2,-42,2,-42,2,-42,2,0,1,-42 53 | 51,stone,bed,2,-1,2,-1,2,1,3,1,1,-42 54 | 52,person,daughter,2,1,2,1,2,1,1,-42,2,1 55 | 53,person,barn,3,-1,3,-1,3,-1,3,-1,3,1 56 | 54,sun,tree,3,1,3,1,2,1,2,-42,2,1 57 | 55,door,light,2,1,3,1,3,1,2,1,2,-1 58 | 56,ball,mouth,1,-42,1,-42,1,-42,2,1,1,-42 59 | 57,child,picture,3,1,3,1,3,1,3,-1,3,1 60 | 58,brother,hand,3,1,3,1,3,1,2,0,3,0 61 | 59,back,air,2,-42,2,1,2,1,2,1,2,-42 62 | 60,gentleman,ball,2,1,2,1,2,1,2,1,1,-42 63 | 61,window,end,2,-42,2,-42,2,-42,2,-42,2,-42 64 | 62,step,road,3,-1,3,-1,3,-1,2,-1,2,1 65 | 63,result,element,2,-42,1,-42,1,-42,2,-42,2,-42 66 | 64,parent,child,3,1,3,1,3,1,2,0,2,0 67 | 65,sun,sail,3,1,3,1,2,1,2,-42,2,1 68 | 66,river,breath,2,1,3,1,3,1,2,1,3,1 69 | 67,vessel,anchor,3,1,3,1,2,1,2,0,2,1 70 | 68,friend,newspaper,3,1,3,1,3,1,3,1,3,1 71 | 69,everything,master,3,1,3,1,2,1,2,-42,2,-42 72 | 70,coast,place,2,1,2,1,2,-42,2,-42,1,-42 73 | 71,state,way,2,-42,2,-42,2,-42,2,-42,3,-42 74 | 72,anchor,mouth,2,1,2,1,2,1,2,1,3,-42 75 | 73,hair,room,3,-1,3,-1,3,-1,3,-1,1,-42 76 | 74,sea,sail,3,1,3,1,2,1,2,-1,2,1 77 | 75,temple,something,3,-42,3,-42,3,-42,3,-42,3,-42 78 | 76,system,end,3,-42,3,-42,3,-42,3,-42,3,-42 79 | 77,stone,way,2,-42,2,-42,2,-42,2,-42,2,-42 80 | 78,sun,ear,3,1,3,1,3,1,2,1,2,1 81 | 79,anything,end,2,-42,3,-42,3,-42,3,-42,2,-42 82 | 80,father,truck,3,-1,3,-1,3,-1,3,-1,3,-1 83 | 81,head,ball,2,0,3,1,2,1,1,-42,3,-1 84 | 82,hip,hand,3,1,3,1,2,1,1,-42,2,-1 85 | 83,body,direction,2,-42,2,-42,2,-42,2,-42,2,-42 86 | 84,king,camp,3,-1,3,-1,2,-1,1,-42,3,1 87 | 85,bag,way,3,-42,3,-42,3,-42,3,-42,3,-42 88 | 86,person,wife,3,0,3,0,3,0,3,0,3,0 89 | 87,hair,floor,3,-1,3,-1,3,-1,3,-1,2,1 90 | 88,ball,light,1,-42,2,1,2,1,1,-42,3,-1 91 | 89,heaven,face,2,1,2,-42,2,-42,2,-42,2,-1 92 | 90,knife,throat,1,-42,2,-1,2,1,2,1,2,-42 93 | 91,someone,light,1,-42,3,1,2,1,2,-1,2,-1 94 | 92,chair,window,2,0,2,1,3,1,2,0,3,0 95 | 93,person,fox,3,1,3,1,2,1,2,0,3,-1 96 | 94,sea,middle,2,-42,3,-42,3,-42,3,-42,3,-42 97 | 95,messenger,master,1,-42,1,-42,1,-42,2,0,1,-42 98 | 96,system,something,3,-42,3,-42,3,-42,3,-42,3,-42 99 | 97,shirt,hand,3,1,2,-1,2,-1,1,-42,1,-42 100 | 98,person,ice,3,1,3,1,3,1,2,-1,2,1 101 | 99,step,flood,2,-1,2,-42,2,-42,2,-42,2,-42 102 | 100,daughter,call,2,-42,2,1,2,-42,2,-42,3,-42 103 | 101,eye,fist,3,-1,3,-1,3,-1,2,-1,2,-42 104 | 102,house,hill,3,-1,3,-1,3,-1,2,-1,2,0 105 | 103,stream,hand,3,1,2,1,2,-42,2,-1,3,1 106 | 104,current,shore,2,1,1,-42,2,1,2,-1,3,1 107 | 105,sea,call,2,-42,2,-42,2,-42,2,-42,2,-42 108 | 106,ship,hand,3,1,3,1,3,1,3,1,1,-42 109 | 107,child,glass,2,1,2,1,1,-42,2,-1,2,1 110 | 108,way,end,3,-42,3,-42,3,-42,3,-42,3,-42 111 | 109,lady,eye,2,1,2,1,2,1,2,1,1,-42 112 | 110,house,back,1,-42,1,-42,1,-42,1,-42,2,-1 113 | 111,fist,mouth,2,0,1,-42,2,1,2,1,1,-42 114 | 112,door,wife,2,1,1,-42,1,-42,2,1,2,-1 115 | 113,bay,way,3,-42,3,-42,3,-42,3,-42,3,-42 116 | 114,object,hand,3,-42,3,-42,3,-42,3,-42,3,-42 117 | 115,flood,end,3,-42,3,-42,3,-42,3,-42,3,-42 118 | 116,eye,direction,3,-42,3,-42,3,-42,2,-42,2,-42 119 | 117,river,boat,3,1,3,1,2,1,2,-1,2,1 120 | 118,brother,coal,3,1,2,1,3,1,2,-1,3,1 121 | 119,victim,house,3,-1,3,-1,2,-1,2,-1,3,1 122 | 120,brother,clothes,2,1,3,1,3,1,3,1,2,0 123 | 121,child,purse,3,1,3,1,3,1,2,-1,3,1 124 | 122,bank,flood,3,-42,3,-42,2,-42,2,-42,2,-1 125 | 123,house,farm,3,-1,3,-1,2,-1,1,-42,3,0 126 | 124,side,current,3,-42,3,-42,3,-42,2,-42,2,-42 127 | 125,gentleman,book,3,1,3,1,3,1,3,-1,3,1 128 | 126,ground,king,3,1,3,1,3,1,3,1,3,-1 129 | 127,father,world,3,-1,3,-1,3,-1,3,-1,2,1 130 | 
128,wall,hand,3,1,3,1,3,1,3,1,3,-1 131 | 129,grass,hand,3,-1,3,-1,3,-1,2,1,3,-1 132 | 130,bank,suit,3,1,3,1,3,1,3,1,1,-42 133 | 131,patient,glass,2,1,3,1,1,-42,2,-1,2,1 134 | 132,gentleman,train,3,-1,3,-1,3,-1,3,-1,3,-1 135 | 133,meal,piece,3,1,3,1,2,-42,2,-42,2,-42 136 | 134,sun,breath,3,1,3,1,3,1,1,-42,1,-42 137 | 135,everything,child,3,1,3,1,3,1,2,1,2,1 138 | 136,hat,response,2,-42,2,1,2,-42,1,-42,3,-1 139 | 137,torrent,mountain,2,-1,2,-1,1,-42,2,-42,3,1 140 | 138,boy,farm,3,-1,2,-1,3,-1,2,-1,3,1 141 | 139,office,picture,3,1,3,1,1,-42,1,-42,3,0 142 | 140,gentleman,stream,3,-1,3,-1,1,-42,3,1,2,-1 143 | 141,house,barn,1,-42,1,-42,2,0,2,0,2,0 144 | 142,bag,everything,2,-42,2,-42,2,-42,2,-42,2,-42 145 | 143,coach,bank,2,-1,2,-1,2,-1,2,-1,2,1 146 | 144,gentleman,eye,3,1,3,1,3,1,2,-1,2,1 147 | 145,person,ship,3,-1,3,-1,3,-1,3,-1,2,-1 148 | 146,someone,eye,2,1,2,1,2,1,1,-42,1,-42 149 | 147,father,light,2,-42,3,1,2,1,3,1,2,-1 150 | 148,river,sun,3,-1,3,-1,3,-1,2,-1,2,-42 151 | 149,sun,head,3,1,3,1,3,1,2,1,2,1 152 | 150,someone,piece,2,-42,2,-42,3,-42,3,-42,3,-42 153 | 151,gentleman,room,3,-1,3,-1,3,-1,3,-1,3,1 154 | 152,lady,stream,3,-1,2,-1,2,1,2,1,3,-1 155 | 153,foot,wall,3,-1,3,-1,3,-1,3,-1,3,1 156 | 154,breath,soul,2,-42,2,0,1,-42,2,0,1,-42 157 | 155,daughter,anything,2,-42,2,-42,3,-42,2,-42,3,-42 158 | 156,back,room,2,-42,2,-42,2,-42,2,-42,2,-42 159 | 157,scene,room,2,-42,2,-42,2,-42,2,-42,2,-42 160 | 158,hair,effect,3,-42,2,-42,2,-42,3,-42,2,-42 161 | 159,king,effect,1,-42,2,-42,2,-42,2,-42,2,-42 162 | 160,car,hand,3,1,3,1,3,1,3,1,3,1 163 | 161,town,picture,3,1,3,1,3,1,2,1,2,-42 164 | 162,lady,picture,3,1,3,1,3,1,2,-1,3,1 165 | 163,window,air,2,-1,3,1,2,1,3,1,3,-1 166 | 164,piano,suit,3,1,3,1,3,1,2,1,2,-42 167 | 165,father,bag,3,1,3,1,3,1,2,1,3,1 168 | 166,exile,end,2,-42,2,-42,2,-42,2,-42,2,-42 169 | 167,house,picture,3,1,3,1,3,1,3,1,2,-1 170 | 168,office,air,2,-1,2,1,2,-42,1,-42,2,-1 171 | 169,skirt,knee,1,-42,2,-1,1,-42,2,-1,2,-1 172 | 170,body,room,3,-1,2,-1,2,-1,3,-1,3,1 173 | 171,someone,child,3,1,3,1,3,1,2,0,1,-42 174 | 172,lady,hand,2,1,2,1,2,0,1,-42,1,-42 175 | 173,person,elbow,3,1,3,1,2,1,2,0,2,0 176 | 174,river,current,3,1,2,1,2,-1,2,0,2,-1 177 | 175,head,light,2,-1,3,1,3,1,3,1,2,-1 178 | 176,fox,goose,2,1,2,1,2,1,3,0,2,1 179 | 177,person,deck,3,-1,3,-1,3,-1,3,-1,3,1 180 | 178,boy,something,3,-42,3,-42,3,-42,3,-42,3,-42 181 | 179,phone,room,3,-1,3,-1,2,-1,2,-1,1,-42 182 | 180,call,way,2,-42,2,-42,2,-42,2,-42,2,-42 183 | 181,boy,face,3,1,3,1,3,1,1,-42,2,0 184 | 182,energy,hand,2,-42,2,-1,1,-42,2,-1,2,1 185 | 183,stone,direction,3,-42,3,-42,3,-42,3,-42,3,-42 186 | 184,state,step,3,1,2,1,2,1,2,1,1,-42 187 | 185,shoulder,light,3,-42,2,1,2,-42,2,1,2,-1 188 | 186,house,mouth,3,1,2,1,3,1,3,1,3,-1 189 | 187,father,bay,3,-1,2,-1,3,-1,2,1,2,1 190 | 188,side,soul,2,-42,2,1,2,-42,3,1,3,-42 191 | 189,front,light,3,-42,2,-42,2,-42,2,-42,2,-42 192 | 190,finger,grass,2,1,3,1,3,1,2,1,3,1 193 | 191,town,newspaper,3,1,3,1,2,1,3,1,2,-1 194 | 192,body,effect,3,-42,3,-42,3,-42,3,-42,3,-42 195 | 193,side,everything,2,-1,2,-1,3,-42,3,-42,3,-42 196 | 194,hat,way,3,-42,3,-42,3,-42,3,-42,3,-42 197 | 195,world,anything,2,1,2,1,2,-42,2,-42,2,1 198 | 196,piece,knee,2,-42,2,-42,2,-42,2,-42,2,-42 199 | 197,office,wind,1,-42,2,1,2,-1,1,-42,3,-1 200 | 198,magistrate,wife,2,1,2,1,1,-42,1,-42,1,-42 201 | 199,person,fool,3,0,3,0,3,0,3,0,3,0 202 | 200,master,road,2,-1,2,-1,2,-1,2,-1,2,1 203 | 201,father,coast,2,-1,2,-1,2,-42,2,-1,2,1 204 | 202,sea,city,2,1,2,-42,2,-42,2,-1,2,-42 205 | 
203,boy,lamp,3,1,3,1,3,1,3,-1,3,1 206 | 204,coach,door,2,-42,2,-42,2,-42,2,-42,2,-42 207 | 205,anchor,middle,3,-42,3,-42,3,-42,3,-42,3,-42 208 | 206,messenger,boy,2,0,2,0,2,0,3,0,2,0 209 | 207,father,step,3,1,2,1,1,-42,2,-1,2,1 210 | 208,meal,child,3,-1,3,-1,3,-1,2,-1,2,-1 211 | 209,block,wall,3,-1,2,-1,2,-1,2,0,2,0 212 | 210,home,city,3,-1,2,-1,2,0,2,-1,2,0 213 | 211,father,wife,2,1,2,1,2,1,2,0,2,0 214 | 212,side,beach,2,-1,2,-42,3,-42,2,-42,2,-42 215 | 213,goose,hill,3,-1,3,-1,3,-1,3,-1,3,1 216 | 214,horse,room,3,-1,3,-1,2,-1,3,-1,2,1 217 | 215,air,light,2,-42,1,-42,1,-42,2,-1,1,-42 218 | 216,person,spoon,3,1,3,1,2,1,3,-1,3,1 219 | 217,newspaper,floor,2,-1,2,-1,2,-1,3,-1,2,-42 220 | 218,car,end,3,-42,3,-42,3,-42,3,-42,3,-42 221 | 219,servant,room,3,-1,3,-1,3,-1,3,-1,3,1 222 | 220,eye,car,3,-1,3,-1,3,-1,3,-1,3,-1 223 | 221,base,hand,2,1,2,1,1,-42,2,0,2,-42 224 | 222,clock,direction,3,-42,2,-42,3,-42,3,-42,3,-42 225 | 223,father,clothes,3,1,3,1,2,1,2,1,2,1 226 | 224,wall,room,1,-42,1,-42,2,1,1,-42,2,-42 227 | 225,room,piece,2,-42,2,-42,2,-42,2,-42,2,-42 228 | 226,energy,room,2,-42,2,-1,2,-42,2,-1,2,1 229 | 227,sea,light,2,1,3,1,2,1,2,-42,3,-1 230 | 228,gentleman,everything,2,-1,2,-1,2,-1,2,-42,1,-42 231 | 229,clothes,bed,2,-1,2,-1,2,-1,2,-1,2,0 232 | 230,someone,grass,3,1,3,1,3,1,3,1,3,1 233 | 231,something,end,3,-42,3,-42,3,-42,3,-42,3,-42 234 | 232,city,impatient,2,-42,2,-42,2,-42,2,-42,3,-42 235 | 233,horse,direction,3,-42,2,-42,2,-42,2,-42,3,-42 236 | 234,king,ship,2,-1,2,-1,2,-1,2,-1,2,-1 237 | 235,head,direction,3,-42,3,-42,3,-42,3,-42,2,-42 238 | 236,city,dress,3,1,3,1,3,1,3,1,1,-42 239 | 237,current,end,3,-42,3,-42,3,-42,3,-42,3,-42 240 | 238,person,boat,3,-1,3,-1,3,-1,3,-1,2,-1 241 | 239,anything,vessel,3,-42,3,-42,3,-42,3,-42,3,-42 242 | 240,dinner,dress,3,-1,2,0,2,-1,2,-42,2,0 243 | 241,person,precipice,2,-1,2,-1,2,-1,2,-1,3,1 244 | 242,harlot,face,3,1,3,1,3,1,3,-1,2,1 245 | 243,master,violin,3,1,3,1,3,1,3,-1,2,1 246 | 244,call,door,3,-42,3,-42,3,-42,3,-42,3,-42 247 | 245,person,clothes,2,1,3,1,3,1,2,1,3,1 248 | 246,dress,face,3,1,2,-1,2,1,2,-1,2,-42 249 | 247,king,knee,3,1,3,1,2,1,2,-1,2,1 250 | 248,window,front,3,-42,3,-42,3,-42,3,-42,2,-42 251 | 249,stone,torrent,2,-1,2,-1,2,-42,2,-42,2,-1 252 | 250,friend,bed,3,-1,2,-1,2,-1,3,-1,2,1 253 | 251,seal,way,3,-42,3,-42,3,-42,3,-42,3,-42 254 | 252,brother,doorway,3,-1,1,-42,2,-1,3,-1,3,1 255 | 253,cup,air,2,-1,2,1,2,1,2,1,2,-1 256 | 254,state,city,3,1,2,1,1,-42,2,1,1,-42 257 | 255,glass,hand,1,-42,2,0,3,-1,3,1,2,-1 258 | 256,father,child,3,1,3,1,3,1,2,1,2,-1 259 | 257,ship,rain,2,1,3,1,2,1,3,1,1,-42 260 | 258,messenger,city,3,-1,3,-1,3,-1,3,-1,2,1 261 | 259,wife,mouth,3,1,3,1,3,1,2,1,3,1 262 | 260,state,piece,3,1,3,1,3,1,2,1,2,-1 263 | 261,state,book,3,1,2,1,2,1,2,1,2,-42 264 | 262,person,vial,3,1,3,1,3,1,3,-1,3,1 265 | 263,someone,step,3,1,3,1,2,-1,3,-1,3,1 266 | 264,hand,position,3,-42,2,-42,2,-42,3,-42,3,-42 267 | 265,brother,face,3,1,3,1,3,1,2,1,3,1 268 | 266,gentleman,bottle,3,1,3,1,3,1,3,-1,3,1 269 | 267,body,breath,3,1,3,1,3,1,3,1,1,-42 270 | 268,body,master,3,0,3,0,3,0,3,0,3,0 271 | 269,brother,daughter,2,1,2,1,3,1,2,0,2,0 272 | 270,something,mouth,2,-42,2,-42,2,-42,2,-42,2,-42 273 | 271,eye,road,3,-1,3,-1,3,-1,3,-1,2,1 274 | 272,coach,front,3,-42,3,-42,3,-42,3,-42,3,-42 275 | 273,watch,house,3,-1,3,-1,3,-1,2,-1,2,-42 276 | 274,ash,floor,3,-1,3,-1,3,-1,3,-1,2,1 277 | 275,horse,bed,2,1,2,1,2,1,2,-1,3,1 278 | 276,hair,body,3,-1,3,-1,3,-1,3,-1,3,0 279 | 277,head,doorway,3,-1,3,-1,2,-1,2,-1,2,1 280 | 
278,lady,fool,2,-42,2,-42,2,-42,2,-42,2,-42 281 | 279,gentleman,purse,3,1,3,1,3,1,1,-42,3,1 282 | 280,result,position,1,-42,2,1,2,1,2,1,2,-42 283 | 281,street,middle,3,-42,3,-42,3,-42,3,-42,3,-42 284 | 282,hair,wind,3,-1,2,1,2,-1,3,1,2,-1 285 | 283,train,effect,2,-42,1,-42,1,-42,2,1,2,1 286 | 284,lad,camp,3,-1,2,-1,2,-1,3,-1,2,1 287 | 285,place,ship,2,1,2,-42,2,-42,2,-42,2,-1 288 | 286,fox,hand,3,1,3,1,2,1,2,-1,3,1 289 | 287,meal,ground,2,-1,2,-1,2,-1,2,-1,2,-42 290 | 288,bank,hill,3,-1,3,-1,2,-1,2,0,3,0 291 | 289,boy,call,2,-42,2,-42,2,-42,2,-42,2,-42 292 | 290,father,pocket,3,1,3,1,3,1,2,1,3,1 293 | 291,food,car,3,-1,3,-1,3,-1,3,-1,3,-1 294 | 292,sea,basin,3,1,3,1,1,-42,3,-1,2,1 295 | 293,clothes,body,2,-1,3,-1,2,-1,3,-1,1,-42 296 | 294,arm,king,3,-1,3,-1,1,-42,2,0,2,0 297 | 295,boat,servant,3,1,3,1,3,1,3,1,3,1 298 | 296,train,servant,3,1,3,1,3,1,3,1,3,1 299 | 297,sun,eye,3,1,3,1,3,1,2,1,2,1 300 | 298,world,eye,3,1,3,1,3,1,2,1,2,-1 301 | 299,place,store,2,1,1,-42,2,-1,2,-1,1,-42 302 | 300,everything,ball,1,-42,1,-42,2,0,1,-42,2,-42 303 | 301,brother,back,2,1,3,1,1,-42,1,-42,2,0 304 | 302,coach,wife,2,0,2,0,1,-42,2,0,2,0 305 | 303,wife,book,3,1,3,1,2,1,2,-1,3,1 306 | 304,ground,road,2,1,2,1,1,-42,2,-1,1,-42 307 | 305,poet,picture,3,1,3,1,3,1,2,-1,3,1 308 | 306,hat,arm,3,-1,2,-1,3,-1,2,-1,2,-1 309 | 307,way,doorway,2,1,1,-42,1,-42,1,-42,1,-42 310 | 308,servant,floor,3,-1,2,-1,2,-1,3,-1,3,1 311 | 309,worker,wind,2,-1,3,1,1,-42,3,1,2,-1 312 | 310,way,wind,2,-42,2,-42,2,-42,3,-42,2,-42 313 | 311,system,picture,2,-42,2,-42,2,-42,2,-42,2,-42 314 | 312,button,eye,3,-1,3,-1,3,1,3,1,2,-1 315 | 313,back,light,1,-42,3,1,3,1,2,1,2,-1 316 | 314,daughter,picture,3,1,3,1,3,1,2,-1,3,1 317 | 315,knife,store,3,-1,3,-1,2,-1,2,0,2,-42 318 | 316,watch,face,3,-1,3,-1,2,1,3,1,2,-42 319 | 317,person,dinner,3,1,3,1,3,1,3,1,3,1 320 | 318,brother,hair,3,1,3,1,3,1,3,1,2,1 321 | 319,teacher,air,2,-1,2,1,1,-42,2,1,2,1 322 | 320,body,face,3,1,3,1,3,1,3,0,1,-42 323 | 321,daughter,bottle,3,1,3,1,3,1,2,-1,3,1 324 | 322,bag,mouth,3,1,3,-1,3,-1,3,-1,2,-42 325 | 323,state,door,3,1,2,1,2,-42,3,-42,2,-42 326 | 324,bay,direction,2,-42,2,-42,2,-42,2,-42,3,-42 327 | 325,ship,element,3,1,3,1,3,1,3,1,3,1 328 | 326,street,way,1,-42,1,-42,1,-42,1,-42,2,-42 329 | 327,father,strap,3,1,3,1,3,1,2,-1,2,1 330 | 328,father,coal,2,1,2,1,2,1,2,-1,3,1 331 | 329,corner,road,2,-1,2,-1,1,-42,1,-42,2,0 332 | 330,brother,room,3,-1,3,-1,2,-1,3,-1,3,1 333 | 331,ground,way,2,-42,2,-42,1,-42,1,-42,2,-42 334 | 332,chair,back,2,1,2,1,2,-1,3,1,2,-1 335 | 333,gentleman,picture,3,1,3,1,3,1,2,-1,3,1 336 | 334,servant,end,2,-42,2,-42,2,-42,2,-42,2,-42 337 | 335,gulp,throat,3,-1,2,-1,2,-1,3,-1,3,1 338 | 336,messenger,door,3,-1,2,1,3,-1,3,-1,2,1 339 | 337,edition,position,3,-42,3,-42,3,-42,3,-42,3,-42 340 | 338,parcel,mouth,2,1,2,1,1,-42,3,1,2,-1 341 | 339,air,shore,2,1,2,-1,2,-1,2,-1,2,1 342 | 340,magistrate,door,3,-1,3,1,3,-1,3,-1,3,1 343 | 341,lady,air,2,-1,3,1,3,1,2,1,2,-1 344 | 342,friend,piece,2,1,2,1,2,1,2,-42,2,1 345 | 343,king,middle,2,-42,2,-42,2,-42,3,-42,3,-42 346 | 344,lady,piece,2,1,2,1,2,1,2,1,2,1 347 | 345,beard,face,3,-1,3,-1,2,-1,3,-1,1,-42 348 | 346,poet,something,2,-42,2,-42,2,-42,2,-42,2,-42 349 | 347,friend,shore,3,-1,3,-1,3,-1,2,-1,1,-42 350 | 348,state,king,3,1,3,1,2,-42,2,1,1,-42 351 | 349,boat,face,3,1,3,1,3,1,3,1,3,1 352 | 350,hat,step,2,-1,2,-1,2,-1,2,-1,1,-42 353 | 351,servant,vial,2,1,2,1,2,1,2,-1,2,1 354 | 352,father,stone,3,1,2,1,3,-1,3,-1,3,1 355 | 353,boy,stream,2,-1,2,1,1,-42,3,1,1,-42 356 | 
354,brother,house,3,-1,3,-1,3,-1,3,-1,3,1 357 | 355,door,knee,3,1,3,1,2,1,3,1,3,-1 358 | 356,current,face,3,1,3,1,3,1,2,1,3,1 359 | 357,hedge,way,2,-42,2,-42,2,-42,2,-42,2,-42 360 | 358,train,shore,2,-1,1,-42,2,-42,3,1,3,1 361 | 359,end,road,2,-1,2,-42,2,-42,1,-42,2,-42 362 | 360,person,door,2,-1,1,-42,2,-1,3,-1,3,1 363 | 361,light,face,1,-42,2,-1,2,-1,2,-1,2,1 364 | 362,person,gulp,2,1,2,1,2,-42,3,-42,3,-42 365 | 363,friend,car,3,-1,3,-1,3,-1,3,-1,3,-1 366 | 364,bank,house,2,1,2,1,2,1,2,1,3,0 367 | 365,way,ship,2,-42,2,-42,3,-42,2,-1,3,-42 368 | 366,watch,camp,3,-1,3,-1,1,-42,2,-42,1,-42 369 | 367,wife,newspaper,3,1,3,1,3,1,2,1,3,1 370 | 368,ship,road,1,-42,2,1,2,-1,2,-1,3,1 371 | 369,servant,bottle,3,1,3,1,3,1,3,-1,3,1 372 | 370,daughter,wife,2,-1,2,-1,2,-1,2,-1,2,-1 373 | 371,daughter,exile,3,-42,3,-42,3,-42,3,-42,3,-42 374 | 372,bank,grass,2,1,3,1,3,1,3,1,1,-42 375 | 373,place,king,3,1,3,1,3,1,3,1,3,-1 376 | 374,object,mouth,2,1,2,1,2,1,1,-42,2,-42 377 | 375,finger,pocket,2,-1,2,1,2,1,2,-1,2,1 378 | 376,soul,ship,2,-1,2,-1,2,-1,2,-1,1,-42 379 | 377,person,something,2,-42,2,-42,2,-42,3,-42,2,-42 380 | 378,truck,road,2,1,2,1,2,1,2,-1,3,1 381 | 379,hill,middle,3,-42,3,-42,3,-42,3,-42,3,-42 382 | 380,patient,head,3,1,3,1,2,0,2,0,3,0 383 | 381,boy,grass,3,1,3,1,3,1,3,1,3,1 384 | 382,messenger,something,2,-42,2,-42,2,-42,2,-42,2,-42 385 | 383,step,light,2,-42,2,1,2,1,2,1,2,-42 386 | 384,clothes,floor,3,-1,3,-1,3,-1,3,-1,2,1 387 | 385,person,road,3,-1,3,-1,2,-1,3,-1,3,1 388 | 386,boy,head,3,1,3,1,3,1,2,-1,2,0 389 | 387,lady,boy,2,1,2,1,2,1,2,0,2,0 390 | 388,father,front,3,-42,3,-42,3,-42,3,-42,3,-42 391 | 389,person,everything,2,1,2,1,2,-42,2,-42,2,1 392 | 390,way,room,3,-42,3,-42,3,-42,3,-42,3,-42 393 | 391,person,sea,3,-1,3,-1,2,-42,3,1,2,-42 394 | 392,deck,room,2,-1,3,-1,2,-1,2,-1,2,0 395 | 393,someone,store,3,-1,3,-1,3,-1,3,-1,3,1 396 | 394,eye,breath,1,-42,3,1,2,1,3,1,2,-1 397 | 395,servant,dress,3,1,3,1,3,1,3,1,3,1 398 | 396,way,ball,3,-42,2,-42,3,-42,2,-42,2,-42 399 | 397,wife,picture,2,1,2,1,2,1,2,-1,3,1 400 | 398,gentleman,city,3,-1,3,-1,3,-1,3,-1,2,1 401 | 399,hair,boy,3,-1,3,-1,3,-1,3,-1,2,-42 402 | 400,messenger,way,2,-42,2,-42,2,-42,2,-42,2,-42 403 | 401,vessel,shore,2,-1,1,-42,1,-42,2,-1,2,1 404 | 402,knee,bed,3,-1,3,-1,3,-1,3,-1,3,1 405 | 403,block,room,3,-1,2,-1,2,-1,2,0,2,0 406 | 404,shop,factory,3,-1,3,-1,2,-1,2,0,3,0 407 | 405,someone,middle,2,1,2,1,2,-42,2,-42,2,1 408 | 406,fox,street,3,-1,3,-1,3,-1,3,-1,3,1 409 | 407,something,floor,3,-1,2,-1,3,-1,3,-1,3,1 410 | 408,chest,way,3,-42,3,-42,3,-42,3,-42,3,-42 411 | 409,eye,book,3,-1,3,-1,3,-1,3,-1,1,-42 412 | 410,bag,knife,3,1,2,-1,3,-1,3,-1,1,-42 413 | 411,sea,beach,3,1,2,-42,3,1,2,-1,3,1 414 | 412,servant,town,3,-1,3,-1,3,-1,3,-1,2,1 415 | 413,lady,home,3,-1,3,-1,3,-1,3,-1,3,1 416 | 414,person,floor,2,-1,3,-1,2,-1,3,-1,3,1 417 | 415,mouth,face,3,-1,3,-1,3,0,2,0,2,0 418 | 416,food,place,2,-1,2,-1,2,-42,2,-1,2,-42 419 | 417,stair,block,2,1,1,-42,2,-1,2,0,2,0 420 | 418,father,arm,3,1,3,1,3,1,2,1,2,0 421 | 419,bed,floor,3,-1,2,-1,3,-1,3,-1,2,-42 422 | 420,bed,face,3,1,3,1,2,1,2,1,2,-1 423 | 421,shirt,skirt,3,0,2,0,3,0,3,0,3,0 424 | 422,lady,boat,3,-1,3,-1,2,-1,3,-1,2,-1 425 | 423,boy,middle,2,-42,2,-42,2,-42,3,-42,2,-42 426 | 424,lady,scene,3,-1,2,-1,2,-1,2,-1,2,1 427 | 425,room,picture,3,1,3,1,3,1,3,1,2,0 428 | 426,lady,room,3,-1,3,-1,3,-1,2,-1,3,1 429 | 427,teacher,purse,3,1,3,1,3,1,2,-42,3,1 430 | 428,piece,coal,2,-42,2,-42,2,-42,2,-42,2,0 431 | 429,city,position,2,-42,2,-42,2,-42,2,-42,2,-42 432 | 
430,stair,house,3,-1,3,-1,2,-1,2,0,2,0 433 | 431,vessel,direction,3,-42,3,-42,3,-42,3,-42,3,-42 434 | 432,watch,pocket,3,-1,3,1,2,1,3,1,1,-42 435 | 433,friend,room,3,-1,2,-1,1,-42,3,-1,3,1 436 | 434,clothes,hand,3,1,2,1,3,-1,3,-1,2,-1 437 | 435,fox,back,2,1,2,1,2,1,2,-1,2,1 438 | 436,master,piece,2,1,2,-42,2,1,1,-42,2,-42 439 | 437,glass,floor,3,-1,3,-1,3,-1,2,0,2,0 440 | 438,sun,wall,3,1,3,1,3,1,1,-42,2,1 441 | 439,river,city,2,-1,3,-1,2,1,3,-1,3,1 442 | 440,messenger,farm,3,-1,3,-1,3,-1,3,-1,3,1 443 | 441,person,someone,3,0,3,0,3,0,3,0,3,0 444 | 442,boy,breath,2,1,2,1,2,1,2,1,2,1 445 | 443,gentleman,end,3,-42,3,-42,3,-42,3,-42,3,-42 446 | 444,body,floor,2,-1,2,-1,3,-1,3,-1,2,1 447 | 445,someone,floor,3,-1,3,-1,3,-1,3,-1,3,1 448 | 446,daughter,bow,3,1,3,1,3,1,2,-1,2,1 449 | 447,hair,temple,3,-1,3,-1,3,-1,3,-1,3,1 450 | 448,brother,city,3,-1,3,-1,3,-1,3,-1,3,1 451 | 449,seal,place,3,-1,3,-1,3,-1,3,-1,3,1 452 | 450,foot,street,3,-1,3,-1,2,-1,3,-1,2,1 453 | 451,person,gentleman,3,0,3,0,2,0,3,0,3,0 454 | 452,base,way,2,-42,2,-42,2,-42,2,-42,2,-42 455 | 453,step,air,2,-42,2,-42,2,-42,2,-42,2,-42 456 | 454,master,ball,3,1,3,1,3,1,2,-1,3,-1 457 | 455,foot,ground,3,-1,2,-1,2,-1,3,-1,3,1 458 | 456,brother,door,2,-1,2,1,3,-1,2,-1,3,1 459 | 457,house,floor,3,1,3,1,2,-1,2,-1,2,0 460 | 458,arm,position,2,-42,2,-42,2,-42,2,-42,2,-42 461 | 459,clock,wife,3,-1,3,-1,2,-1,2,1,3,-1 462 | 460,stone,lamp,2,-1,1,-42,3,1,3,1,2,1 463 | 461,child,farm,3,-1,3,-1,3,-1,3,-1,3,1 464 | 462,stone,light,2,-42,2,1,2,1,2,1,2,-42 465 | 463,button,side,2,-1,2,-1,2,-1,2,-42,2,-42 466 | 464,person,flood,3,-1,3,-1,3,-1,3,1,3,-1 467 | 465,patient,piece,2,-42,2,-42,3,-42,2,-42,2,-42 468 | 466,master,way,2,-42,2,-42,2,-42,2,-42,2,-42 469 | 467,chest,fist,3,1,3,1,2,-1,2,1,2,-1 470 | 468,magistrate,seal,3,1,3,1,2,1,2,1,2,1 471 | 469,place,city,1,-42,1,-42,2,0,3,0,2,0 472 | 470,brother,way,2,-42,3,-42,2,-42,2,-42,2,-42 473 | 471,office,position,2,1,2,1,2,1,2,1,2,-42 474 | 472,beard,air,2,-1,1,-42,3,-42,2,1,2,-1 475 | 473,person,result,2,-42,2,-42,2,-42,3,-42,3,-42 476 | 474,king,response,2,-42,2,-42,3,-42,3,-42,2,-42 477 | 475,block,road,2,-1,2,-1,1,-42,2,0,1,-42 478 | 476,cup,room,3,-1,3,-1,3,-1,2,0,2,0 479 | 477,parcel,piece,2,-42,2,-42,3,-42,2,-42,3,-42 480 | 478,stream,end,3,-42,3,-42,3,-42,3,-42,3,-42 481 | 479,meal,hand,3,1,2,1,3,-1,2,-1,2,-1 482 | 480,world,something,3,1,3,1,3,1,2,1,2,1 483 | 481,head,face,3,1,3,1,3,1,3,1,2,0 484 | 482,horse,piece,3,1,3,1,3,1,1,-42,3,1 485 | 483,sun,room,3,1,3,1,2,1,2,-1,2,1 486 | 484,horse,picture,3,1,3,1,2,1,2,-1,3,1 487 | 485,way,flood,3,-42,3,-42,3,-42,3,-42,3,-42 488 | 486,breath,shore,3,-1,3,-1,3,-1,3,-1,2,1 489 | 487,tear,side,2,-42,1,-42,2,-1,2,-1,2,1 490 | 488,bed,dress,3,1,3,1,3,1,3,1,2,-1 491 | 489,office,side,3,-42,3,-42,3,-42,3,-42,3,-42 492 | 490,person,world,3,-1,3,-1,3,-1,2,-1,2,-1 493 | 491,magistrate,hand,3,1,3,1,2,1,1,-42,2,0 494 | 492,lady,stone,3,1,3,1,3,-1,3,-1,2,1 495 | 493,boy,place,2,-1,2,-1,2,-1,2,-1,3,1 496 | 494,stream,direction,3,-42,3,-42,2,-42,2,-42,2,-42 497 | 495,head,grass,2,1,3,1,3,1,2,1,3,1 498 | 496,person,father,3,0,3,0,3,0,2,0,2,0 499 | 497,father,shore,3,-1,3,-1,2,-1,1,-42,2,-42 500 | 498,father,wall,3,-1,3,-1,2,-1,3,-1,3,1 501 | 499,foot,boat,3,-1,3,-1,3,-1,3,-1,2,-1 502 | 500,piece,door,2,-42,2,-42,3,-42,2,-42,2,-42 503 | 501,car,camp,2,-1,1,-42,2,-42,2,1,2,1 504 | 502,sink,something,3,-42,3,-42,3,-42,3,-42,3,-42 505 | 503,father,face,3,1,3,1,2,1,2,0,2,1 506 | 504,bank,position,2,-42,2,-42,2,-42,2,-42,2,-42 507 | 505,lady,step,2,1,2,1,2,1,2,-1,3,1 508 
| 506,watch,piece,3,-42,3,-42,3,-42,3,-42,3,-42 509 | 507,step,something,3,-42,3,-42,3,-42,3,-42,3,-42 510 | 508,daughter,floor,3,-1,3,-1,3,-1,3,-1,3,1 511 | 509,person,beard,3,1,3,1,3,1,2,1,2,0 512 | 510,child,wife,3,-1,3,-1,3,-1,2,0,2,1 513 | 511,boat,town,3,-1,3,-1,3,-1,1,-42,3,1 514 | 512,lady,elbow,3,1,3,1,3,1,1,-42,3,0 515 | 513,clock,bed,3,-1,3,-1,2,-1,2,1,3,0 516 | 514,father,rain,2,1,2,1,2,1,2,1,3,-1 517 | 515,nose,everything,3,-1,3,-1,2,-1,2,-1,2,-1 518 | 516,call,end,2,-42,2,-42,2,-42,2,-42,2,-42 519 | 517,cross,side,2,-1,2,-1,2,1,2,1,2,0 520 | 518,door,car,3,-1,3,-1,1,-42,2,0,3,-1 521 | 519,place,position,2,-42,2,-42,2,-42,2,-42,2,-42 522 | 520,breath,door,2,-1,2,-1,2,-1,2,-1,2,1 523 | 521,person,state,2,-1,2,-1,2,-1,2,-42,2,1 524 | 522,home,head,3,1,3,1,3,1,1,-42,3,-1 525 | 523,energy,arm,1,-42,2,-1,2,-42,2,-1,2,1 526 | 524,spoon,hand,3,-1,3,-1,3,-1,3,1,2,-1 527 | 525,king,direction,3,-42,2,-42,2,-42,1,-42,2,-42 528 | 526,room,bed,3,1,2,1,3,1,2,1,2,0 529 | 527,eye,master,2,-1,2,-1,2,-1,2,-1,1,-42 530 | 528,state,head,2,1,2,1,2,1,2,1,2,-1 531 | 529,foot,bottle,2,0,3,-42,1,-42,3,-1,3,1 532 | 530,friend,face,3,1,3,1,3,1,1,-42,2,0 533 | 531,energy,breath,2,-42,3,-42,2,-42,3,-42,3,-42 534 | 532,way,something,3,-42,3,-42,3,-42,3,-42,3,-42 535 | 533,mountain,end,2,-42,2,1,2,1,2,1,2,-42 536 | 534,source,way,3,-42,3,-42,3,-42,3,-42,3,-42 537 | 535,father,wind,2,-1,2,1,2,1,2,1,2,-1 538 | 536,lady,bag,3,1,3,1,3,1,2,1,3,1 539 | 537,heaven,light,2,1,2,-42,2,-42,1,-42,1,-42 540 | 538,person,room,3,-1,3,-1,3,-1,3,-1,3,1 541 | 539,lady,hat,3,1,3,1,3,1,2,-1,3,1 542 | 540,river,place,3,-42,3,-42,2,-42,2,-1,2,1 543 | 541,way,wife,3,-42,3,-42,3,-42,3,-42,3,-42 544 | 542,wife,air,2,-1,2,1,2,1,2,1,2,-42 545 | 543,stream,flood,3,-1,2,-1,3,-1,1,-42,3,-1 546 | 544,king,mouth,2,1,2,1,2,1,3,0,2,0 547 | 545,boat,shore,3,-1,2,-1,1,-42,3,1,3,1 548 | 546,head,throat,3,1,3,1,3,1,1,-42,2,0 549 | 547,father,parcel,2,1,2,1,2,1,2,-1,2,1 550 | 548,finger,everything,3,-1,3,-1,2,-1,2,-42,2,-42 551 | 549,ash,hand,3,-1,3,-1,2,-1,2,-1,2,-1 552 | 550,someone,wall,3,-1,3,-1,3,-1,3,-1,3,1 553 | 551,hair,something,2,-1,2,-1,2,-1,2,-1,2,-42 554 | 552,arm,strap,2,1,2,1,2,1,1,-42,2,1 555 | 553,coach,book,3,1,3,1,3,1,2,-1,2,1 556 | 554,office,piece,3,1,2,1,2,-42,2,-42,2,-1 557 | 555,ear,mouth,1,-42,1,-42,2,-1,2,-42,2,0 558 | 556,horse,position,2,-42,2,-42,2,-42,2,-42,2,-42 559 | 557,hat,something,2,-1,2,-1,2,-1,2,-1,1,-42 560 | 558,wife,wind,2,-42,1,-42,2,-42,1,-42,2,-1 561 | 559,foot,place,3,-1,3,-1,3,-1,3,-1,3,1 562 | 560,call,friend,2,-42,2,-42,2,-42,2,-42,2,-42 563 | 561,person,phone,3,1,3,1,3,1,2,-1,2,1 564 | 562,train,room,3,1,3,1,3,1,2,1,3,1 565 | 563,back,middle,2,-1,1,-42,1,-42,1,-42,2,0 566 | 564,king,abode,3,-1,3,-1,3,-1,3,-1,3,1 567 | 565,dress,skirt,2,1,2,0,2,0,2,0,2,0 568 | 566,someone,brick,3,1,3,1,2,-1,3,-1,3,1 569 | 567,king,exile,2,0,2,0,2,0,2,0,2,0 570 | 568,fist,mountain,3,-1,3,-1,2,-1,3,-1,3,1 571 | 569,cup,middle,3,-42,3,-42,3,-42,3,-42,3,-42 572 | 570,messenger,doorway,3,-1,2,-42,1,-42,3,-1,3,1 573 | 571,stone,position,2,-42,2,1,2,1,2,1,2,-42 574 | 572,river,eye,3,1,3,1,3,1,3,-1,2,1 575 | 573,messenger,gate,2,-1,2,-1,3,-1,3,-1,2,1 576 | 574,poet,glass,3,1,3,1,3,1,3,-1,2,1 577 | 575,someone,end,2,-42,2,-42,2,-42,2,-42,2,-42 578 | 576,rain,shore,3,-42,2,-42,2,-1,2,-1,2,1 579 | 577,chair,place,2,-1,2,-1,1,-42,1,-42,1,-42 580 | 578,place,head,2,1,2,-42,2,1,1,-42,2,-1 581 | 579,world,breath,3,1,3,1,3,1,2,1,3,-1 582 | 580,house,wall,3,1,3,1,2,1,2,0,3,0 583 | 581,door,bed,2,-1,3,-1,1,-42,2,1,1,-42 584 | 
582,watch,strap,1,-42,2,1,1,-42,2,1,2,-42 585 | 583,someone,object,2,1,2,1,1,-42,2,-1,2,1 586 | 584,wife,sip,2,1,2,1,2,1,2,1,1,-42 587 | 585,current,stream,2,-42,2,-42,2,-42,2,-42,2,-42 588 | 586,hair,middle,2,-42,2,-42,2,-42,2,-42,3,-42 589 | 587,person,sun,3,-1,3,-1,2,-1,2,-1,1,-42 590 | 588,river,wind,2,-42,2,-42,2,-42,2,-42,2,-42 591 | 589,child,hand,3,1,3,1,3,1,2,-1,1,-42 592 | 590,world,sun,3,-1,2,-1,2,-42,2,1,2,1 593 | 591,finger,glass,2,-1,3,-1,3,1,3,-1,3,1 594 | 592,place,friend,2,1,2,1,2,1,2,1,3,-1 595 | 593,master,room,3,-1,3,-1,3,-1,3,-1,3,1 596 | 594,gentleman,air,2,-1,3,1,1,-42,3,1,2,-1 597 | 595,river,side,2,-42,2,1,2,-42,1,-42,3,1 598 | 596,person,mouth,3,1,3,1,3,1,1,-42,2,0 599 | 597,door,doorway,2,-1,2,1,1,-42,2,0,3,1 600 | 598,result,effect,2,-42,3,-42,3,-42,3,-42,3,-42 601 | 599,watch,ship,3,-1,3,-1,2,-1,2,-1,2,-42 602 | 600,office,floor,3,1,3,1,2,0,2,0,3,0 603 | 601,person,heaven,2,-1,2,-1,2,-1,2,-1,2,-42 604 | 602,wife,glass,2,1,3,1,3,1,2,-1,3,1 605 | 603,king,air,2,-1,2,1,1,-42,2,1,2,-1 606 | 604,someone,hand,3,1,3,1,3,1,2,0,2,0 607 | 605,fool,hand,2,-42,2,-42,2,-42,2,-42,2,-42 608 | 606,house,call,2,-42,2,-42,2,-42,2,-42,2,-42 609 | 607,ice,way,3,-42,2,-42,2,-42,2,-42,2,-42 610 | 608,person,brick,3,1,3,1,2,1,3,-1,3,1 611 | 609,friend,direction,2,-42,2,-42,2,-42,2,-42,2,-42 612 | 610,piece,lip,1,-42,2,1,2,1,2,1,2,-42 613 | 611,someone,bottle,3,1,3,1,3,1,3,-1,3,1 614 | 612,person,purse,3,1,3,1,3,1,1,-42,3,1 615 | 613,victim,scene,3,-1,2,-1,2,-42,2,-42,3,1 616 | 614,cup,front,2,-1,2,-1,2,-1,1,-42,2,0 617 | 615,lady,nose,3,1,3,1,2,1,1,-42,2,0 618 | 616,town,shore,2,-42,3,-42,2,-42,2,-42,2,-42 619 | 617,friend,position,2,-42,2,-42,2,-42,2,-42,2,-42 620 | 618,brother,middle,2,-42,2,-42,2,-42,2,-42,2,-42 621 | 619,boat,lock,3,1,3,1,2,1,1,-42,3,1 622 | 620,person,sip,2,-42,2,-42,2,-42,3,-42,2,-42 623 | 621,element,hand,2,-42,2,-42,2,-42,3,-42,2,-42 624 | 622,lady,breath,3,1,3,1,3,1,2,1,1,-42 625 | 623,father,direction,3,-42,2,-42,3,-42,3,-42,2,-42 626 | 624,king,rope,3,1,3,1,2,1,3,1,3,1 627 | 625,door,book,3,1,3,1,3,1,3,1,2,0 628 | 626,town,position,2,-42,2,-42,2,-42,2,-42,2,-42 629 | 627,person,lung,3,1,3,1,3,1,2,1,2,0 630 | 628,sail,ship,3,-1,3,-1,3,-1,3,-1,2,0 631 | 629,abode,place,1,-42,2,0,2,0,2,0,2,0 632 | 630,window,effect,3,-42,2,-42,3,-42,2,-42,2,-42 633 | 631,hill,beach,2,-1,1,-42,1,-42,2,1,2,0 634 | 632,watch,friend,3,-1,3,-1,2,-42,2,1,1,-42 635 | 633,something,book,3,-42,3,-42,3,-42,3,-42,3,-42 636 | 634,something,light,1,-42,3,1,3,1,2,1,3,-1 637 | 635,person,hedge,3,-1,2,-1,3,1,3,1,3,1 638 | 636,person,gully,3,-1,3,-1,2,-1,3,-1,3,1 639 | 637,lady,call,2,-42,2,1,2,-42,2,-42,3,-42 640 | 638,person,way,3,-42,3,-42,2,-42,2,-42,2,-42 641 | 639,hat,hand,3,1,2,-1,2,-1,2,-1,2,-1 642 | 640,back,end,2,-42,2,-42,2,-42,2,-42,2,-42 643 | 641,world,mouth,3,1,3,1,3,1,2,1,2,1 644 | 642,father,doorway,3,-1,2,-1,2,-1,3,-1,3,1 645 | 643,brother,vessel,2,-1,2,-1,1,-42,3,-1,2,-1 646 | 644,knee,hand,2,0,1,-42,3,0,2,1,1,-42 647 | 645,phone,end,3,-42,3,-42,3,-42,3,-42,3,-42 648 | 646,lady,back,2,-42,2,1,2,1,1,-42,1,-42 649 | 647,chair,gate,2,-1,3,-1,2,-1,2,0,2,0 650 | 648,result,something,3,-42,3,-42,3,-42,3,-42,3,-42 651 | 649,knife,floor,3,-1,3,-1,2,-1,2,-1,2,0 652 | 650,person,edition,1,-42,1,-42,2,-42,2,-1,2,-42 653 | 651,sail,way,3,-42,3,-42,3,-42,2,-42,2,-42 654 | 652,factory,room,3,1,3,1,1,-42,1,-42,1,-42 655 | 653,brother,end,3,-42,3,-42,3,-42,3,-42,3,-42 656 | 654,hat,backwards,3,-42,3,-42,3,-42,3,-42,3,-42 657 | 655,lady,piano,3,-1,2,-1,2,-1,3,-1,3,1 658 | 
656,town,effect,2,-42,2,-42,2,-42,2,-42,2,-42 659 | 657,bank,way,3,-42,2,-42,3,-42,3,-42,3,-42 660 | 658,king,stream,3,-1,3,-1,3,1,2,1,2,1 661 | 659,coach,road,3,-1,2,-1,3,-1,3,-1,3,1 662 | 660,brother,dinner,3,1,3,1,3,1,2,-42,3,1 663 | 661,seed,knee,3,-1,3,-1,2,-1,2,1,2,-1 664 | 662,person,violin,3,1,3,1,3,1,3,-1,3,1 665 | 663,everything,ear,2,1,2,1,2,-42,2,-42,2,-42 666 | 664,person,call,2,1,2,-42,2,-42,2,1,1,-42 667 | 665,arm,way,3,-42,2,-42,2,-42,2,-42,2,-42 668 | 666,piece,light,3,-42,2,-42,3,-42,2,-42,2,-1 669 | 667,father,place,3,-1,3,-1,3,-1,3,-1,3,1 670 | 668,house,boat,3,1,3,1,3,1,2,0,3,-1 671 | 669,foot,middle,2,-42,2,-42,2,-42,2,-42,2,0 672 | 670,house,something,2,1,2,1,1,-42,2,1,2,-1 673 | 671,person,rain,2,1,2,-42,2,1,2,1,3,-1 674 | 672,place,train,2,-42,2,-42,2,-42,2,-42,2,-42 675 | 673,person,back,3,1,3,1,2,1,2,-1,2,1 676 | 674,victim,meal,3,1,3,1,3,1,3,1,3,1 677 | 675,piece,way,2,-1,2,-42,3,-42,3,-42,2,-42 678 | 676,boy,rope,3,1,3,1,3,1,3,1,3,1 679 | 677,body,train,3,-1,3,-1,3,-1,3,-1,3,-1 680 | 678,lady,chin,3,1,3,1,2,0,2,-1,2,0 681 | 679,body,grass,2,-1,2,1,2,1,2,-1,3,1 682 | 680,brother,breath,2,1,3,1,2,1,3,1,2,-1 683 | 681,room,middle,2,-42,2,-42,2,-42,2,-42,2,-42 684 | 682,child,soul,2,1,3,1,2,1,2,1,1,-42 685 | 683,heaven,horse,2,1,2,1,2,1,2,1,2,-42 686 | 684,foot,arm,3,-1,3,-1,2,0,1,-42,3,0 687 | 685,daughter,lamp,3,1,3,1,3,1,3,-1,3,1 688 | 686,poet,position,2,-42,2,-42,2,-42,2,-1,2,-42 689 | 687,father,call,2,1,2,1,2,1,1,-42,2,-1 690 | 688,sea,mouth,3,1,3,1,3,1,3,-1,3,1 691 | 689,person,bay,3,-1,2,-1,2,-1,2,-1,2,1 692 | 690,servant,mouth,3,1,3,1,3,1,2,0,3,1 693 | 691,everything,piece,3,1,3,1,2,1,1,-42,2,-42 694 | 692,father,boy,3,1,3,1,2,1,2,1,2,1 695 | 693,father,knee,3,1,3,1,3,1,2,-1,2,1 696 | 694,patient,meal,3,1,3,1,2,1,1,-42,3,1 697 | 695,king,dress,2,1,2,1,2,1,2,1,3,1 698 | 696,sun,train,3,1,3,1,3,1,2,1,2,1 699 | 697,house,street,2,-1,3,-1,3,-42,1,-42,3,-42 700 | 698,exile,element,2,1,2,1,1,-42,2,-1,2,1 701 | 699,ice,everything,3,-1,3,-1,2,-1,2,-42,2,-42 702 | 700,room,gate,2,1,2,1,2,-42,2,1,2,-42 703 | 701,wife,bottle,2,1,3,1,3,1,2,-1,3,1 704 | 702,beard,lip,3,1,1,-42,2,-42,1,-42,2,-1 705 | 703,king,wife,3,1,2,1,2,1,2,1,1,-42 706 | 704,door,stream,3,-1,2,-1,1,-42,3,1,3,-1 707 | 705,daughter,bed,3,-1,2,-1,2,-1,3,-1,3,1 708 | 706,victim,doorway,2,-1,2,-42,2,-1,2,-1,2,1 709 | 707,arm,something,2,-42,2,-42,2,-42,2,-42,2,-42 710 | 708,piece,tree,2,-1,2,-1,2,-42,2,-42,2,-42 711 | 709,eye,ball,3,-1,3,-1,3,-1,3,-1,3,-1 712 | 710,horse,air,2,-1,3,1,1,-42,2,1,2,0 713 | 711,foot,bed,3,-1,3,-1,3,-1,3,-1,3,1 714 | 712,boat,way,2,-42,2,-42,2,-42,2,-42,2,-42 715 | 713,master,fist,3,1,3,1,3,1,2,-1,2,0 716 | 714,hat,head,3,-1,3,-1,3,-1,3,-1,1,-42 717 | 715,chin,face,3,-1,3,-1,1,-42,2,1,2,0 718 | 716,lock,throat,3,-1,2,-1,3,1,2,1,2,-1 719 | 717,heaven,rain,2,1,2,1,2,1,1,-42,2,-1 720 | 718,father,position,2,-42,2,-42,2,-42,2,-42,2,-42 721 | 719,person,lover,3,0,3,0,3,0,3,0,2,0 722 | 720,stone,breath,3,1,3,1,3,1,3,1,3,-1 723 | 721,someone,picture,2,1,2,1,2,1,2,-42,2,1 724 | 722,end,position,3,-42,3,-42,3,-42,3,-42,3,-42 725 | 723,ice,vessel,1,-42,1,-42,1,-42,2,1,2,-1 726 | 724,window,object,3,-42,3,-42,2,-42,2,-42,3,-42 727 | 725,house,tree,3,1,3,1,3,1,2,0,2,0 728 | 726,effect,end,3,-42,3,-42,3,-42,3,-42,3,-42 729 | 727,fox,stone,3,1,3,1,3,-1,3,-1,3,1 730 | 728,master,door,2,-42,2,-42,2,-42,2,-42,2,-42 731 | 729,anything,mouth,2,1,1,-42,3,-42,1,-42,3,-42 732 | 730,knee,way,3,-42,3,-42,3,-42,3,-42,3,-42 733 | 731,end,face,2,-42,3,-42,3,-42,3,-42,2,-42 734 | 
-------------------------------------------------------------------------------- /src/data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for querying ngramdb and managing local cache(s) of query results. 3 | 4 | Author: mbforbes 5 | """ 6 | 7 | # IMPORTS 8 | # ------------------------------------------------------------------------------ 9 | 10 | from __future__ import division 11 | 12 | # builtins 13 | import code # code.interact(local=dict(globals(), **locals())) 14 | from collections import Counter 15 | import cPickle as pickle 16 | import glob 17 | import math 18 | import os 19 | import sys 20 | 21 | # 3rd party 22 | from nltk.corpus import wordnet as wn 23 | from nltk.stem.wordnet import WordNetLemmatizer 24 | from tqdm import tqdm 25 | 26 | # local 27 | from ngramdb import NgramDb 28 | from ngramdb.util import pprint_ngram_list 29 | 30 | 31 | # CONSTANTS 32 | # ------------------------------------------------------------------------------ 33 | 34 | # ngramdb cache info 35 | CACHE_SPREAD_DIR = 'data/ngramdb/queries/' 36 | CACHE_SPREAD_EXT = '.cache' 37 | PMI_CACHE_FN = 'data/ngramdb/pmi/pmi.cache' 38 | 39 | QUIT = 'q' 40 | 41 | # Wordnet stuff 42 | # The following synset names are synsets given by 'abstraction' 43 | ABSTRACT_SS_NAMES = [ 44 | 'abstraction.n.01', 45 | 'abstraction.n.02', 46 | 'abstraction.n.03', 47 | 'abstraction.n.04', 48 | 'abstractedness.n.01', 49 | 'abstraction.n.06', 50 | ] 51 | ABSTRACT_SS = [wn.synset(x) for x in ABSTRACT_SS_NAMES] 52 | 53 | # Verb endings 54 | SUBS = ['_d', '_p', '_dp', '_op', '_od'] 55 | 56 | # For PMI and use in system. 57 | 58 | # We deal with people separately because we assume that all nominal subjects 59 | # refer to the same physical-propertied object (roughly the same size, weight, 60 | # etc.) Here show all nouns that we assume refer to a person. (I only saw 'man' 61 | # but I'm adding more in case others show up.) We'll remove all of these and 62 | # include only HUMAN_NOUN. 63 | PERSON_NOUNS = ['man', 'woman', 'he', 'she', 'I', 'you', 'human', 'person'] 64 | 65 | # The replacement for all PERSON_NOUNS. 66 | HUMAN_NOUN = 'PERSON' 67 | 68 | 69 | # TOP LEVEL FUNCTIONS 70 | # ------------------------------------------------------------------------------ 71 | 72 | def attr_filter(attr, val): 73 | """ 74 | General filter constructor: ensures obj's attr == val. Example attrs: 75 | - 'deprel' 76 | - 'postag' 77 | 78 | Takes: 79 | attr (str) 80 | val (str) 81 | 82 | Returns: 83 | f(obj) -> bool 84 | """ 85 | return lambda o: o.__dict__[attr] == val 86 | 87 | 88 | def passes_filters(token, filters): 89 | """ 90 | Returns whether token passes all filters. 91 | 92 | Takes: 93 | token (Token) 94 | filters ([f(Token) -> bool]) 95 | 96 | """ 97 | for f in filters: 98 | if not f(token): 99 | return False 100 | return True 101 | 102 | 103 | def filter_count_n(ngrams, fs): 104 | """ 105 | Args: 106 | ngrams ([Ngram] (I think)) 107 | fs ([[filter]]): Critical: *LIST* of filter lists. 108 | 109 | Returns: 110 | Counter[tuple(str)] 111 | """ 112 | c = Counter() 113 | for idx, ng in enumerate(ngrams): 114 | # Grab words via filters. 115 | wlists = [] 116 | for f in fs: 117 | w = [x for x in ng if passes_filters(x, f)] 118 | wlists.append(w) 119 | 120 | # Don't add this ngram if any word has multiple matches. 
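# (Illustrative example, not from the data: with a filter list like
# [('deprel', 'dobj')], an ngram containing two 'dobj' tokens matches
# ambiguously, so it gets skipped.)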
121 | mul = False 122 | for w in wlists: 123 | if len(w) != 1: 124 | mul = True 125 | break 126 | if mul: 127 | continue 128 | 129 | # Flatten the list 130 | ws = [l[0] for l in wlists] 131 | 132 | # Check the positions are increasing. 133 | noninc = False 134 | for i in range(0, len(ws) - 1): 135 | if ws[i].position >= ws[i+1].position: 136 | noninc = True 137 | break 138 | if noninc: 139 | continue 140 | 141 | # Turn into a tuple (to be a key of the Counter). 142 | tup = tuple([w.surface for w in ws]) 143 | c[tup] += ng.freq 144 | return c 145 | 146 | 147 | def is_abstract(noun): 148 | """ 149 | Try to (heuristically) filter abstract nouns. 150 | 151 | Args: 152 | noun (str|unicode) 153 | """ 154 | assert type(noun) in [str, unicode], 'bad noun type: %r. noun: %s' % ( 155 | type(noun), noun) 156 | noun_ss = wn.synsets(noun, pos=wn.NOUN) 157 | 158 | # if wordnet doesn't know about the noun, let it fly 159 | if len(noun_ss) == 0: 160 | return False 161 | 162 | # Checking just the first (most common?) synset for the noun, but checking 163 | # all hypernym paths for that synset, and all paths must be clean of any 164 | # abstraction. 165 | 166 | # previously, looped over all with: 167 | # for n_ss in noun_ss: 168 | # but that proved not restrictive enough 169 | n_ss = noun_ss[0] 170 | paths = n_ss.hypernym_paths() 171 | # all ps must pass 172 | paths_good = True 173 | for p in paths: 174 | for a in ABSTRACT_SS: 175 | if a in p: 176 | paths_good = False 177 | break 178 | if not paths_good: 179 | break 180 | if paths_good: 181 | # debug 182 | # print paths 183 | return False 184 | return True 185 | 186 | 187 | def filter_abstract_from_counter(c): 188 | """ 189 | Try to (heuristically) filter abstract nouns. 190 | 191 | Args: 192 | c (Counter) Frequency counts of nouns 193 | """ 194 | for n in c.keys(): 195 | if is_abstract(n): 196 | del c[n] 197 | 198 | 199 | # CLASSES 200 | # ------------------------------------------------------------------------------ 201 | 202 | class SizeQueryN(object): 203 | """Keys to the n-obj cache""" 204 | 205 | def __init__(self, query, raw_f_list): 206 | """ 207 | Takes: 208 | query (NgramDbQuery) 209 | raw_f_list ([[(str, str)]]) 210 | """ 211 | self.query = query 212 | self.raw_f_list = raw_f_list 213 | 214 | def __eq__(self, other): 215 | return (self.query == other.query and 216 | self.raw_f_list == other.raw_f_list) 217 | 218 | def __hash__(self): 219 | f_hash = hash(tuple([tuple(rf) for rf in self.raw_f_list])) 220 | return hash((hash(self.query), f_hash)) 221 | 222 | 223 | class DBWrapper(object): 224 | def __init__(self, cxn_id, fn_n): 225 | """ 226 | Args: 227 | cxn_id (str) connection ID for the DB 228 | fn_n (str) directory of the spread cache for n-obj query results 229 | """ 230 | self.db = NgramDb(cxn_id) 231 | self.fn_n = fn_n 232 | self.cache_n = {} 233 | 234 | def load_caches(self): 235 | if os.path.isdir(self.fn_n): 236 | # Turning off for now; should have logging framework. 237 | # print '' 238 | files = glob.glob(self.fn_n + '*' + CACHE_SPREAD_EXT) 239 | # hashes = [os.path.split(f)[1].rstrip(CACHE_SPREAD_EXT) for f in files] 240 | for f in tqdm(files): 241 | k, v = self.load_spreadfile(f) 242 | self.cache_n[k] = v 243 | # Turning off for now; should have logging framework. 244 | # print '' % (len(self.cache_n.keys())) 245 | 246 | def print_cache_stats(self): 247 | """ 248 | Prints aggregated verb stats for cache.
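Output format (illustrative only): a '100 most common verbs:' header,
then one tab-separated verb/count line per verb.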
249 | """ 250 | n = 100 251 | c = Counter() 252 | for k, v in self.cache_n.iteritems(): 253 | w = [w for w in k.query.words if w is not None] 254 | w = w[0] 255 | c[w] += sum(v.values()) 256 | print '%d most common verbs:' & (n) 257 | for k,v in c.most_common(n): 258 | print '\t', k, '\t', v 259 | 260 | def write_caches(self): 261 | """ 262 | Writes (all) caches to files. 263 | """ 264 | print '' 265 | self.write_spread(self.cache_n, CACHE_SPREAD_DIR) 266 | 267 | def write_kv(self, fn, k, v): 268 | """ 269 | Writes a single k, v to file name fn. 270 | 271 | Args: 272 | fn (str) 273 | k (Object) 274 | v (Object) 275 | """ 276 | with open(fn, 'w') as f: 277 | pickle.dump(k, f) 278 | pickle.dump(v, f) 279 | 280 | def load_spreadfile(self, fn): 281 | """ 282 | utility to load a single spreadfile and return its contents as k, v 283 | 284 | Args: 285 | fn (str) 286 | """ 287 | with open(fn, 'r') as f: 288 | k = pickle.load(f) 289 | v = pickle.load(f) 290 | return k, v 291 | 292 | def check_cache_files(self, d): 293 | """ 294 | Checks d to find any malformed cache entries. Does not require cache to 295 | be loaded in advance. 296 | 297 | Args: 298 | d (str): Directory. 299 | """ 300 | files = glob.glob(d + '*' + CACHE_SPREAD_EXT) 301 | n_good, n_bad, n_total = 0, 0, 0 302 | for f in files: 303 | good = True 304 | try: 305 | k, v = self.load_spreadfile(f) 306 | if type(k) != SizeQueryN: 307 | good = False 308 | print 'ERRR: cache file key is not type SizeQueryN; is %r (%s)' % (type(k), f) 309 | if type(v) != Counter: 310 | good = False 311 | print 'ERRR: cache file value is not type Counter; is %r (%s)' % (type(v), f) 312 | except: 313 | good = False 314 | print 'ERRR: Problem loading cache file (%s)' % (f) 315 | if good: 316 | n_good += 1 317 | else: 318 | n_bad += 1 319 | n_total += 1 320 | if n_good + n_bad != n_total: 321 | print 'ERRR: cache checking error: good (%d) + bad (%d) != total (%d)' % (n_good, n_bad, n_total) 322 | print '%d/%d/%d good/bad/total' % (n_good, n_bad, n_total) 323 | 324 | def write_spread_item(self, k, v, d): 325 | """ 326 | Writes a single item (k, v) to d. 327 | 328 | Args: 329 | k (SizeQueryN): Cache key. 330 | v (Counter): Cache value. 331 | d (str): Directory. 332 | 333 | Returns: 334 | bool: True if the item was written, False if it was found and 335 | didn't need to be written. 336 | """ 337 | orig = str(hash(k)) 338 | written, done = False, False 339 | postfix, postfix_n = '', 0 340 | while not done: 341 | fn = os.path.join(d, orig) + postfix + CACHE_SPREAD_EXT 342 | 343 | # no collision! just write. 344 | if not os.path.isfile(fn): 345 | self.write_kv(fn, k, v) 346 | written = True 347 | done = True 348 | else: 349 | # existing key, existing value 350 | ek, ev = self.load_spreadfile(fn) 351 | if ek == k and ev == v: 352 | # best case scenario: collision and it's what we want! 353 | # no need to write. 354 | done = True 355 | else: 356 | # a collision AND it's not what we want. try the next 357 | # one. 358 | postfix_n += 1 359 | postfix = '-%d' % (postfix_n) 360 | return written 361 | 362 | def write_spread(self, c, d): 363 | """ 364 | Lazily writes out c across d, only writing what is necessary. 365 | 366 | Args: 367 | c (dict): Cache. 368 | d (str): Directory. 
369 | """ 370 | # tracking 371 | n_total, n_written = len(c.keys()), 0 372 | 373 | # write me maybe 374 | for k,v in c.iteritems(): 375 | written = self.write_spread_item(k, v, d) 376 | n_written = n_written + 1 if written else n_written 377 | print '<%d/%d/%d written/skipped/total>' % (n_written, n_total - n_written, n_total) 378 | 379 | def run(self, sq, fetch=False): 380 | """ 381 | Cache-aware query runner. 382 | 383 | If fetch=True, always fetches the result and returns it as the second 384 | value. Otherwise, the second value returned will be None. 385 | 386 | Takes: 387 | sq (SizeQueryN) 388 | 389 | Returns: 390 | Counter[tuple(str)], (result|None) 391 | """ 392 | # Cache getting layer. 393 | cache = self.cache_n 394 | in_cache = sq in cache.keys() 395 | if (not fetch) and in_cache: 396 | # Turning off for now; should have logging framework. 397 | # print '' 398 | return cache[sq], None 399 | if fetch: 400 | print '' 401 | else: 402 | print '' 403 | 404 | # Running layer. 405 | res = self.db.run_query(sq.query) 406 | f_list = [] 407 | for raw_fs in sq.raw_f_list: 408 | fs = [attr_filter(f[0], f[1]) for f in raw_fs] 409 | f_list.append(fs) 410 | count = filter_count_n(res, f_list) 411 | 412 | # Cache writing layer. 413 | if not in_cache: 414 | cache[sq] = count 415 | # In case things go wrong: always write the cache(s) after a 416 | # successful query. We only need to write this one query. 417 | self.write_spread_item(sq, count, self.fn_n) 418 | 419 | # Returning layer. We could just always return res, but this will keep 420 | # memory freer and make exploration more explicit. 421 | if fetch: 422 | return count, res 423 | return count, None 424 | 425 | 426 | class Data(object): 427 | """ 428 | This is the API for how to interact with the data programmatically. 429 | 430 | This class aims to ease the transition of this code base along three axes: 431 | 432 | - (a) interactive --> programmatic 433 | - (b) database-focused --> data-focused 434 | - (c) CLI-focused --> API-focused 435 | """ 436 | 437 | def __init__(self, w=None): 438 | """ 439 | NOTE(mbforbes): Main consideration is whether I want this to be a 440 | lighter-weight init than loading the DB. 441 | 442 | Args: 443 | w (DBWrapper, optional): Default is None, in which case one is 444 | loaded from the cache. If w is provided, it should have the 445 | caches loaded already. 446 | 447 | """ 448 | if w is None: 449 | w = DBWrapper('max data API', CACHE_SPREAD_DIR) 450 | w.load_caches() 451 | self._w = w 452 | 453 | # This is also maybe used below---init once. 454 | self.lmtz = WordNetLemmatizer() 455 | 456 | def get_queries_for_verb(self, verb): 457 | """ 458 | Gets queries, noun indexes, and preposition indexes for a verb. 459 | 460 | Args: 461 | verb (str): Verb to get queries for. 462 | 463 | Returns: 464 | ( 465 | SizeQueryN, -- queries 466 | [[int]], -- corresponding noun indexes 467 | [[int]], -- corresponding preposition indexes 468 | ) 469 | """ 470 | qs = [ 471 | # e.g. I threw the ball 472 | # saving: obj 473 | SizeQueryN( 474 | # Query 475 | self._w.db.create_query( 476 | words=[None, verb, None], 477 | postags=['PRP', 'VBD', 'NN|NNS'], 478 | deprels=['nsubj', None, 'dobj'] 479 | ), 480 | # List of filter lists. 481 | [ 482 | [('deprel', 'nsubj')], 483 | [('deprel', 'dobj')], 484 | ] 485 | ), 486 | # e.g. 
I walked into the room 487 | # saving: prep, obj 488 | SizeQueryN( 489 | # Query 490 | self._w.db.create_query( 491 | words=[None, verb, None, None], 492 | postags=['PRP', 'VBD', 'IN', 'NN|NNS'], 493 | deprels=['nsubj', None, None, 'pobj'] 494 | ), 495 | # List of filter lists. 496 | [ 497 | [('deprel', 'nsubj')], 498 | [('postag', 'IN')], 499 | [('deprel', 'pobj')], 500 | ] 501 | ), 502 | # e.g. I put it (in / inside / on / under) the cupboard 503 | # e.g. I put it over the cupboard 504 | # saving: obj1, prep, obj2 505 | SizeQueryN( 506 | # Query 507 | self._w.db.create_query( 508 | words=[None, verb, None, None, None], 509 | postags=['PRP', 'VBD', 'NN|NNS', 'IN', 'NN|NNS'], 510 | deprels=['nsubj', None, 'dobj', None, 'pobj'] 511 | ), 512 | # List of filter lists. 513 | [ 514 | [('deprel', 'nsubj')], 515 | [('deprel', 'dobj')], 516 | [('postag', 'IN')], 517 | [('deprel', 'pobj')], 518 | ] 519 | ), 520 | # ------------------------------------------------------------------ 521 | # META NOTES: 522 | # The following two are pretty interesting, but they give 523 | # absolute info rather than relative info. (Philosophically, a 524 | # sentence is actually relative to "normal" experiences, but it's 525 | # much harder to figure this reference frame out than with a 526 | # direct comparison.) 527 | # 528 | # Thus, it's probably OK to put these on the back-burner for now. 529 | # ------------------------------------------------------------------ 530 | # # e.g. the plane flew 531 | # # saving: obj 532 | # SizeQueryN( 533 | # # Query 534 | # self._w.db.create_query( 535 | # words=[None, verb], 536 | # postags=['NN|NNS', 'VBD'], 537 | # deprels=['nsubj', 'ROOT'] 538 | # ), 539 | # # List of filter lists. 540 | # [ 541 | # [('deprel', 'nsubj')], 542 | # ] 543 | # ), 544 | # # e.g. the plane flew by 545 | # # saving: obj, prep 546 | # SizeQueryN( 547 | # # Query 548 | # self._w.db.create_query( 549 | # words=[None, verb, None], 550 | # postags=['NN|NNS', 'VBD', 'IN'], 551 | # deprels=['nsubj', 'ROOT', None] 552 | # ), 553 | # # List of filter lists. 554 | # [ 555 | # [('deprel', 'nsubj')], 556 | # [('postag', 'IN')], 557 | # ] 558 | # ), 559 | 560 | # ------------------------------------------------------------------ 561 | # META NOTES: 562 | # The following two are good analogs to the nsubj being PRP; if 563 | # we talk about objects (nouns) doing things to other objects 564 | # (nouns), this should capture that. 565 | # 566 | # So, these, I think, should be kept. 567 | # ------------------------------------------------------------------ 568 | # e.g. the plane flew by the blimp 569 | # saving: obj, prep, obj 570 | SizeQueryN( 571 | # Query 572 | self._w.db.create_query( 573 | words=[None, verb, None, None], 574 | postags=['NN|NNS', 'VBD', 'IN', 'NN|NNS'], 575 | deprels=['nsubj', 'ROOT', None, 'pobj'] 576 | ), 577 | # List of filter lists. 578 | [ 579 | [('deprel', 'nsubj')], 580 | [('postag', 'IN')], 581 | [('deprel', 'pobj')], 582 | ] 583 | ), 584 | # e.g. the boot squashed the bug 585 | # saving: obj, obj 586 | SizeQueryN( 587 | # Query 588 | self._w.db.create_query( 589 | words=[None, verb, None], 590 | postags=['NN|NNS', 'VBD', 'NN|NNS'], 591 | deprels=['nsubj', 'ROOT', 'dobj'] 592 | ), 593 | # List of filter lists.
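# (Each inner filter list must match exactly one token in an ngram;
# see filter_count_n above.)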
594 | [ 595 | [('deprel', 'nsubj')], 596 | [('deprel', 'dobj')], 597 | ] 598 | ), 599 | 600 | # ------------------------------------------------------------------ 601 | # META NOTES: 602 | # The following query is interesting, but it's a three-way 603 | # comparison, which will take extra work to integrate. 604 | # 605 | # Thus: back-burner for now. 606 | # ------------------------------------------------------------------ 607 | # # e.g. the man squashed the bug with his shoe 608 | # # saving: obj, prep, obj 609 | # SizeQueryN( 610 | # # Query 611 | # self._w.db.create_query( 612 | # words=[None, verb, None, None, None], 613 | # postags=['NN|NNS', 'VBD', 'NN|NNS', 'IN', 'NN|NNS'], 614 | # deprels=['nsubj', 'ROOT', 'dobj', None, 'pobj'] 615 | # ), 616 | # # List of filter lists. 617 | # [ 618 | # [('deprel', 'nsubj')], 619 | # [('deprel', 'dobj')], 620 | # [('postag', 'IN')], 621 | # [('deprel', 'pobj')], 622 | # ] 623 | # ), 624 | ] 625 | noun_idxes = [ 626 | [1], 627 | [2], 628 | [1, 3], 629 | # [0], 630 | # [0], 631 | [0, 2], 632 | [0, 1], 633 | # [0, 1, 3], 634 | ] 635 | prep_idxes = [ 636 | [], 637 | [1], 638 | [2], 639 | # [], 640 | # [1], 641 | [1], 642 | [], 643 | # [2], 644 | ] 645 | return qs, noun_idxes, prep_idxes 646 | 647 | def get_freq_nouns(self, v, s, p, cutoff=1000): 648 | """ 649 | Get frequent nouns for v_sub occurring at or above cutoff. 650 | 651 | Args: 652 | v (str): Verb 653 | s (str): Sub ('_d', '_p', etc.) 654 | p (str|None): Preposition (or None if sub doesn't use one) 655 | cutoff (int): Frequency cutoff below which nouns will not be 656 | returned. 657 | 658 | Returns: 659 | [str] | [(str, str)] 660 | """ 661 | c, n_idxes = self._get_cache_res_prep(v, s, p) 662 | fc = Counter() 663 | for surface, count in c.iteritems(): 664 | il = list(surface) 665 | # NOTE: Assuming at most 2 nouns per query. 666 | o = il[n_idxes[0]] if len(n_idxes) == 1 else (il[n_idxes[0]], il[n_idxes[1]]) 667 | fc[o] += count 668 | res = [] 669 | for o, freq in fc.most_common(): 670 | if freq < cutoff: 671 | break 672 | res += [o] 673 | return res 674 | 675 | def get_prep_freqs_agg(self, v): 676 | """ 677 | Gets the counter of prepositions for v, aggregating across all of its 678 | subs (and their query results). 679 | 680 | Args: 681 | v (str) verb 682 | 683 | Returns: 684 | Counter 685 | """ 686 | # gotta iterate over the various subs 687 | 688 | cs, _, p_idxes = self._get_cache_res_verb(v) 689 | # aggregating over different subs 690 | ac = Counter() 691 | for i, c in enumerate(cs): 692 | # Only aggregate prepositions if we saved any 693 | ps = p_idxes[i] 694 | if len(ps) == 0: 695 | continue 696 | # Consider most general case that we could have n prepositions 697 | # saved. 698 | for p in ps: 699 | for item, count in c.iteritems(): 700 | prep = list(item)[p] 701 | ac[prep] += count 702 | return ac 703 | 704 | def get_prep_freqs(self, v): 705 | """ 706 | Gets a counter of prepositions for v for each of its subs. 707 | 708 | Args: 709 | v (str) verb 710 | 711 | Returns: 712 | {str -> Counter(str)} --- {sub -> Counter(prep)} 713 | """ 714 | cs, _, p_idxes = self._get_cache_res_verb(v) 715 | res = {} 716 | for i, c in enumerate(cs): 717 | # Only count prepositions if any exist in this particular query 718 | ps = p_idxes[i] 719 | if len(ps) == 0: 720 | continue 721 | 722 | sub = SUBS[i] 723 | res[sub] = Counter() 724 | # Consider most general case that we could have n prepositions 725 | # saved.
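# (E.g. for the '_dp' frame the saved tuple is (subj, dobj, prep,
# pobj), so ps here would be [2].)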
727 |             for p in ps:
728 |                 for item, count in c.iteritems():
729 |                     prep = list(item)[p]
730 |                     res[sub][prep] += count
731 |         return res
732 | 
733 |     def get_top_nouns(self, v, s, p, filter_abstract, lemmatize):
734 |         """
735 |         Gets the counter for nouns in v, s, p.
736 | 
737 |         Args:
738 |             v (str): Verb
739 |             s (str): Sub
740 |             p (str): Prep
741 |             filter_abstract (bool): Whether to filter abstract nouns out of the
742 |                 returned counter.
743 |             lemmatize (bool): Whether to compress nouns into their lemmatized
744 |                 form before returning.
745 |         Returns:
746 |             Counter[str]
747 |         """
748 |         c, n_idxes = self._get_cache_res_prep(v, s, p)
749 | 
750 |         # aggregating
751 |         ac = Counter()
752 |         for i in n_idxes:
753 |             for surface, count in c.iteritems():
754 |                 noun = list(surface)[i]
755 |                 ac[noun] += count
756 | 
757 |         # debugging
758 |         # print '... '
759 |         # code.interact(local=dict(globals(), **locals()))
760 | 
761 |         # Maybe filter
762 |         if filter_abstract:
763 |             filter_abstract_from_counter(ac)
764 | 
765 |         # debugging
766 |         # print '... '
767 |         # code.interact(local=dict(globals(), **locals()))
768 | 
769 |         # Maybe lemmatize
770 |         if lemmatize:
771 |             self.compress_lemmas_in_counter(ac)
772 | 
773 |         return ac
774 | 
775 |     def compress_lemmas_in_counter(self, c):
776 |         """
777 |         Compresses forms to their lemma in (keys of) c (by adding counts).
778 | 
779 |         Args:
780 |             c (Counter)
781 | 
782 |         Modifies c in-place.
783 |         """
784 |         for k in c.keys():
785 |             l = self.lmtz.lemmatize(k)
786 |             if l != k:
787 |                 c[l] += c[k]
788 |                 del c[k]
789 | 
790 |     def get_verb_freq(self, v):
791 |         """
792 |         Get frequency statistics for verb.
793 | 
794 |         Args:
795 |             v (str): Verb
796 | 
797 |         Returns:
798 |             int: Sum of occurrences of the verb in currently active queries.
799 |         """
800 |         cs, _, _ = self._get_cache_res_verb(v)
801 |         total = 0
802 |         for c in cs:
803 |             total += sum(c.values())
804 |         return total
805 | 
806 |     def _get_cache_res_verb(self, v):
807 |         """
808 |         Gets verb cache result for all subs.
809 | 
810 |         Args:
811 |             v (str): Verb
812 | 
813 |         Returns (each list in the returned tuple is of length `len(SUBS)`):
814 |             (
815 |                 [Counter],  -- results for each sub
816 |                 [[int]],    -- noun indexes for each sub
817 |                 [[int]],    -- preposition indexes for each sub
818 |             )
819 |         """
820 |         qs, noun_idxes, prep_idxes = self.get_queries_for_verb(v)
821 |         cs = []
822 |         for q in qs:
823 |             c, _ = self._w.run(q)
824 |             cs += [c]
825 |         return cs, noun_idxes, prep_idxes
826 | 
827 |     def _get_cache_res_sub(self, v, s):
828 |         """
829 |         Returns the cached result for v_sub along with the indexes of its
830 |         nouns and prepositions. Helper function for querying APIs.
831 | 
832 |         Args:
833 |             v (str): Verb
834 |             s (str): Sub
835 | 
836 |         Returns:
837 |             Counter[tuple(str)]
838 |             [int]: Noun indexes
839 |             [int]: Prep indexes
840 |         """
841 |         # TODO(mbforbes): Should consolidate w/ constant in system.py. Then
842 |         # again, much code (e.g. query code) assumes just these exist...
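        # (Illustrative note --- SUBS is defined elsewhere in this module; if
        # it were, say, ['_d', '_p', '_dp', '_op', '_o'] and s == '_p', then
        # idx below would be 1, and we would run and return the second
        # query's results. The example list is hypothetical.)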
843 | 
844 |         # Figure out what we're looking for
845 |         idx = SUBS.index(s)
846 | 
847 |         # Get all data
848 |         qs, noun_idxes, prep_idxes = self.get_queries_for_verb(v)
849 | 
850 |         # Return the piece we want from each
851 |         c, _ = self._w.run(qs[idx])
852 |         n = noun_idxes[idx]
853 |         p = prep_idxes[idx]
854 |         return c, n, p
855 | 
856 |     def _get_cache_res_prep(self, v, s, p):
857 |         """
858 |         Args:
859 |             v (str): Verb
860 |             s (str): Sub
861 |             p (str): Prep
862 | 
863 |         Returns:
864 |             Counter[tuple(str)]
865 |             [int]: Noun indexes
866 |         """
867 |         c, nidxes, pidxes = self._get_cache_res_sub(v, s)
868 | 
869 |         # Sanity-check prep/sub agreement; a sub without a prep returns directly.
870 |         if (p is None and len(pidxes) > 0) or (p is not None and len(pidxes) == 0):
871 |             assert False, 'verb %s sub %s has prep mismatch: prep %r, idxes %r' % (v, s, p, pidxes)
872 |         if p is None and len(pidxes) == 0:
873 |             return c, nidxes
874 | 
875 |         # Else, we select only results which match the preposition.
876 |         res = Counter()
877 |         # NOTE: Assuming that there's at most one preposition per query as
878 |         # that's how things are currently structured. Change the argument above
879 |         # to be a [str] (and Turk new data to match) to allow more than one.
880 |         pidx = pidxes[0]
881 |         for surface, cnt in c.iteritems():
882 |             if list(surface)[pidx] != p:
883 |                 continue
884 |             res[surface] += cnt
885 |         return res, nidxes
886 | 
887 | 
888 | class PMI(object):
889 | 
890 |     def __init__(self):
891 |         """
892 |         Must have run PMI.compute() first.
893 |         """
894 |         with open(PMI_CACHE_FN, 'r') as f:
895 |             self.frame_counter = pickle.load(f)
896 |             self.frame_total = pickle.load(f)
897 |             self.nounpair_counter = pickle.load(f)
898 |             self.nounpair_total = pickle.load(f)
899 |             self.joint_counter = pickle.load(f)
900 |             self.joint_total = pickle.load(f)
901 | 
902 |     def query(self, frame, nounpair):
903 |         """
904 |         For the given frame (verb_sub[_prep]) and nounpair (noun1, noun2),
905 |         determines the PMI score between them (according to the ngramdb data)
906 |         and returns it.
907 | 
908 |         Args:
909 |             frame (string): verb_sub[_prep]
910 |             nounpair (string, string): (noun1, noun2)
911 | 
912 |         Returns:
913 |             float: The PMI between the frame and the noun pair.
914 |         """
915 |         res = self._get_pmi(frame, nounpair)
916 |         # if HUMAN_NOUN in nounpair and res >= 0:
917 |         #     print 'GOT HUMAN PMI:', frame, nounpair, res
918 |         return res
919 | 
920 |     def _get_pmi(self, frame, nounpair):
921 |         """
922 |         Factored out of query() to allow for multiple trials if desired. See
923 |         the docs there.
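
        A rough worked example (illustrative numbers, not from any real
        cache): with a joint count of 2, a frame count of 10, and a noun pair
        count of 20, each out of a total of 1000, the score is
        log(2/1000) - log(10/1000) - log(20/1000) = log(10) ~= 2.3.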
924 | 
925 |         Args:
926 |             frame (string)
927 |             nounpair (string, string)
928 | 
929 |         Returns:
930 |             float
931 |         """
932 |         joint = (frame, nounpair)
933 |         # log(joint / (x*y)) = log(joint) - log(x*y) = log(joint) - log(x) - log(y)
934 |         #
935 |         # for any prob p_x, p_x = count(x) / total(x)
936 |         #
937 |         # log(p_x) = log(count(x) / total(x)) = log(count(x)) - log(total(x))
938 | 
939 |         # sanity checking
940 |         if self.joint_counter[joint] == 0 or self.frame_counter[frame] == 0 or \
941 |                 self.nounpair_counter[nounpair] == 0:
942 |             return float('-inf')
943 | 
944 |         lj = math.log(self.joint_counter[joint]) - math.log(self.joint_total)
945 |         lf = math.log(self.frame_counter[frame]) - math.log(self.frame_total)
946 |         lnp = math.log(self.nounpair_counter[nounpair]) - math.log(self.nounpair_total)
947 |         return lj - lf - lnp
948 | 
949 |     @staticmethod
950 |     def compute():
951 |         """
952 |         Computes PMI counts over the cached query results and saves them to disk.
953 |         """
954 |         # get caches
955 |         w = DBWrapper('max print verb info', CACHE_SPREAD_DIR)
956 |         w.load_caches()
957 |         api = Data(w)
958 | 
959 |         # load verbs
960 |         print '[pmi] loading verbs...'
961 |         basedir = 'data/turk/hardcore/'
962 |         # NOTE: This is fine because this isn't the corpus we are training /
963 |         # testing on --- these are just the verbs that we might encounter, so
964 |         # we're precomputing PMI for everything we might want and caching it.
965 |         # (i.e. not actually touching test data here).
966 |         fnames = ['train.txt', 'dev.txt', 'test.txt']
967 |         verbs = []
968 |         for fname in fnames:
969 |             with open(basedir + fname, 'r') as f:
970 |                 verbs += [v.strip() for v in f.readlines()]
971 | 
972 |         # these are our variables for aggregating pmi counts
973 |         frame_counter = Counter()
974 |         nounpair_counter = Counter()
975 |         joint_counter = Counter()
976 | 
977 |         print '[pmi] counting query results...'
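        # (Illustrative sketch of what the loop below accumulates, with
        # made-up numbers: seeing the frame 'threw_p_over' with the noun pair
        # ('boy', 'fence') 12 times adds 12 to frame_counter['threw_p_over'],
        # 12 to nounpair_counter[('boy', 'fence')], and 12 to
        # joint_counter[('threw_p_over', ('boy', 'fence'))].)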
978 |         for verb in tqdm(verbs):
979 |             counters, noun_idxes_lst, prep_idxes_lst = api._get_cache_res_verb(verb)
980 |             assert len(SUBS) == len(counters), 'Should get 1 set of results back for each frame type (sub)'
981 |             for i in range(len(SUBS)):
982 |                 counter = counters[i]
983 |                 noun_idxes = noun_idxes_lst[i]
984 |                 prep_idxes = prep_idxes_lst[i]
985 | 
986 |                 for surface_forms, freq in counter.iteritems():
987 |                     # Compute frame
988 |                     frame = verb + SUBS[i]
989 |                     if len(prep_idxes) > 0:
990 |                         # NOTE: assuming at most 1 prep for now
991 |                         frame += '_' + surface_forms[prep_idxes[0]]
992 | 
993 |                     # Compute noun pair
994 |                     # NOTE: Assuming 1 or 2 nouns
995 |                     if len(noun_idxes) == 1:
996 |                         nouns = [HUMAN_NOUN, surface_forms[noun_idxes[0]]]
997 |                     else:
998 |                         nouns = [surface_forms[noun_idxes[0]], surface_forms[noun_idxes[1]]]
999 | 
1000 |                     for j in range(2):
1001 |                         if nouns[j] in PERSON_NOUNS:
1002 |                             nouns[j] = HUMAN_NOUN
1003 | 
1004 |                     nounpair = tuple(nouns)
1005 | 
1006 |                     frame_counter[frame] += freq
1007 |                     nounpair_counter[nounpair] += freq
1008 |                     joint_counter[(frame, nounpair)] += freq
1009 | 
1010 |         # easy to get actual probs so why not
1011 |         frame_total = sum(frame_counter.values())
1012 |         nounpair_total = sum(nounpair_counter.values())
1013 |         joint_total = sum(joint_counter.values())
1014 | 
1015 |         with open(PMI_CACHE_FN, 'w') as f:
1016 |             pickle.dump(frame_counter, f)
1017 |             pickle.dump(frame_total, f)
1018 |             pickle.dump(nounpair_counter, f)
1019 |             pickle.dump(nounpair_total, f)
1020 |             pickle.dump(joint_counter, f)
1021 |             pickle.dump(joint_total, f)
1022 | 
1023 | 
1024 | # TOP-LEVEL COMMAND-LINE FUNCS
1025 | # ------------------------------------------------------------------------------
1026 | 
1027 | def explore():
1028 |     w = DBWrapper('max explore', CACHE_SPREAD_DIR)
1029 |     w.load_caches()
1030 |     api = Data(w)
1031 |     code.interact(local=dict(globals(), **locals()))
1032 | 
1033 | 
1034 | def run_for_verb(verb, w=None):
1035 |     """
1036 |     Run queries for a verb.
1037 | 
1038 |     Args:
1039 |         verb (str): Verb to run queries for.
1040 |         w (DBWrapper, optional): Default is None, in which case one is loaded
1041 |             from the cache. If w is provided, it should have the caches loaded
1042 |             already.
1043 |     """
1044 |     print '[run_for_verb] %s' % (verb)
1045 |     if w is None:
1046 |         w = DBWrapper('max verb run', CACHE_SPREAD_DIR)
1047 |         w.load_caches()
1048 |     api = Data(w)
1049 |     qs, _, _ = api.get_queries_for_verb(verb)
1050 |     for q in qs:
1051 |         w.run(q)
1052 | 
1053 | 
1054 | def run_for_file(fname):
1055 |     """
1056 |     Run queries for verbs in a file.
1057 | 
1058 |     Args:
1059 |         fname (str)
1060 |     """
1061 |     # Prep the DB.
1062 |     w = DBWrapper('max file run', CACHE_SPREAD_DIR)
1063 |     w.load_caches()
1064 | 
1065 |     # Read the verbs.
1066 |     with open(fname) as f:
1067 |         lines = f.readlines()
1068 |     verbs = [line.strip() for line in lines]
1069 | 
1070 |     # Run the queries.
1071 |     for v in verbs:
1072 |         run_for_verb(v, w)
1073 | 
1074 | 
1075 | def ping():
1076 |     """
1077 |     Really just want to see if the database is actually up.
1078 |     """
1079 |     w = DBWrapper('max ping', CACHE_SPREAD_DIR)
1080 |     w.load_caches()
1081 |     res = w.db.create_and_run_query(
1082 |         words=['cat', None, 'dog'],
1083 |         postags=['NN', 'VBD', 'NN'],
1084 |         deprels=['nsubj', None, 'dobj'],
1085 |     )
1086 |     pprint_ngram_list(res[:10])
1087 | 
1088 | 
1089 | def check_cache():
1090 |     """
1091 |     Checks the cache files on disk.
1092 |     """
1093 |     w = DBWrapper('max check cache', CACHE_SPREAD_DIR)
1094 |     w.check_cache_files(CACHE_SPREAD_DIR)
1095 | 
1096 | 
1097 | def print_cache_stats():
1098 |     """
1099 |     Prints (verb) stats of cache.
1100 |     """
1101 |     w = DBWrapper('max print cache', CACHE_SPREAD_DIR)
1102 |     w.load_caches()
1103 |     w.print_cache_stats()
1104 | 
1105 | 
1106 | def interact_verb_info():
1107 |     """
1108 |     Prints info about verbs.
1109 | 
1110 |     Specifically, prints cached verb data in sets that would be partitioned
1111 |     into nodes in the factor graph.
1112 |     """
1113 |     w = DBWrapper('max interact verb info', CACHE_SPREAD_DIR)
1114 |     w.load_caches()
1115 |     api = Data(w)
1116 | 
1117 |     while True:
1118 |         verb = raw_input('enter a verb (%s to quit): ' % (QUIT))
1119 |         if verb == QUIT:
1120 |             break
1121 |         _print_single_verb_info(w, api, verb)
1122 | 
1123 | 
1124 | def print_top_preps_interact():
1125 |     """
1126 |     Interactively prints the most frequent prepositions for a verb.
1127 |     """
1128 |     w = DBWrapper('max interact verb preps', CACHE_SPREAD_DIR)
1129 |     w.load_caches()
1130 |     api = Data(w)
1131 | 
1132 |     while True:
1133 |         verb = raw_input('enter a verb (%s to quit): ' % (QUIT))
1134 |         if verb == QUIT:
1135 |             break
1136 |         preps = api.get_prep_freqs_agg(verb)
1137 |         for p, count in preps.most_common(20):
1138 |             print '%d\t%s' % (count, p)
1139 | 
1140 | def print_verb_info(verbs):
1141 |     """
1142 |     Prints info about verbs.
1143 | 
1144 |     Args:
1145 |         verbs ([str])
1146 |     """
1147 |     w = DBWrapper('max print verb info', CACHE_SPREAD_DIR)
1148 |     w.load_caches()
1149 |     api = Data(w)
1150 |     for verb in verbs:
1151 |         _print_single_verb_info(w, api, verb)
1152 | 
1153 | 
1154 | def _print_single_verb_info(w, api, verb):
1155 |     """
1156 |     Args:
1157 |         w (DBWrapper)
1158 |         api (Data)
1159 |         verb (str)
1160 |     """
1161 |     qs, _, _ = api.get_queries_for_verb(verb)
1162 |     descs = [
1163 |         '(1) PRP %s dobj' % (verb),
1164 |         '(2) PRP %s IN pobj' % (verb),
1165 |         '(3) PRP %s dobj IN pobj' % (verb),
1166 |         # '(4) NN(S) %s' % (verb),
1167 |         # '(5) NN(S) %s IN' % (verb),
1168 |         '(6) NN(S) %s IN pobj' % (verb),
1169 |         '(7) NN(S) %s dobj' % (verb),
1170 |         # '(8) NN(S) %s dobj IN pobj' % (verb),
1171 |     ]
1172 |     # These index the noun positions in the queries. They are defined assuming
1173 |     # the PRPs---where applicable---have already been stripped off. Because of
1174 |     # this, we don't use the noun indexes returned from the
1175 |     # get_*queries_for_verb functions.
1176 |     idxes = [
1177 |         [0],
1178 |         [1],
1179 |         [0, 2],
1180 |         # [0],
1181 |         # [0],
1182 |         [0, 2],
1183 |         [0, 1],
1184 |         # [0, 2, 4],
1185 |     ]
1186 | 
1187 |     # Sanity check
1188 |     if len(qs) != len(descs) or len(qs) != len(idxes):
1189 |         print 'ERRR: Code out-of-date: qs, descs, idxes must match query #s.'
1190 |         return
1191 | 
1192 |     for i, q in enumerate(qs):
1193 |         noun_idxes = idxes[i]
1194 |         c, _ = w.run(q)
1195 |         print descs[i]
1196 | 
1197 |         # maybe compress PRPs
1198 |         compress = True  # Change to False to not compress.
1199 |         if compress and i < 3:
1200 |             # compress PRP
1201 |             cp = Counter()
1202 |             for item, count in c.iteritems():
1203 |                 minus_prp = tuple(list(item)[1:])
1204 |                 cp[minus_prp] += count
1205 |         else:
1206 |             # no PRP; can't compress (or just disabled)
1207 |             cp = c
1208 | 
1209 |         # NOTE: In other functions, abstract nouns may be filtered. May want to
1210 |         # enable here.
1211 |         # filter_abstract_from_counter(cp, noun_idxes)
1212 |         for r, f in cp.most_common(20):
1213 |             print '\t %s\t %d' % (r, f)
1214 | 
1215 | 
1216 | def compute_pmi():
1217 |     PMI.compute()
1218 | 
1219 | 
1220 | def query_pmi():
1221 |     pmi = PMI()
1222 |     print 'Entering interactive python shell'
1223 |     print 'Query pmi with `pmi.query(frame, nounpair)`'
1224 |     print 'Example:'
1225 |     print ">>> pmi.query('looked_op_as', ('children', 'friend'))"
1226 |     print pmi.query('looked_op_as', ('children', 'friend'))
1227 |     code.interact(local=dict(globals(), **locals()))
1228 | 
1229 | 
1230 | def main():
1231 |     """
1232 |     NOTE: To any reader of this code, apologies for the hacked-together
1233 |     command line parsing. I should have just used `argparse`. If you're seeing
1234 |     this and care, open an issue and send a PR and we'll make this better :-)
1235 |     """
1236 |     # sanity checking
1237 |     if len(sys.argv) < 2:
1238 |         print 'USAGE: python data.py --command [args]'
1239 |         print 'Possible commands:'
1240 |         print '\t', '--explore \t\t run interactive exploration'
1241 |         print '\t', '--ping \t\t\t ping the Myria DB'
1242 |         print '\t', '--check-cache \t\t check local cache for soundness'
1243 |         print '\t', '--print-cache-stats \t print (verb) cache stats'
1244 |         print '\t', '--interact-verb-info \t print cached verb info (interact)'
1245 |         print '\t', '--query-pmi \t\t query precomputed PMI results'
1246 |         print '\t', '--compute-pmi \t\t compute PMI over ngramdb w/ turked verbs'
1247 |         print '\t', '--print-verb-info <verb> [<verb> ...] \t print cached verb info (verb(s) provided)'
1248 |         print '\t', '--print-top-preps-interact \t print cached info on top preps for verbs (interact)'
1249 |         print '\t', '--verb <verb> \t\t run query and cache results for <verb>'
1250 |         print '\t', '--file <file> \t\t run queries for all words in <file>'
1251 |         return 1
1252 | 
1253 |     if sys.argv[1] == '--explore':
1254 |         explore()
1255 |     elif sys.argv[1] == '--ping':
1256 |         ping()
1257 |     elif sys.argv[1] == '--check-cache':
1258 |         check_cache()
1259 |     elif sys.argv[1] == '--print-cache-stats':
1260 |         print_cache_stats()
1261 |     elif sys.argv[1] == '--interact-verb-info':
1262 |         interact_verb_info()
1263 |     elif sys.argv[1] == '--compute-pmi':
1264 |         compute_pmi()
1265 |     elif sys.argv[1] == '--query-pmi':
1266 |         query_pmi()
1267 |     elif sys.argv[1] == '--print-top-preps-interact':
1268 |         print_top_preps_interact()
1269 |     elif sys.argv[1] == '--print-verb-info':
1270 |         if len(sys.argv) < 3:
1271 |             print 'ERRR: Command "--print-verb-info" requires at least one verb'
1272 |             return 1
1273 |         print_verb_info(sys.argv[2:])
1274 |     elif sys.argv[1] == '--verb':
1275 |         if len(sys.argv) < 3:
1276 |             print 'ERRR: Command "--verb" requires a verb'
1277 |             return 1
1278 |         run_for_verb(sys.argv[2])
1279 |     elif sys.argv[1] == '--file':
1280 |         if len(sys.argv) < 3:
1281 |             print 'ERRR: Command "--file" requires a filename'
1282 |             return 1
1283 |         run_for_file(sys.argv[2])
1284 |     else:
1285 |         print 'ERRR: Command "%s" unrecognized' % (sys.argv[1])
1286 |         return 1
1287 | 
1288 |     return 0
1289 | 
1290 | 
1291 | if __name__ == '__main__':
1292 |     sys.exit(main())
1293 | 
--------------------------------------------------------------------------------