├── .floydexpt ├── .floydignore ├── .gitignore ├── .idea ├── misc.xml ├── modules.xml └── vcs.xml ├── LICENSE ├── Pipfile ├── Pipfile.lock ├── README.md ├── bin ├── floyd-run.sh └── start_neo4j_locally.sh ├── config ├── .gitignore ├── __init__.py ├── environment.py └── overrides.py ├── data_sets └── synthetic_review_prediction │ ├── article_0 │ ├── __init__.py │ ├── configure.py │ └── generate.py │ └── utils │ └── dataset_writer.py ├── experiment ├── __init__.py ├── arguments.py ├── directory.py ├── experiment.py └── experiment_header.py ├── floyd_requirements.txt ├── graph_ml ├── __init__.py ├── adjacency_layer.py ├── dataset.py ├── dataset_helpers.py ├── model.py ├── ntm.py ├── path.py ├── train.py └── util.py ├── output └── .gitignore ├── test.sh ├── test ├── __init__.py └── test_memory_cell.py └── train.py /.floydexpt: -------------------------------------------------------------------------------- 1 | {"family_id": "XaCDPUiGtasLwxhbKi4y7S", "name": "graph-investigations"} -------------------------------------------------------------------------------- /.floydignore: -------------------------------------------------------------------------------- 1 | 2 | # Directories and files to ignore when uploading code to floyd 3 | 4 | .git 5 | .eggs 6 | eggs 7 | lib 8 | lib64 9 | parts 10 | sdist 11 | var 12 | *.pyc 13 | *.swp 14 | .DS_Store 15 | data 16 | output 17 | log -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *~ 6 | # C extensions 7 | *.so 8 | .DS_Store 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | data/ 29 | output/ 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Octavian-ai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | 3 | url = "https://pypi.python.org/simple" 4 | verify_ssl = true 5 | name = "pypi" 6 | 7 | 8 | [packages] 9 | 10 | "neo4j-driver" = "*" 11 | tensorflow = "*" 12 | keras = "*" 13 | numpy = "*" 14 | lazy = "*" 15 | "h5py" = "*" 16 | colorama = "*" 17 | coloredlogs = "*" 18 | more-itertools = "*" 19 | recurrentshop = {git = "https://github.com/datalogai/recurrentshop.git"} 20 | generate-data = {git = "https://github.com/Octavian-ai/generate-data.git"} 21 | colored-traceback = "*" 22 | sklearn = "*" 23 | tqdm = "*" 24 | floyd-cli = "*" 25 | 26 | 27 | [dev-packages] 28 | 29 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "abe2e1e33a7a78d6c130b15bb5444b6c11496bc59b24304e35c505c2081a253b" 5 | }, 6 | "host-environment-markers": { 7 | "implementation_name": "cpython", 8 | "implementation_version": "3.6.2", 9 | "os_name": "posix", 10 | "platform_machine": "x86_64", 11 | "platform_python_implementation": "CPython", 12 | "platform_release": "17.3.0", 13 | "platform_system": "Darwin", 14 | "platform_version": "Darwin Kernel Version 17.3.0: Thu Nov 9 18:09:22 PST 2017; root:xnu-4570.31.3~1/RELEASE_X86_64", 15 | "python_full_version": "3.6.2", 16 | "python_version": "3.6", 17 | "sys_platform": "darwin" 18 | }, 19 | "pipfile-spec": 6, 20 | "requires": {}, 21 | "sources": [ 22 | { 23 | "name": "pypi", 24 | "url": "https://pypi.python.org/simple", 25 | "verify_ssl": true 26 | } 27 | ] 28 | }, 29 | "default": { 30 | "args": { 31 | "hashes": [ 32 | "sha256:a785b8d837625e9b61c39108532d95b85274acd679693b71ebb5156848fcf814" 33 | ], 34 | "version": "==0.1.0" 35 | }, 36 | "backports.weakref": { 37 | "hashes": [ 38 | "sha256:81bc9b51c0abc58edc76aefbbc68c62a787918ffe943a37947e162c3f8e19e82", 39 | "sha256:bc4170a29915f8b22c9e7c4939701859650f2eb84184aee80da329ac0b9825c2" 40 | ], 41 | "version": "==1.0.post1" 42 | }, 43 | "bleach": { 44 | "hashes": [ 45 | "sha256:e67f46adcec78dbc3c04462f3aba3213a673d5652eba2609ed1ef15492a44b8d", 46 | "sha256:978e758599b54cd3caa2e160d74102879b230ea8dc93871d0783721eef58bc65" 47 | ], 48 | "version": "==1.5.0" 49 | }, 50 | "certifi": { 51 | "hashes": [ 52 | "sha256:14131608ad2fd56836d33a71ee60fa1c82bc9d2c8d98b7bdbc631fe1b3cd1296", 53 | "sha256:edbc3f203427eef571f79a7692bb160a2b0f7ccaa31953e99bd17e307cf63f7d" 54 | ], 55 | "version": "==2018.1.18" 56 | }, 57 | "chardet": { 58 | "hashes": [ 59 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691", 60 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae" 61 | ], 62 | "version": "==3.0.4" 63 | }, 64 | "click": { 65 | "hashes": [ 66 | "sha256:29f99fc6125fbc931b758dc053b3114e55c77a6e4c6c3a2674a2dc986016381d", 67 | "sha256:f15516df478d5a56180fbf80e68f206010e6d160fc39fa508b65e035fd75130b" 68 | ], 69 | "version": "==6.7" 70 | }, 71 | "clint": { 72 | "hashes": [ 73 | "sha256:05224c32b1075563d0b16d0015faaf9da43aa214e4a2140e51f08789e7a4c5aa" 74 | ], 75 | "version": "==0.5.1" 76 | }, 77 | "colorama": { 78 | "hashes": [ 79 | "sha256:463f8483208e921368c9f306094eb6f725c6ca42b0f97e313cb5d5512459feda", 80 | "sha256:48eb22f4f8461b1df5734a074b57042430fb06e1d61bd1e11b078c0fe6d7a1f1" 81 | ], 82 | "version": "==0.3.9" 83 | }, 84 | "colored-traceback": 
{ 85 | "hashes": [ 86 | "sha256:f76c21a4b4c72e9e09763d4d1b234afc469c88693152a763ad6786467ef9e79f", 87 | "sha256:6da7ce2b1da869f6bb54c927b415b95727c4bb6d9a84c4615ea77d9872911b05" 88 | ], 89 | "version": "==0.3.0" 90 | }, 91 | "coloredlogs": { 92 | "hashes": [ 93 | "sha256:6bd7ceac109c3f2e138db8578396664b1067f32aca55d3280a57dbf05f1ada6c", 94 | "sha256:e3b19320bd21bde506444601a71397cf5215f040df06503013697c6261b05de9" 95 | ], 96 | "version": "==9.0" 97 | }, 98 | "contextlib2": { 99 | "hashes": [ 100 | "sha256:f5260a6e679d2ff42ec91ec5252f4eeffdcf21053db9113bd0a8e4d953769c00", 101 | "sha256:509f9419ee91cdd00ba34443217d5ca51f5a364a404e1dce9e8979cea969ca48" 102 | ], 103 | "markers": "python_version < '3.2'", 104 | "version": "==0.5.5" 105 | }, 106 | "enum34": { 107 | "hashes": [ 108 | "sha256:6bd0f6ad48ec2aa117d3d141940d484deccda84d4fcd884f5c3d93c23ecd8c79", 109 | "sha256:644837f692e5f550741432dd3f223bbb9852018674981b1664e5dc339387588a", 110 | "sha256:8ad8c4783bf61ded74527bffb48ed9b54166685e4230386a9ed9b1279e2df5b1", 111 | "sha256:2d81cbbe0e73112bdfe6ef8576f2238f2ba27dd0d55752a776c41d38b7da2850" 112 | ], 113 | "version": "==1.1.6" 114 | }, 115 | "floyd-cli": { 116 | "hashes": [ 117 | "sha256:0ecd7d42b91ab88b4e3e852f37c22f8ede849de96e5c0f3b9c20e4bd6fad5bbc" 118 | ], 119 | "version": "==0.10.31" 120 | }, 121 | "funcsigs": { 122 | "hashes": [ 123 | "sha256:330cc27ccbf7f1e992e69fef78261dc7c6569012cf397db8d3de0234e6c937ca", 124 | "sha256:a7bb0f2cf3a3fd1ab2732cb49eba4252c2af4240442415b4abce3b87022a8f50" 125 | ], 126 | "markers": "python_version < '3.3'", 127 | "version": "==1.0.2" 128 | }, 129 | "futures": { 130 | "hashes": [ 131 | "sha256:c4884a65654a7c45435063e14ae85280eb1f111d94e542396717ba9828c4337f", 132 | "sha256:51ecb45f0add83c806c68e4b06106f90db260585b25ef2abfcda0bd95c0132fd" 133 | ], 134 | "markers": "python_version < '3.2'", 135 | "version": "==3.1.1" 136 | }, 137 | "generate-data": { 138 | "git": "https://github.com/Octavian-ai/generate-data.git" 139 | }, 140 | "h5py": { 141 | "hashes": [ 142 | "sha256:562045c57a2e47aca9c716ac8cd64448d4897c0f5fe456ab5a34b17c8b3907cb", 143 | "sha256:e1bfcfa2c425dc0f637d4edd858b94e400bbb5746dba324ace124d55fc21d3df", 144 | "sha256:9e0537058efea7547d976f9c00067f7193727bb41ce6b4733c52de35beaa46f5", 145 | "sha256:9d9fb861e10735c5c710fe18f34c69e470cf161a4ba38717b7dde21de2d33760", 146 | "sha256:2d137a1b2f529e58886b5865f6dec51cd96ea0671dd84cebc6dba5cd8c7d0a75", 147 | "sha256:2ccb4f405059314829ebad1859d2c68e133a9d13ca7c3cc7a298a76a438fd09c", 148 | "sha256:52204972a02032d6a427addd37a24a22a2b97d4bce0850c84a6995db9c91926c", 149 | "sha256:1be9cd57e74b24f836d0d2c34ae376ff2df704f40aa8815aa9113b5a860d467f", 150 | "sha256:2258fca3533a3276fd86e9196326786f408a95748ac707c010fff265edf60342", 151 | "sha256:66609c48f8841357ced4291b7c9009518bb6e6fec449d91eb46aa417b6f5f4cf", 152 | "sha256:4a6e6cd8668fa453864f4f9e243460dcc2d41e79d14516b84f4ba74ebcc5b222", 153 | "sha256:a314e5e98037ece52ad0b88b4e0d788ca554935268f3e9d293ca9bcd18611b42", 154 | "sha256:478efa37b84a56061af5fcd286678331e873e216f6c5987cd31f9666edc2f157", 155 | "sha256:2b91c9117f2e7a2ef924bec41ac77e57567bec6731773373bf78eb4387b39a2a", 156 | "sha256:07ddea6bb649a257fc57ccae359a36d691b2ef8b9617971ae7d6f74ef6f67cad", 157 | "sha256:bb990d8663dbeee22ce44135ffd65ab38bd23d6a689722a653cfbf2d18d46688", 158 | "sha256:e78f09a44fc9256b84c9df98edf7b6ead3b3da2e12bf2d1e00384960a6a78a1a", 159 | "sha256:40dd37cbf24ca3b935a8d6eb8960ec5d0381219f82317bdc40aa9e08b3fcc143", 160 | 
"sha256:1fad9aa32835230de77b31edd6980b7c202de7bb7d8384d1bcb47b5dd32c8c7c", 161 | "sha256:537a60879485e5ce484ab4350c7bd8b3da4b531f9f82ef0a18780beabde98c90", 162 | "sha256:c050791989cd9979fe57a770d4e323b2e67ef95800e89e7dc6ad3652b8ccd86f", 163 | "sha256:b7e1c42367513108c3615cf1a24a9d366fd93eb9d2d92085bafb3011b785e8a9", 164 | "sha256:180a688311e826ff6ae6d3bda9b5c292b90b28787525ddfcb10a29d5ddcae2cc" 165 | ], 166 | "version": "==2.7.1" 167 | }, 168 | "html5lib": { 169 | "hashes": [ 170 | "sha256:2612a191a8d5842bfa057e41ba50bbb9dcb722419d2408c78cff4758d0754868" 171 | ], 172 | "version": "==0.9999999" 173 | }, 174 | "humanfriendly": { 175 | "hashes": [ 176 | "sha256:587b16ce804bec8e3cbb8c420decea051b38e3d895272b2c1e38fc69b4286b1c", 177 | "sha256:d0e74171b87318a94b99520e4f0c5651e944b5f11d696c46be3330bb82b85300" 178 | ], 179 | "version": "==4.8" 180 | }, 181 | "idna": { 182 | "hashes": [ 183 | "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4", 184 | "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f" 185 | ], 186 | "version": "==2.6" 187 | }, 188 | "keras": { 189 | "hashes": [ 190 | "sha256:7b1116bad7fb497758cfaffcd180e9adc2904be7deec2d9164543955e9973d0b", 191 | "sha256:7ca3a381523bad40a6922e88951a316664cb088fd01cea07e5ec8ada3327e3c7" 192 | ], 193 | "version": "==2.1.3" 194 | }, 195 | "lazy": { 196 | "hashes": [ 197 | "sha256:c80a77bf7106ba7b27378759900cfefef38271088dc63b014bcfe610c8e68e3d" 198 | ], 199 | "version": "==1.3" 200 | }, 201 | "markdown": { 202 | "hashes": [ 203 | "sha256:9ba587db9daee7ec761cfc656272be6aabe2ed300fece21208e4aab2e457bc8f", 204 | "sha256:a856869c7ff079ad84a3e19cd87a64998350c2b94e9e08e44270faef33400f81" 205 | ], 206 | "version": "==2.6.11" 207 | }, 208 | "marshmallow": { 209 | "hashes": [ 210 | "sha256:8740ada95f47fa19f905772aa4932dc5512226a90c30da5672d6d6bf3dd791a7", 211 | "sha256:d3f31fe7be2106b1d783cbd0765ef4e1c6615505514695f33082805f929dd584" 212 | ], 213 | "version": "==2.15.0" 214 | }, 215 | "mock": { 216 | "hashes": [ 217 | "sha256:5ce3c71c5545b472da17b72268978914d0252980348636840bd34a00b5cc96c1", 218 | "sha256:b158b6df76edd239b8208d481dc46b6afd45a846b7812ff0ce58971cf5bc8bba" 219 | ], 220 | "version": "==2.0.0" 221 | }, 222 | "monotonic": { 223 | "hashes": [ 224 | "sha256:0bcd2b14e3b7ee7cfde796e408176ceffa01d89646f2e532964ef2aae0c9fa3e", 225 | "sha256:a02611d5b518cd4051bf22d21bd0ae55b3a03f2d2993a19b6c90d9d168691f84" 226 | ], 227 | "markers": "python_version == '2.6' or python_version == '2.7' or python_version == '3.0' or python_version == '3.1' or python_version == '3.2'", 228 | "version": "==1.4" 229 | }, 230 | "more-itertools": { 231 | "hashes": [ 232 | "sha256:11a625025954c20145b37ff6309cd54e39ca94f72f6bb9576d1195db6fa2442e", 233 | "sha256:0dd8f72eeab0d2c3bd489025bb2f6a1b8342f9b198f6fc37b52d15cfa4531fea", 234 | "sha256:c9ce7eccdcb901a2c75d326ea134e0886abfbea5f93e91cc95de9507c0816c44" 235 | ], 236 | "version": "==4.1.0" 237 | }, 238 | "neo4j-driver": { 239 | "hashes": [ 240 | "sha256:a25c9b67e63403b6ca8114d18bee581d2cff032cdc89c68970a4be8cd30585d0" 241 | ], 242 | "version": "==1.5.3" 243 | }, 244 | "numpy": { 245 | "hashes": [ 246 | "sha256:428cd3c0b197cf857671353d8c85833193921af9fafcc169a1f29c7185833d50", 247 | "sha256:a476e437d73e5754aa66e1e75840d0163119c3911b7361f4cd06985212a3c3fb", 248 | "sha256:289ff717138cd9aa133adcbd3c3e284458b9c8230db4d42b39083a3407370317", 249 | "sha256:c5eccb4bf96dbb2436c61bb3c2658139e779679b6ae0d04c5e268e6608b58053", 250 | 
"sha256:75471acf298d455b035226cc609a92aee42c4bb6aa71def85f77fa2c2b646b61", 251 | "sha256:5c54fb98ecf42da59ed93736d1c071842482b18657eb16ba6e466bd873e1b923", 252 | "sha256:9ddf384ac3aacb72e122a8207775cc29727cbd9c531ee1a4b95754f24f42f7f3", 253 | "sha256:781d3197da49c421a07f250750de70a52c42af08ca02a2f7bdb571c0625ae7eb", 254 | "sha256:93b26d6c06a22e64d56aaca32aaaffd27a4143db0ac2f21a048f0b571f2bfc55", 255 | "sha256:b2547f57d05ba59df4289493254f29f4c9082d255f1f97b7e286f40f453e33a1", 256 | "sha256:eef6af1c752eef538a96018ef9bdf8e37bbf28aab50a1436501a4aa47a6467df", 257 | "sha256:ff8a4b2c3ac831964f529a2da506c28d002562b230261ae5c16885f5f53d2e75", 258 | "sha256:194074058c22a4066e1b6a4ea432486ee468d24ab16f13630c1030409e6b8666", 259 | "sha256:4e13f1a848fde960dea33702770265837c72b796a6a3eaac7528cfe75ddefadd", 260 | "sha256:91101216d72749df63968d86611b549438fb18af2c63849c01f9a897516133c7", 261 | "sha256:97507349abb7d1f6b76b877258defe8720833881dc7e7fd052bac90c88587387", 262 | "sha256:1479b46b6040b5c689831496354c8859c456b152d37315673a0c18720b41223b", 263 | "sha256:98b1ac79c160e36093d7914244e40ee1e7164223e795aa2c71dcce367554e646", 264 | "sha256:24bbec9a199f938eab75de8390f410969bc33c218e5430fa1ae9401b00865255", 265 | "sha256:7880f412543e96548374a4bb1d75e4cdb8cad80f3a101ed0f8d0e0428f719c1c", 266 | "sha256:6112f152b76a28c450bbf665da11757078a724a90330112f5b7ea2d6b6cefd67", 267 | "sha256:7c5276763646480143d5f3a6c2acb2885460c765051a1baf4d5070f63d05010f", 268 | "sha256:3de643935b212307b420248018323a44ec51987a336d1d747c1322afc3c099fb" 269 | ], 270 | "version": "==1.14.0" 271 | }, 272 | "pathlib2": { 273 | "hashes": [ 274 | "sha256:db3e43032d23787d3e9aec8c7ef1e0d2c3c589d5f303477661ebda2ca6d4bfba", 275 | "sha256:d32550b75a818b289bd4c1f96b60c89957811da205afcceab75bc8b4857ea5b3" 276 | ], 277 | "version": "==2.3.0" 278 | }, 279 | "pbr": { 280 | "hashes": [ 281 | "sha256:60c25b7dfd054ef9bb0ae327af949dd4676aa09ac3a9471cdc871d8a9213f9ac", 282 | "sha256:05f61c71aaefc02d8e37c0a3eeb9815ff526ea28b3b76324769e6158d7f95be1" 283 | ], 284 | "version": "==3.1.1" 285 | }, 286 | "protobuf": { 287 | "hashes": [ 288 | "sha256:11788df3e176f44e0375fe6361342d7258a457b346504ea259a21b77ffc18a90", 289 | "sha256:50c24f0d00b7efb3a72ae638ddc118e713cfe8cef40527afe24f7ebcb878e46d", 290 | "sha256:41661f9a442eba2f1967f15333ebe9ecc7e7c51bcbaa2972303ad33a4ca0168e", 291 | "sha256:06ec363b74bceb7d018f2171e0892f03ab6816530e2b0f77d725a58264551e48", 292 | "sha256:b20f861b55efd8206428c13e017cc8e2c34b40b2a714446eb202bbf0ff7597a6", 293 | "sha256:c1f9c36004a7ae6f1ce4a23f06070f6b07f57495f251851aa15cc4da16d08378", 294 | "sha256:4d2e665410b0a278d2eb2c0a529ca2366bb325eb2ae34e189a826b71fb1b28cd", 295 | "sha256:95b78959572de7d7fafa3acb718ed71f482932ddddddbd29ba8319c10639d863" 296 | ], 297 | "version": "==3.5.1" 298 | }, 299 | "pygments": { 300 | "hashes": [ 301 | "sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d", 302 | "sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc" 303 | ], 304 | "version": "==2.2.0" 305 | }, 306 | "pytz": { 307 | "hashes": [ 308 | "sha256:80af0f3008046b9975242012a985f04c5df1f01eed4ec1633d56cc47a75a6a48", 309 | "sha256:feb2365914948b8620347784b6b6da356f31c9d03560259070b2f30cff3d469d", 310 | "sha256:59707844a9825589878236ff2f4e0dc9958511b7ffaae94dc615da07d4a68d33", 311 | "sha256:d0ef5ef55ed3d37854320d4926b04a4cb42a2e88f71da9ddfdacfde8e364f027", 312 | "sha256:c41c62827ce9cafacd6f2f7018e4f83a6f1986e87bfd000b8cfbd4ab5da95f1a", 313 | "sha256:8cc90340159b5d7ced6f2ba77694d946fc975b09f1a51d93f3ce3bb399396f94", 
314 | "sha256:dd2e4ca6ce3785c8dd342d1853dd9052b19290d5bf66060846e5dc6b8d6667f7", 315 | "sha256:699d18a2a56f19ee5698ab1123bbcc1d269d061996aeb1eda6d89248d3542b82", 316 | "sha256:fae4cffc040921b8a2d60c6cf0b5d662c1190fe54d718271db4eb17d44a185b7" 317 | ], 318 | "version": "==2017.3" 319 | }, 320 | "pyyaml": { 321 | "hashes": [ 322 | "sha256:3262c96a1ca437e7e4763e2843746588a965426550f3797a79fca9c6199c431f", 323 | "sha256:16b20e970597e051997d90dc2cddc713a2876c47e3d92d59ee198700c5427736", 324 | "sha256:e863072cdf4c72eebf179342c94e6989c67185842d9997960b3e69290b2fa269", 325 | "sha256:bc6bced57f826ca7cb5125a10b23fd0f2fff3b7c4701d64c439a300ce665fff8", 326 | "sha256:c01b880ec30b5a6e6aa67b09a2fe3fb30473008c85cd6a67359a1b15ed6d83a4", 327 | "sha256:827dc04b8fa7d07c44de11fabbc888e627fa8293b695e0f99cb544fdfa1bf0d1", 328 | "sha256:592766c6303207a20efc445587778322d7f73b161bd994f227adaa341ba212ab", 329 | "sha256:5f84523c076ad14ff5e6c037fe1c89a7f73a3e04cf0377cb4d017014976433f3", 330 | "sha256:0c507b7f74b3d2dd4d1322ec8a94794927305ab4cebbe89cc47fe5e81541e6e8", 331 | "sha256:b4c423ab23291d3945ac61346feeb9a0dc4184999ede5e7c43e1ffb975130ae6", 332 | "sha256:ca233c64c6e40eaa6c66ef97058cdc80e8d0157a443655baa1b2966e812807ca", 333 | "sha256:4474f8ea030b5127225b8894d626bb66c01cda098d47a2b0d3429b6700af9fd8", 334 | "sha256:326420cbb492172dec84b0f65c80942de6cedb5233c413dd824483989c000608", 335 | "sha256:5ac82e411044fb129bae5cfbeb3ba626acb2af31a8d17d175004b70862a741a7" 336 | ], 337 | "version": "==3.12" 338 | }, 339 | "raven": { 340 | "hashes": [ 341 | "sha256:0adae40e004dfe2181d1f2883aa3d4ca1cf16dbe449ae4b445b011c6eb220a90", 342 | "sha256:84da75114739191bdf2388f296ffd6177e83567a7fbaf2701e034ad6026e4f3b" 343 | ], 344 | "version": "==6.5.0" 345 | }, 346 | "recurrentshop": { 347 | "git": "https://github.com/datalogai/recurrentshop.git" 348 | }, 349 | "requests": { 350 | "hashes": [ 351 | "sha256:6a1b267aa90cac58ac3a765d067950e7dbbf75b1da07e895d1f594193a40a38b", 352 | "sha256:9c443e7324ba5b85070c4a818ade28bfabedf16ea10206da1132edaa6dda237e" 353 | ], 354 | "version": "==2.18.4" 355 | }, 356 | "requests-toolbelt": { 357 | "hashes": [ 358 | "sha256:42c9c170abc2cacb78b8ab23ac957945c7716249206f90874651971a4acff237", 359 | "sha256:f6a531936c6fa4c6cfce1b9c10d5c4f498d16528d2a54a22ca00011205a187b5" 360 | ], 361 | "version": "==0.8.0" 362 | }, 363 | "scandir": { 364 | "hashes": [ 365 | "sha256:913d0d04f3ea8f38a52a38e930a08deacd3643d71875a0751a5c01e006102998", 366 | "sha256:eb9d4a55bbeb0473a9c7d3ff81e12d44f0ad86daff48b02a95e2398c87ff1a00", 367 | "sha256:2b28d118b372de8950f85b65d8ddfd43643f139a5b721281dd6532bed6b8321c", 368 | "sha256:f14476800cfdd6809d5130840f78ca3c08aa25544113e2b33a0b2fe914583d69", 369 | "sha256:6db5aadb667bb709cc23921203e9c27f08225506a9b84b7ebe2b645dee47a4dd", 370 | "sha256:8129fe7b9211d080457e0ff87397d85bb9be6ebb482b6be6ad9700059ac2e516", 371 | "sha256:8fe782abf9314f2733c09d2191c1b3047475218ddbae90052b5c0f1a4215d5e2", 372 | "sha256:a93b6cc872eeccdc91b4c1c1e510820bee17f79c9455064fb8d3b73b51e52024", 373 | "sha256:9851e782da220073093da68b3451e3c33b10f84eca2aec17a24661c7c63357a2", 374 | "sha256:937d27e367af994afd3792904b794a82645ea9616dd336f5030e0b50e527eb57", 375 | "sha256:e0278a2d4bc6c0569aedbe66bf26c8ab5b2b08378b3289de49257f23ac624338" 376 | ], 377 | "markers": "python_version < '3.5'", 378 | "version": "==1.6" 379 | }, 380 | "scikit-learn": { 381 | "hashes": [ 382 | "sha256:3775cca4ce3f94508bb7c8a6b113044b78c16b0a30a5c169ddeb6b9fe57a8a72", 383 | "sha256:873245b03361710f47c5410a050dc56ee8ae97b9f8dcc6e3a81521ca2b64ad10", 
384 | "sha256:370919e3148253fd6552496c33a1e3d78290a336fc8d1b9349d9e9770fae6ec0", 385 | "sha256:ce78bf4d10bd7e28807c36c6d2ab25a9934aaf80906ad987622a5e45627d91a2", 386 | "sha256:ba3fd442ae1a46830789b3578867daaf2c8409dcca6bf192e30e85beeabbfc2f", 387 | "sha256:a21cf8217e31a9e8e32c559246e05e6909981816152406945ae2e3e244dfcc1f", 388 | "sha256:e54a3dd1fe1f8124de90b93c48d120e6da2ea8df29b6895325df01ddc1bd8e26", 389 | "sha256:f9abae483f4d52acd6f660addb1b67e35dc5748655250af479de2ea6aefc6df0", 390 | "sha256:5c9ff456d67ef9094e5ea272fff2be05d399a47fc30c6c8ed653b94bdf787bd1", 391 | "sha256:871669cdb5b3481650fe3adff46eb97c455e30ecdc307eaf382ef90d4e2570ab", 392 | "sha256:d4da369614e55540c7e830ccdd17ab4fe5412ff8e803a4906d3ece393e2e3a63", 393 | "sha256:42f3c5bd893ed73bf47ccccf04dfb98fae743f397d688bb58c2238c0e6ec15d2", 394 | "sha256:95b155ef6bf829ddfba6026f100ba8e4218b7171ecab97b2163bc9e8d206848f", 395 | "sha256:72c194c5092e921d6107a8de8a5adae58c35bbc54e030ba624b6f02fd823bb21", 396 | "sha256:f528c4b2bba652cf116f5cccf36f4db95a7f9cbfcd1ee549c4e8d0f8628783b5", 397 | "sha256:d384e6f9a055b7a43492f9d27779adb717eb5dcf78b0603b01d0f070a608d241", 398 | "sha256:ee8c3b1898c728b6e5b5659c233f547700a1fea13ce876b6fe7d3434c70cc0e0", 399 | "sha256:56cfa19c31edf62e6414da0a337efee37a4af488b135640e67238786b9be6ab3", 400 | "sha256:5db9e68a384ce80a17fc449d4d5d9b45025fe17cf468429599bf404eccb51049", 401 | "sha256:8b17fc29554c5c98d88142f895516a5bec2b6b61daa815e1193a64c868ad53d2", 402 | "sha256:13136c6e4f6b808569f7f59299d439b2cd718f85d72ea14b5b6077d44ebc7d17", 403 | "sha256:ddc1eb10138ae93c136cc4b5945d3977f302b5d693592a4731b2805a7d7f2a74", 404 | "sha256:5ca0ad32ee04abe0d4ba02c8d89d501b4e5e0304bdf4d45c2e9875a735b323a0", 405 | "sha256:6e0899953611d0c47c0d49c5950082ab016b38811fced91cd2dcc889dd94f50a", 406 | "sha256:b2a10e2f9b73de10d8486f7a23549093436062b69139158802910a0f154aa53b", 407 | "sha256:a58746d4f389ea7df1d908dba8b52f709835f91c342f459a3ade5424330c69d1", 408 | "sha256:fdc39e89bd3466befb76dfc0c258d4ccad159df974954a87de3be5759172a067" 409 | ], 410 | "version": "==0.19.1" 411 | }, 412 | "scipy": { 413 | "hashes": [ 414 | "sha256:70e6fc3f2f52c9152f05e27eb9bd8543cb862cacb71f8521a571e4ffb837f450", 415 | "sha256:08041e5336fcd57defcc78650b44b3df652eff3e3a801638d894e50494fb630d", 416 | "sha256:ff8b6637d8d2c074ed67f3d57513e62f94747c6f1210f43e60ad3d8e93a424e4", 417 | "sha256:5964dba6a3c0be226d44d2520de8fb4ba1501768bad57eec687d36d3f53b6254", 418 | "sha256:bf36f3485e7b7291c36330a93bbfd4f5e8db23bbe4ea46c37b2839fef463f4e2", 419 | "sha256:e3a5673c105eab802fdecb77f102d877352e201df9328698a265b7f57546b34b", 420 | "sha256:cd23894e1cc6eaa00e6807b6b12e4ca66d5ff092986c9c3eb01e97f24e2d6462", 421 | "sha256:23a7238279ae94e088396b8b05a9795ef598dc79c5cd1adb91ad1ff87c7514fd", 422 | "sha256:3b66d5e40152175bca75cbbfd1eb5c108c50de9ae5625923f1c4f8f51cbe2dea", 423 | "sha256:fa17be6c66985931d3a391f61a6ba97c902585cf26020aa3eb24604115732d22", 424 | "sha256:d84df0bc86bbdd49f0a6b6bad5cd62ccb02a3bfe546bf79263de44ae081bcd7b", 425 | "sha256:912499ddb521b7ac6287ac4ccf5f296a83d38996c2d04f43c9e62a91f7b420aa", 426 | "sha256:889602ead28054a15e8c26e1a6b8420d5a4fa777cfeb3ec98cfa52b9f317d153", 427 | "sha256:5774adb6047983489bc81edaa72cd132e665e5680f0b2cf8ea28cd3b99e65d39", 428 | "sha256:01c7040a83eb4e020ab729488637dcadef54cb728b035b76668ab92a72515d60", 429 | "sha256:046705c604c6f1d63cad3e89677c0618b7abb40ed09a4c241c671a2d8e5128a9", 430 | "sha256:1f58fbd59e8d9652759df0d137832ff2a325ed708c173cba20c86589d811c210", 431 | 
"sha256:424500b2fe573d30de6dea927076c01acaadb3efb3d1f40340e8cc37151ccf27", 432 | "sha256:97123a25216616723083942eb595f47fee18da6b637a88b803de5f078009003c", 433 | "sha256:a79b99b8b5af9a63312bd053bbb7bdb7710e6bbb9cc81617f9f6b9b1e49c72f8", 434 | "sha256:9bd193686fd837472bdb6425486cb234ed0a4db76b930c141cc8d095ab213c8d", 435 | "sha256:a9e479648aab5f36330da94f351ebbfe79acb4e6f5e6ac6aeddc9291eb096839", 436 | "sha256:87ea1f11a0e9ec08c264dc64551d501fa307289460705f6fccd84cbfc7926d10" 437 | ], 438 | "version": "==1.0.0" 439 | }, 440 | "six": { 441 | "hashes": [ 442 | "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb", 443 | "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9" 444 | ], 445 | "version": "==1.11.0" 446 | }, 447 | "sklearn": { 448 | "hashes": [ 449 | "sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31" 450 | ], 451 | "version": "==0.0" 452 | }, 453 | "tabulate": { 454 | "hashes": [ 455 | "sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2" 456 | ], 457 | "version": "==0.8.2" 458 | }, 459 | "tensorflow": { 460 | "hashes": [ 461 | "sha256:f9c03acc5d26ac903e177fb904ceb797632830c5a0fae5c8b49d688a748337db", 462 | "sha256:c6d798da0002778f38e3b097acd7a620c89ff060fa3823c054113885b2472173", 463 | "sha256:975cbdeb016c3f14ad44f4919260e279918fba08c4bb3d7172ae4bf1aa612292", 464 | "sha256:62e3884a1d7824f20a172ae2861aab50b1802989e85a971f9dfaf61444226856", 465 | "sha256:9e6681a4b1e46936dbcc56ac213f61633979f6f348319658431181ffc3c1936c", 466 | "sha256:e43641ac5bbfc8a0d37fb8b78657f664856fe83b1ab7acf298f57780e6fbf2de", 467 | "sha256:cceb8439975ea508ffd19a312d7ff83149ab81d7e8a88685852bbea4ded98736", 468 | "sha256:bf51429bc11ab4561b5d124c08a5ee6476519d33b5970338586767563a02adc4", 469 | "sha256:ee96a38a3ba3c53e1cdd8cc2af59d5f378b7992e63c54fba9605c963b209e814", 470 | "sha256:233d66bfad2287c61434384ec315bbf37b2f551beda2e0d37a8c24a0f2ed3896" 471 | ], 472 | "version": "==1.4.1" 473 | }, 474 | "tensorflow-tensorboard": { 475 | "hashes": [ 476 | "sha256:4ff1c16faa8189c921b57ccb5f05ea1e19c276d59de7dcae3d846a6267a132d0", 477 | "sha256:6684571c711e07b3aae25dd91cb4b106738d71acfce385b9d359ab14374ac518" 478 | ], 479 | "version": "==0.4.0" 480 | }, 481 | "tqdm": { 482 | "hashes": [ 483 | "sha256:4c041f8019f7be65b8028ddde9a836f7ccc51c4637f1ff2ba9b5813d38d19d5a", 484 | "sha256:df32e6f127dc0ccbc675eadb33f749abbcb8f174c5cb9ec49c0cdb73aa737377" 485 | ], 486 | "version": "==4.19.5" 487 | }, 488 | "urllib3": { 489 | "hashes": [ 490 | "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b", 491 | "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f" 492 | ], 493 | "version": "==1.22" 494 | }, 495 | "werkzeug": { 496 | "hashes": [ 497 | "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b", 498 | "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c" 499 | ], 500 | "version": "==0.14.1" 501 | }, 502 | "wheel": { 503 | "hashes": [ 504 | "sha256:e721e53864f084f956f40f96124a74da0631ac13fbbd1ba99e8e2b5e9cafdf64", 505 | "sha256:9515fe0a94e823fd90b08d22de45d7bde57c90edce705b22f5e1ecf7e1b653c8" 506 | ], 507 | "version": "==0.30.0" 508 | } 509 | }, 510 | "develop": {} 511 | } 512 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Join our Discord >> https://discord.gg/a2Z82Te 2 | 3 | # Review prediction 4 | 5 | ## Introduction 6 | 7 | The aim of this 
experiment is to investigate the performance of
8 | 1) different NN approaches
9 | 2) different graph representations of the same data
10 | 
11 | on a simple synthetic prediction task.
12 | 
13 | ## The Task
14 | 
15 | We model personalised recommendations as a system containing _people_, _products_ and _reviews_. In our system every product has a _style_ and each person has a _style preference_. _People_ can write _reviews_ of products. In our system the _review score_ will be a function _Y(...)_ of the person's _style preference_ and the product's _style_. We call this function the _opinion function_ i.e.:
16 | 
17 | _review_score_ = _Y(product_style, person_style_preference)_
18 | 
19 | We will generate data using this model. We will then use this synthetic data to investigate how effective various ML approaches are at learning the behaviour of this system from the data set.
20 | 
21 | 
22 | If necessary we can change the opinion function _Y(...)_ to increase or decrease the difficulty of the task.
23 | 
24 | ## The Synthetic Data
25 | 
26 | The synthetic data for this task can be varied in several ways:
27 | 
28 | 1) Change which information is hidden e.g. we could hide _product_style_, _style_preference_ or both.
29 | 1) Change the representation of the key properties e.g. reviews/styles and preferences could be boolean, categorical, continuous scalars or even multi-dimensional vectors.
30 | 1) Change how the data is represented as a graph e.g. reviews could be nodes in their own right, or they could be edges with properties; product_style could be a property on a product node, or a separate node connected to a product node by a _HAS_STYLE_ relationship (edge).
31 | 1) Add additional meaningless or semi-meaningless information to the training data.
32 | 
33 | We will generate different data sets to qualitatively investigate different ML approaches on the same basic system.
34 | 
35 | 
36 | ## Evaluation Tasks
37 | 
38 | We are interested in four different evaluation tasks depending on whether the person or product is included in the training set or not:
39 | 
40 | - **new product == unknown** at training time i.e. not in training set or validation set
41 | - **new person == unknown** at training time i.e. not in training set or validation set
42 | - **existing product == known** at training time i.e. present in training set
43 | - **existing person == known** at training time i.e. present in training set
44 | 
45 | The evaluation tasks we are interested in are: how well can you predict the person's review, given:
46 | 
47 | 1) new product and new person
48 | 1) existing product and new person
49 | 1) new product and existing person
50 | 1) existing product and existing person
51 | 
52 | 
53 | ## Approach
54 | 
55 | Although we have a synthetic system for which we can generate more data, we want to get into good habits for working with "real" data. So we will attempt to blind the ML system to the fact that we are working with synthetic data, and not rely on our ability to generate more information at will.
56 | 
57 | It will be the responsibility of the ML part of the system to split the data into Test / Train and Validation sets. However, for each data set that we generate we will keep back a small portion to make up a "golden" test set which is only to be used at the very end of our investigation. This is to perform a final test of the ML predictor, one for which we haven't had the opportunity to optimise the meta-parameters.
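As a rough illustration of that split discipline (a sketch only, not code from this repository — the function name, the fractions and the `review_rows` input are placeholders): the golden portion is held back first and never shown to the ML code, which then makes its own train/validation/test split.

```python
# Illustrative sketch: hold back a "golden" slice at generation time,
# then let the ML pipeline split the remainder into train/validation/test.
from sklearn.model_selection import train_test_split

def split_reviews(review_rows, golden_fraction=0.05, seed=42):
    # Held back when the data set is generated; only touched at the very end.
    working, golden = train_test_split(review_rows, test_size=golden_fraction, random_state=seed)

    # The ML side only ever sees `working` and is responsible for this split.
    train, rest = train_test_split(working, test_size=0.3, random_state=seed)
    validation, test = train_test_split(rest, test_size=0.5, random_state=seed)

    return train, validation, test, golden
```

In practice we hold back more than one golden set, as described next.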
58 | 
59 | Because of the four different evaluation tasks it will be necessary for us to keep back four different golden test sets, each of a large enough size to test the system regardless of the test/training split. We will keep the following volumes of golden test data:
60 | 
61 | 1) INDEPENDENT: a completely independent data set containing 1000 reviews
62 | 2) NEW_PEOPLE: new people + their reviews of existing products, containing approx 2000 reviews
63 | 3) NEW_PRODUCTS: new products + reviews of them by existing people, containing approx 2000 reviews
64 | 4) EXISTING: 2000 additional reviews between existing people and products.
65 | 
66 | 
67 | 
68 | # The Data Sets
69 | 
70 | ## Data Set 1: A simple binary preference system
71 | 
72 | Products have a binary style and people have a binary preference.
73 | 
74 | - All variables will be 'public' in the data set
75 | 
76 | 
77 | ### Product Style
78 | - _product_style_ will be categorical with two mutually exclusive elements (A and B).
79 | - The distribution of product styles will be uniform i.e. approx 50% of products will have style A and 50% will have style B.
80 | 
81 | 
82 | ### Style Preference
83 | - _person_style_preference_ will be categorical with two mutually exclusive elements (likes_A_dislikes_B | likes_B_dislikes_A).
84 | - The distribution of style preferences will be uniform i.e. approx 50% of people will like style A and 50% will like style B.
85 | 
86 | 
87 | ### Reviews and Opinion Function
88 | - _review_score_ will be boolean (1 for a positive review and 0 for a negative review)
89 | - Each person will have made either 1 or 2 reviews. The mean number of reviews-per-person will be approx 1.5 i.e. approx 50% will have made 2 reviews and 50% will have made 1 review.
90 | - _review_score_ is the dot product of the _product_style_ and _person_style_preference_, normalised to the range of 0 to 1
91 | 
92 | Note: having people with 0 reviews would be useless since you cannot train or validate/test using them.
93 | 
94 | Note: fixing the number of reviews-per-person would restrict the graph structure too much and open up the problem to approaches that we aren't interested in right now.
95 | 
96 | 
97 | ### Entity Ratios and Data Set Size
98 | 
99 | I basically made these up. Intuitively, the reviews-per-product and reviews-per-person parameters affect how much we can infer about the hidden variables of people/products. I like the idea of those figures being very different so we can see how systems cope with that distinction.
100 | 
101 | - _people_:_products_ = 50:1
102 | - _people_:_reviews_ = 1:1.5
103 | - _reviews_:_products_ = 75:1
104 | 
105 | Data set size: 12000 reviews / 160 products / 8000 people
106 | 
107 | n.b. because we assign the reviews randomly some products may end up with no reviews, but this is relatively unlikely.
108 | 
109 | ### Graph Schema
110 | 
111 | PERSON(id: , style_preference: A|B, is_golden: True|False) -- WROTE(is_golden: True|False) --> REVIEW(id: , score: 1|0, is_golden: True|False) -- OF(is_golden: True|False) --> PRODUCT(id: , style: A|B, is_golden: True|False)
112 | 
113 | ### Data generation algorithm
114 | 
115 | 1) Instantiate all products for the public data set and write them to Neo, keeping an array of their ids.
116 | 1) Iteratively instantiate people, decide how many reviews that person will have made (probabilistically) 117 | 1) For each review that the person has to make randomly choose a product to review (without replacement) 118 | 1) Calculate the review score and submit the Person + their reviews to Neo 119 | 1) Read the data back out of neo and validate the entity ratios 120 | 1) Create the golden test sets: 121 | - NEW_PEOPLE: create 2000/reviews_per_person new people + their reviews of randomly selected (with replacement) existing products. 122 | - NEW_PRODUCTS: create 2000/reviews_per_product new products, have randomly selected (with replacement) people review them. 123 | - EXISTING randomly pick 2000 people (with replacement) have each of them review a randomly selected (with replacement) product 124 | - INDEPENDENT is easy, but best to leave till last to avoid confusion - just repeat the basic data generation from scratch 125 | 126 | 127 | -------------------------------------------------------------------------------- /bin/floyd-run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | floyd run \ 4 | --data davidmack/datasets/graph_experiments/1:/data \ 5 | --env tensorflow-1.4 \ 6 | --gpu \ 7 | --tensorboard \ 8 | --message "adj dense with dropout" \ 9 | "ENVIRONMENT=floyd python train.py \ 10 | --output-dir /output \ 11 | --data-dir /data/ \ 12 | --epochs 100" -------------------------------------------------------------------------------- /bin/start_neo4j_locally.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONTAINER_ID=$(docker run -d -e NEO4J_dbms_memory_heap_max__size=2000m --publish=7474:7474 --publish=7687:7687 --volume=$(pwd)/data/neo4j:/data neo4j:3.2.7) 4 | sleep 10 5 | docker run -it --net host neo4j:3.2.7 bin/cypher-shell -u neo4j -p neo4j "CALL dbms.changePassword('local neo hates security!')" 6 | 7 | echo "Neo4j running locally. To stop it: docker kill ${CONTAINER_ID}" -------------------------------------------------------------------------------- /config/.gitignore: -------------------------------------------------------------------------------- 1 | local* -------------------------------------------------------------------------------- /config/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from .environment import Environment 3 | 4 | # There are too many layers and too many files to this config system 5 | default_values = { 6 | 'neo4j_url': 'bolt://localhost', 7 | 'neo4j_user': 'neo4j', 8 | 'neo4j_password': 'local neo hates security!' 
9 | } 10 | 11 | environment_box = Environment(None) 12 | 13 | 14 | def set_environment(environment_name): 15 | environment_box.name = environment_name 16 | 17 | 18 | def get(config_variable_name): 19 | # don't execute code in overrides till necessary 20 | from .overrides import overrides 21 | return overrides[environment_box.name].get(config_variable_name, default_values[config_variable_name]) 22 | 23 | 24 | class Config(object): 25 | @property 26 | def neo4j_url(self): 27 | return get('neo4j_url') 28 | 29 | @property 30 | def neo4j_user(self): 31 | return get('neo4j_user') 32 | 33 | @property 34 | def neo4j_password(self): 35 | return get('neo4j_password') 36 | 37 | 38 | config: Config = Config() 39 | 40 | import os 41 | 42 | if 'ENVIRONMENT' not in os.environ: 43 | raise Exception("You must set an ENVIRONMENT variable. Sorry, I am very opinionated that we should not have a default value because it will mask misconfiguration issues later.") 44 | set_environment(os.environ['ENVIRONMENT']) -------------------------------------------------------------------------------- /config/environment.py: -------------------------------------------------------------------------------- 1 | class Environment(object): 2 | def __init__(self, name): 3 | self.name = name -------------------------------------------------------------------------------- /config/overrides.py: -------------------------------------------------------------------------------- 1 | import json 2 | overrides = dict() 3 | 4 | # There are too many layers and too many files to this config system 5 | overrides.update(**{ 6 | 'remote': { 7 | 'neo4j_url': 'bolt://796bafef-staging.databases.neo4j.io', 8 | 'neo4j_user': 'readonly', 9 | 'neo4j_password': '0s3DGA6Zq' 10 | }, 11 | 'floyd': { # Todo: implement me? 
12 | 'neo4j_url': 'bolt://796bafef-staging.databases.neo4j.io', 13 | 'neo4j_user': 'readonly', 14 | 'neo4j_password': '0s3DGA6Zq' 15 | }, 16 | 'local': { # Just uses defaults 17 | 18 | } 19 | }) 20 | 21 | with open('./config/local_overrides.json') as f: 22 | overrides.update(json.load(f)) -------------------------------------------------------------------------------- /data_sets/synthetic_review_prediction/article_0/__init__.py: -------------------------------------------------------------------------------- 1 | from .configure import DATASET_NAME, create_data_set_properties 2 | from .generate import run as _run 3 | 4 | 5 | def run(client): 6 | print(DATASET_NAME) 7 | return _run(client, create_data_set_properties()) 8 | -------------------------------------------------------------------------------- /data_sets/synthetic_review_prediction/article_0/configure.py: -------------------------------------------------------------------------------- 1 | from ..meta_classes import DataSetProperties 2 | from ..meta_classes.data_set_properties import PersonStyleWeightDistribution, PersonStyleWeight, ProductStyleWeight 3 | from ..utils import WeightedOption, Distribution 4 | from ..classes import PersonStylePreferenceEnum, ProductStyleEnum, Style 5 | from ..experiment_1.opinion_function import opinion_function 6 | from ..experiment_1.style_functions import person_style_function, product_style_function 7 | from graph_io.classes.dataset_name import DatasetName 8 | 9 | DATASET_NAME = DatasetName('article_0') 10 | 11 | 12 | def create_data_set_properties() -> DataSetProperties: 13 | N_STYLES = 2 14 | styles = [Style(str(i)) for i in range(N_STYLES)] 15 | 16 | for style in styles: 17 | ProductStyleEnum.register('LIKES_STYLE_'+style.value, style) 18 | PersonStylePreferenceEnum.register('HAS_STYLE_'+style.value, style) 19 | 20 | data_set_properties = DataSetProperties( 21 | dataset_name=DATASET_NAME, 22 | n_reviews=20000, 23 | reviews_per_product=10, 24 | reviews_per_person_distribution=[ 25 | WeightedOption[int](1, 0.25), 26 | WeightedOption[int](2, 0.25), 27 | WeightedOption[int](3, 0.25), 28 | WeightedOption[int](4, 0.25) 29 | ], 30 | person_styles_distribution=PersonStyleWeightDistribution([ 31 | PersonStyleWeight(x, 1) for x in PersonStylePreferenceEnum.iterate() 32 | ]), 33 | product_styles_distribution=Distribution[ProductStyleWeight, ProductStyleEnum]([ 34 | ProductStyleWeight(x, 1) for x in ProductStyleEnum.iterate() 35 | ]), 36 | opinion_function=opinion_function, 37 | person_style_function=person_style_function, 38 | product_style_function=product_style_function, 39 | n_companies=0, 40 | person_company_number_of_relationships_distribution=[] 41 | ) 42 | 43 | return data_set_properties 44 | -------------------------------------------------------------------------------- /data_sets/synthetic_review_prediction/article_0/generate.py: -------------------------------------------------------------------------------- 1 | from ..classes import PersonWroteReview, ReviewOfProduct, IsGoldenFlag 2 | import random 3 | 4 | from ..meta_classes import DataSetProperties 5 | from ..experiment_1.simple_data_set import SimpleDataSet 6 | from ..utils import DatasetWriter 7 | from graph_io import QueryParams, CypherQuery 8 | 9 | 10 | def run(client, data_set_properties: DataSetProperties): 11 | 12 | with DatasetWriter(client, data_set_properties.dataset_name, {"is_golden",""}) as writer: 13 | 14 | writer.nuke_dataset() 15 | 16 | data_set: SimpleDataSet = SimpleDataSet(data_set_properties) 17 | 18 | def create_indexes(): 
19 | client.execute_cypher_write(CypherQuery("CREATE INDEX ON :NODE(id)"), QueryParams()) 20 | #client.execute_cypher_write(CypherQuery("CREATE INDEX ON :NODE(id, dataset_name)"), QueryParams()) 21 | pass 22 | 23 | create_indexes() 24 | 25 | for i, product in enumerate(data_set.generate_public_products()): 26 | writer.create_node_if_not_exists(product, {"style"}) 27 | 28 | for i, person in enumerate(data_set.generate_public_people()): 29 | writer.create_node_if_not_exists(person, {"style_preference"}) 30 | 31 | for review in data_set.generate_reviews(person): 32 | review.test = random.random() <= 0.1 33 | writer.create_node_if_not_exists(review, {"score", "test"}) 34 | writer.create_edge_if_not_exists(PersonWroteReview(review.by_person, review.id, IsGoldenFlag(False)), set()) 35 | writer.create_edge_if_not_exists(ReviewOfProduct(review.id, review.of_product, IsGoldenFlag(False)), set()) 36 | 37 | 38 | -------------------------------------------------------------------------------- /data_sets/synthetic_review_prediction/utils/dataset_writer.py: -------------------------------------------------------------------------------- 1 | from graph_io import SimpleNodeClient, CypherQuery, QueryParams 2 | from ..classes import GraphNode, GraphEdge, IsGoldenFlag 3 | from graph_io.classes.dataset_name import DatasetName 4 | from typing import Set, AnyStr 5 | from multiprocessing.pool import ThreadPool 6 | from multiprocessing.queues import Queue 7 | from uuid import UUID 8 | 9 | 10 | class DatasetWriter(object): 11 | ADDITIONAL_NODE_PROPERTIES: Set[AnyStr] = {'id'} 12 | 13 | def __init__(self, 14 | client: SimpleNodeClient, 15 | dataset_name: DatasetName, 16 | properties_to_ignore: Set[str] = set() 17 | ): 18 | self.properties_to_ignore = properties_to_ignore 19 | self.dataset_name = dataset_name 20 | self._client = client 21 | self.pool = ThreadPool(1) 22 | 23 | def __enter__(self): 24 | # TODO: do query batching with a buffer etc. to increase performance 25 | return self 26 | 27 | def __exit__(self, exc_type, exc_val, exc_tb): 28 | self._client.run_batch() 29 | # TODO: on non error exits wait until the buffer has all flushed 30 | pass 31 | 32 | def nuke_dataset(self): 33 | query = CypherQuery("MATCH (n:NODE {dataset_name: $dataset_name}) DETACH DELETE n") 34 | self._client.execute_cypher_write(query, QueryParams(dataset_name=self.dataset_name)) 35 | 36 | def create_node_if_not_exists(self, node: GraphNode, properties: Set[AnyStr]): # TODO: define properties on the node entity itself? 
37 | properties = properties.union(self.ADDITIONAL_NODE_PROPERTIES) 38 | 39 | query_params = self._get_properties_for_query(node, properties) 40 | 41 | create_query = CypherQuery(f"MERGE (n:{node.label_string} {query_params.query_string} )") 42 | 43 | result = self._client.add_to_batch(create_query, query_params) 44 | # TODO: check that result wasn't an error 45 | 46 | print("merged node", query_params._params, result) 47 | 48 | def create_edge_if_not_exists(self, edge: GraphEdge, properties: Set[AnyStr]): 49 | _from = edge._from 50 | _to = edge._to 51 | 52 | query_params = self._get_properties_for_query(edge, properties) 53 | 54 | match = f"MATCH (from:{_from.label_string} {{ id: $from_id }}), (to:{_to.label_string} {{ id: $to_id }})" 55 | merge = f"MERGE (from)-[r:{edge.relationship} {query_params.query_string} ]->(to)" 56 | 57 | create_query = CypherQuery(match + "\n" + merge) 58 | query_params = query_params.union(QueryParams(from_id=str(_from.id.value), to_id=str(_to.id.value))) 59 | 60 | result = self._client.add_to_batch(create_query, query_params) 61 | print("merged edge", query_params._params, result) 62 | 63 | def _get_properties_for_query(self, node, properties, prefix=None): 64 | properties.add('is_golden') 65 | 66 | properties_dict = { 67 | name if not prefix else f"{prefix}_{name}": getattr(node, name) for name in properties if name not in self.properties_to_ignore 68 | } 69 | 70 | query_params = QueryParams(dataset_name=self.dataset_name, **properties_dict) 71 | return query_params 72 | -------------------------------------------------------------------------------- /experiment/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .experiment import Experiment 3 | from .experiment_header import ExperimentHeader 4 | from .directory import directory -------------------------------------------------------------------------------- /experiment/arguments.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | 4 | from .directory import directory, default_experiment 5 | 6 | class Arguments(object): 7 | def parse(): 8 | 9 | parser = argparse.ArgumentParser() 10 | 11 | parser.add_argument('--experiment', type=str, default=default_experiment, choices=directory.keys()) 12 | parser.add_argument('--dataset-name', type=str, default=None) 13 | 14 | 15 | parser.add_argument('--batch_size', type=int, default=32) 16 | parser.add_argument('--epochs', type=int, default=None) 17 | parser.add_argument('--random-seed', type=int, default=None) 18 | parser.add_argument('--verbose', type=int, default=1) 19 | 20 | parser.add_argument('--golden', action='store_true') 21 | parser.add_argument('--not-lazy', dest='lazy', action='store_false') 22 | parser.add_argument('--no-say', dest='say_result', action='store_false') 23 | parser.add_argument('--load-weights', action='store_true') 24 | parser.add_argument('--print-weights', action='store_true') 25 | parser.add_argument('--custom-test', action='store_true') 26 | 27 | parser.add_argument('--output-dir', type=str, default="./output") 28 | parser.add_argument('--data-dir', type=str, default="./data") 29 | 30 | return parser.parse_args() 31 | -------------------------------------------------------------------------------- /experiment/directory.py: -------------------------------------------------------------------------------- 1 | from data_sets import * 2 | from basic_types import NanoType 3 | 4 | from .experiment_header import ExperimentHeader 5 | 6 
| shared_query = { 7 | "product_and_product_subgraph": """ 8 | MATCH p= 9 | (a:PERSON {is_golden:{golden}, dataset_name:{dataset_name}}) 10 | -[:WROTE {is_golden:{golden}, dataset_name:{dataset_name}}]-> 11 | (b:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}}) 12 | -[:OF {is_golden:{golden}, dataset_name:{dataset_name}}]-> 13 | (product:PRODUCT {is_golden:{golden}, dataset_name:{dataset_name}}) 14 | 15 | WITH 16 | product, 17 | COLLECT(p) as neighbors 18 | 19 | RETURN 20 | product, 21 | neighbors 22 | 23 | """ 24 | 25 | } 26 | 27 | directory = { 28 | "review_from_visible_style": ExperimentHeader( 29 | """ 30 | A simple baseline experiment. 31 | 32 | From a person's style preference and a product's style, predict review score. 33 | 34 | review_score = dot(style_preference, product_style) 35 | """, 36 | EXPERIMENT_2_DATASET, 37 | """MATCH p= 38 | (a:PERSON {is_golden:{golden}, dataset_name:{dataset_name}}) 39 | -[:WROTE {is_golden:{golden}, dataset_name:{dataset_name}}]-> 40 | (b:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}}) 41 | -[:OF {is_golden:{golden}, dataset_name:{dataset_name}}]-> 42 | (c:PRODUCT {is_golden:{golden}, dataset_name:{dataset_name}}) 43 | RETURN a.style_preference AS style_preference, c.style AS style, b.score AS score 44 | """, 45 | float 46 | ), 47 | 48 | 49 | "review_from_hidden_style_neighbor_conv": ExperimentHeader( 50 | """ 51 | A simple experiment requiring the ML system to aggregate information from a sub-graph 52 | 53 | Predict a person's score for a product, given a person's style preference and the product 54 | 55 | This needs to be able to take in the review graph for a product 56 | and infer the product's style based on the style_preference and scores other people gave the product. 57 | 58 | Plan for the network (assume 1 hot encoding for categorical variables): 59 | 60 | For a product (product): 61 | For a person (person): 62 | 63 | - get array of N other people's reviews: [other_person.style_preference, score] x N 64 | - Apply 1d_convolution output: [product_style] x N 65 | - Apply average across N, output: [product_style] 66 | - Apply softmax, output: [product_style] 67 | - Concat with person, output: [product_style, person.style_preference] 68 | - Apply dense layer, activation sigmoid, output: [score] 69 | 70 | - Train that! 
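
		An illustrative Keras sketch of the plan above (not the code this experiment
		actually runs; N and the layer widths are placeholders, assuming one-hot widths of 2):

			from keras import layers, models

			N, STYLE_WIDTH = 50, 2
			neighbors = layers.Input(shape=(N, STYLE_WIDTH + 1))  # [other style_preference, score] x N
			person = layers.Input(shape=(STYLE_WIDTH,))           # target person's style_preference

			x = layers.Conv1D(STYLE_WIDTH, 1)(neighbors)          # per-review estimate of product_style
			x = layers.GlobalAveragePooling1D()(x)                # average across N
			x = layers.Activation('softmax')(x)
			x = layers.Concatenate()([x, person])
			score = layers.Dense(1, activation='sigmoid')(x)

			model = models.Model(inputs=[neighbors, person], outputs=score)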
71 | 72 | """, 73 | EXPERIMENT_2_DATASET, 74 | """ 75 | MATCH (a:PERSON) 76 | -[e1:WROTE ]-> 77 | (b:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}}) 78 | -[e2:OF ]-> 79 | (c:PRODUCT), 80 | others= 81 | (other_person:PERSON {is_golden:{golden}, dataset_name:{dataset_name}}) 82 | -[:WROTE {is_golden:{golden}, dataset_name:{dataset_name}}]-> 83 | (other_review:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}}) 84 | -[:OF {is_golden:{golden}, dataset_name:{dataset_name}}]-> 85 | (c) 86 | WHERE other_person<>a AND other_review<>b 87 | WITH 88 | a,b,c, 89 | e1,e2, 90 | COLLECT(others) as neighbors 91 | WHERE a.dataset_name={dataset_name} AND a.is_golden={golden} 92 | AND b.dataset_name={dataset_name} AND b.is_golden={golden} 93 | AND c.dataset_name={dataset_name} AND c.is_golden={golden} 94 | AND e1.dataset_name={dataset_name} AND e1.is_golden={golden} 95 | AND e2.dataset_name={dataset_name} AND e2.is_golden={golden} 96 | RETURN 97 | a.style_preference AS style_preference, 98 | b.score AS score, 99 | neighbors 100 | 101 | """, 102 | float 103 | ), 104 | 105 | "review_from_all_hidden_simple_unroll": ExperimentHeader( 106 | """ 107 | # Objective 108 | 109 | Learn a function `score(input_person, input_product)` that gives a product review 110 | given a person and a product. 111 | 112 | ## Input format 113 | 114 | People, reviews and products are essentially anonymous and defined by their relationship 115 | to each-other. 116 | 117 | Our network needs to take in a portion of the graph then output the predicted score. 118 | 119 | The graph is transformed and formatted in a consistent fashion, allowing the network 120 | to understand which person and product is being input. 121 | 122 | # Solution 123 | 124 | Allow the network to find look-a-likes by generating array of person-product-person-product-person chains 125 | 126 | E.g. If me and my lookalike both liked product X, then we'll agree for product Y 127 | 128 | This has a limitation that it can only successfully predict a score of there happens to be someone 129 | with the same style_preference who has reviewed a product you have also reviewed. 130 | 131 | """, 132 | EXPERIMENT_4_DATASET, 133 | """ 134 | MATCH g=(input_person:PERSON) 135 | -[:WROTE]-> 136 | (target_review:REVIEW {dataset_name:{dataset_name}}) 137 | -[:OF]-> 138 | (input_product:PRODUCT) 139 | <-[:OF]- 140 | (review1:REVIEW) 141 | <-[:WROTE]- 142 | (person2:PERSON) 143 | -[:WROTE]-> 144 | (review2:REVIEW) 145 | -[:OF]-> 146 | (product2:PRODUCT) 147 | <-[:OF]- 148 | (review3:REVIEW) 149 | <-[:WROTE]- 150 | (input_person) 151 | 152 | WHERE 153 | input_person<>person2 154 | AND input_product<>product2 155 | 156 | RETURN 157 | target_review.score as score, 158 | COLLECT([1.0, review1.score, review2.score, review3.score])[0..50] as neighbors, 159 | 160 | // These two need to be here otherwise the query implicitly groups by score 161 | input_product.id, 162 | input_person.id 163 | 164 | """, 165 | float, 166 | { 167 | "neighbor_count":50 168 | } 169 | ), 170 | 171 | "review_from_all_hidden_random_walks": ExperimentHeader( 172 | """ 173 | Let's try to do a RNN that operates on pieces of the graph 174 | Generate random walks. 175 | 176 | This is a great problem because it requires the network to find a specific 177 | shape of subgraph in order to answer the question. 
178 | 179 | It needs to find a loop, with 1s on the review scores, like such: 180 | 181 | (REVIEW=1) --> (PRODUCT) <-- (REVIEW=1) <-- (PERSON_B) 182 | ↑ | 183 | | ↓ 184 | (PERSON_A) --> (THE_REVIEW) --> (PRODUCT) <-- (REVIEW=1) 185 | 186 | 187 | # Idea 188 | 189 | What if the parameters define a shape the network wants to look for? 190 | 191 | That's the solution to this problem and could be useful for other problems, 192 | particularly since the magic of neural networks lets you define a noise-resiliant 193 | function, and an ensemble of shapes. 194 | 195 | Let: 196 | 197 | const string_length = 9 198 | pattern:List[part] = |----|-----|-----|----| ==> Convolve 1D with path 199 | path:List[part] = (a)-->(b)-->(c)-->(d) 200 | part = (type, parameter_values, is_target) | Loop | None 201 | target_type = "REVIEW" 202 | 203 | ## Algorithm 204 | 205 | 1) For each node of type=target_type: 206 | 1.a) Generate all paths s.t. |path| <= string_length 207 | 1.b) If a path is cyclic it should have a 'Loop' element after the nodes 208 | 2) Feed to network ([path, ..., path], target_review_score) 209 | 3) Network performs 1D convolution of each path with pattern kernel (The overflow of the kernel should wrap around the input path) 210 | 4) Network performs a 1D convolution on those outputs 211 | 5) Network sums those values 212 | 6) Network applies a dense layer, thus outputting y_prediction 213 | 214 | 215 | """, 216 | EXPERIMENT_4_DATASET, 217 | """ 218 | MATCH p= 219 | (review:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}}) 220 | -[*8]- 221 | (otherB) 222 | WHERE review.id={id} 223 | WITH 224 | review, 225 | COLLECT(p)[0..600] as neighbors 226 | RETURN 227 | review, 228 | neighbors 229 | """, 230 | float, 231 | { 232 | "generate_address": False, 233 | "target_dropout": 0.0, 234 | "memory_size": 1000, 235 | "word_size": 4, 236 | "sequence_size": 600, 237 | "patch_width": 7, 238 | "patch_size": 20, 239 | "epochs": 20, 240 | "repeat_batch": 1, 241 | "working_width": 64, 242 | "id_limit": 32 * 10 243 | }, 244 | ["id_limit"] 245 | ), 246 | 247 | "review_from_all_hidden_adj": ExperimentHeader( 248 | """ 249 | Try the following: 250 | - variable pr represents PRODUCT style vectors 251 | - variable pe represents PERSON preference vectors 252 | - x = adj matrix of PRODUCT-REVIEW-PERSON 253 | - y = adj matrix of same with REVIEW.score as the weights 254 | - Use optimizer to optimize the style/pref vectors such that: Dot(MatMul(pr, T(pe)), x) = y 255 | 256 | 257 | """, 258 | EXPERIMENT_5_DATASET, 259 | """ 260 | MATCH p= 261 | (person:PERSON) --> 262 | (review:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}}) --> 263 | (product:PRODUCT) 264 | RETURN 265 | person.id as person_id, review.score as score, product.id as product_id 266 | """, 267 | "adj_equals", 268 | { 269 | "product_count": 160, # total 160 270 | "person_count": 1200, # total 1200 271 | "style_width": 12, 272 | "epochs": 10000, 273 | "batch_per_epoch": 10 274 | } 275 | ), 276 | 277 | "style_from_neighbor_conv": ExperimentHeader( 278 | """ 279 | A precursor to review_from_hidden_style_neighbor_conv 280 | 281 | This experiment seeks to see if we can efficiently determine a product's style 282 | given it's set of reviews and the style_preference of each reviewer. 283 | 284 | This should be easy!! 
285 | 286 | """, 287 | EXPERIMENT_2_DATASET, 288 | shared_query["product_and_product_subgraph"], 289 | list, 290 | ), 291 | 292 | "style_from_neighbor_rnn": ExperimentHeader( 293 | """ The same as style_from_neighbor_conv but using an RNN instead of convolution """, 294 | EXPERIMENT_2_DATASET, 295 | shared_query["product_and_product_subgraph"], 296 | list 297 | ) 298 | 299 | } 300 | 301 | default_experiment = "review_from_all_hidden_adj" 302 | 303 | 304 | -------------------------------------------------------------------------------- /experiment/experiment.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime 3 | from colorama import init, Fore, Style 4 | import logging 5 | import coloredlogs 6 | import colored_traceback.auto 7 | import os 8 | 9 | from graph_ml import Train, Dataset 10 | from .arguments import Arguments 11 | from .directory import directory 12 | 13 | 14 | init() 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class Experiment(object): 19 | def __init__(self, name, header, params): 20 | self.name = name 21 | self.header = header 22 | self.params = params 23 | self.run_tag = str(datetime.now()) 24 | 25 | @classmethod 26 | def run(cls): 27 | 28 | params = Arguments.parse() 29 | 30 | if params.verbose > 0: 31 | coloredlogs.install(level='INFO', logger=logging.getLogger("experiment")) 32 | coloredlogs.install(level='INFO', logger=logging.getLogger("graph_ml")) 33 | coloredlogs.install(level='INFO', logger=logging.getLogger("graph_io")) 34 | 35 | experiment = Experiment(params.experiment, directory[params.experiment], params) 36 | 37 | print(Fore.GREEN) 38 | print("#######################################################################") 39 | print(f"📟 Running experiment {experiment.name} {experiment.run_tag}") 40 | print("#######################################################################") 41 | print(Style.RESET_ALL) 42 | 43 | dataset = Dataset.get(experiment) 44 | score = Train.run(experiment, dataset) 45 | 46 | print(Fore.YELLOW) 47 | print("#######################################################################") 48 | print("Experiment results") 49 | print(f"{experiment.name} test loss {round(score[0],6)}") 50 | print(f"{experiment.name} test accuracy {round(score[1])}%") 51 | print("#######################################################################") 52 | print(Style.RESET_ALL) 53 | 54 | # t = '-title {!r}'.format(title) 55 | # s = '-subtitle {!r}'.format(subtitle) 56 | # m = '-message {!r}'.format(message) 57 | os.system(f"terminal-notifier -message 'test accuracy {round(score[1]*100)}% loss {round(score[0],2)}' -title Octavian") 58 | 59 | if params.say_result: 60 | os.system(f"say test accuracy {round(score[1]*100)} percent") 61 | 62 | -------------------------------------------------------------------------------- /experiment/experiment_header.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import List 3 | from graph_io.classes import DatasetName 4 | 5 | class ExperimentHeader(object): 6 | def __init__(self, doc="", dataset_name: DatasetName=None, cypher_query=None, target=None, params={}, lazy_params:List[str]=[]): 7 | # Jesus I have to spell this out?! 8 | # WTF are the python language devs doing?! 
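# Fields: doc is the experiment's markdown description, dataset_name the DatasetName to query,
# cypher_query the Cypher used to pull training rows, target the output type (float, list or
# "adj_equals") used when compiling the model, params a dict of per-experiment settings, and
# lazy_params the subset of params baked into the dataset cache path (see graph_ml/dataset.py).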
9 | self.dataset_name = dataset_name 10 | self.doc = doc 11 | self.cypher_query = cypher_query 12 | self.target = target 13 | self.params = params 14 | self.lazy_params = lazy_params -------------------------------------------------------------------------------- /floyd_requirements.txt: -------------------------------------------------------------------------------- 1 | neo4j-driver 2 | lazy 3 | h5py 4 | colorama 5 | coloredlogs 6 | more-itertools 7 | git+git://github.com/datalogai/recurrentshop.git#egg=recurrentshop 8 | colored-traceback 9 | sklearn 10 | tqdm -------------------------------------------------------------------------------- /graph_ml/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .train import Train 3 | from .dataset import Dataset 4 | from .ntm import NTMBase 5 | -------------------------------------------------------------------------------- /graph_ml/adjacency_layer.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | import tensorflow as tf 3 | from keras.engine.topology import Layer, Input 4 | from keras import regularizers, initializers, layers, activations 5 | from functools import partial 6 | import numpy as np 7 | 8 | class PD(regularizers.Regularizer): 9 | def __init__(self, a=0.0001, b=0.0, axis=-1): 10 | self.a = K.cast_to_floatx(a) 11 | self.b = K.cast_to_floatx(b) 12 | 13 | self.axis = axis 14 | 15 | def __call__(self, x): 16 | sum_to_one = K.abs(1.0 - K.sum(K.abs(x), axis=self.axis)) 17 | different_by_one = K.abs(1.0 - K.abs(x[:,0] - x[:,1])) 18 | core = self.a * sum_to_one + self.b * different_by_one 19 | 20 | return K.sum(core) 21 | 22 | def get_config(self): 23 | return {'a': float(self.a), 'b': float(self.b)} 24 | 25 | 26 | class Clip(regularizers.Regularizer): 27 | def __init__(self, max=1): 28 | self.max = max 29 | 30 | def __call__(self, x): 31 | K.clip(x, min_value=-1, max_value=1) 32 | 33 | def get_config(self): 34 | return {'max': float(self.max)} 35 | 36 | 37 | class Adjacency(Layer): 38 | 39 | def __init__(self, person_count, product_count, style_width, **kwargs): 40 | self.person_count = person_count 41 | self.product_count = product_count 42 | self.style_width = style_width 43 | self.dense1 = layers.Dense(units=(style_width), activation=activations.softplus, use_bias=False, kernel_regularizer=Clip) 44 | #self.dense2 = layers.(units=(1), activation=activations.linear) 45 | self.dense3 = layers.Dense(units=1, activation=partial(activations.relu, alpha=0.1), use_bias=False, kernel_regularizer=Clip) 46 | super(Adjacency, self).__init__(**kwargs) 47 | 48 | def __call__(self, inputs, **kwargs): 49 | self.batch_size = inputs.shape[0] 50 | product_ct = inputs.shape[1] 51 | person_ct = inputs.shape[2] 52 | my_batch = product_ct * person_ct 53 | 54 | self.inner_input = Input(batch_shape=(product_ct, person_ct, 2, self.style_width), dtype='float32', name="inner_d0") 55 | self.reshaped_to_look_like_a_batch = K.reshape(self.inner_input, (product_ct * person_ct, 2 * self.style_width)) 56 | self.dense1_called = self.dense1(self.reshaped_to_look_like_a_batch) 57 | #self.dense2_called = self.dense2(self.dense1_called) 58 | self.dense3_called = self.dense3(self.dense1_called) 59 | self.reshaped_to_look_like_adj_mat = K.reshape(self.dense3_called, (product_ct, person_ct, 1)) 60 | return super(Adjacency, self).__call__(inputs, **kwargs) 61 | 62 | def cartesian_product_matrix(self, a, b): 63 | tile_a = tf.tile(tf.expand_dims(a, 1), [1, 
tf.shape(b)[0], 1]) 64 | tile_a = tf.expand_dims(tile_a, 2) 65 | 66 | tile_b = tf.tile(tf.expand_dims(b, 0), [tf.shape(a)[0], 1, 1]) 67 | tile_b = tf.expand_dims(tile_b, 2) 68 | 69 | cartesian_product = tf.concat([tile_a, tile_b], axis=-1) 70 | 71 | return cartesian_product 72 | 73 | 74 | 75 | def build(self, input_shape): 76 | # Create a trainable weight variable for this layer. 77 | self.person = self.add_weight(name='people', 78 | shape=(self.person_count, self.style_width), 79 | initializer='uniform', 80 | # initializer='ones', 81 | # regularizer=PD(), 82 | trainable=True) 83 | 84 | self.product = self.add_weight(name='product', 85 | shape=(self.product_count, self.style_width), 86 | initializer='uniform', 87 | # initializer='ones', 88 | # regularizer=PD(), 89 | trainable=True) 90 | 91 | 92 | # self.wc1 = self.add_weight(name='w1', 93 | # shape=(2, 1), 94 | # initializer='glorot_uniform', 95 | # trainable=True) 96 | 97 | # self.b1 = self.add_weight(name='b1', 98 | # shape=(1, ), 99 | # initializer='zero', 100 | # trainable=True) 101 | 102 | self.w1 = self.add_weight(name='w1', 103 | shape=(2 * self.style_width, 104 | self.style_width), 105 | initializer='glorot_uniform', 106 | trainable=True) 107 | 108 | # self.b1 = self.add_weight(name='b1', 109 | # shape=(self.style_width, ), 110 | # initializer='zero', 111 | # trainable=True) 112 | 113 | self.w2 = self.add_weight(name='w2', 114 | shape=(self.style_width, 1), 115 | initializer='glorot_uniform', 116 | trainable=True) 117 | 118 | # self.b2 = self.add_weight(name='b2', 119 | # shape=(1, ), 120 | # initializer='zero', 121 | # trainable=True) 122 | 123 | 124 | # self.b3 = self.add_weight(name='b2', 125 | # shape=(1,), 126 | # initializer='zero', 127 | # trainable=True) 128 | 129 | # self.w3 = self.add_weight(name='m2', 130 | # shape=(1,), 131 | # initializer='one', 132 | # trainable=True) 133 | 134 | 135 | super(Adjacency, self).build(input_shape) # Be sure to call this somewhere! 
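# What build() registers: self.person and self.product are trainable style/preference embeddings
# (one row per entity), and w1/w2 are the weights of the small MLP that call_dense() applies to
# every (product, person) pair. Because the MLP output is masked by the input adjacency matrix x
# before being compared to the score-weighted adjacency y, training effectively factorises the
# review-score matrix into per-entity style vectors (the Dot(MatMul(pr, T(pe)), x) = y idea from
# the review_from_all_hidden_adj header).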
136 | 137 | def jitter(self, idx=[0,1], var=0.2): 138 | wts = self.get_weights() 139 | 140 | for i in idx: 141 | wts[i] += np.random.normal(0, var, wts[i].shape) 142 | 143 | self.set_weights(wts) 144 | 145 | def call(self, x): 146 | return self.call_dense(x) 147 | 148 | # 100pc test accuracy 149 | def call_dot_softmax(self, x): 150 | pr = self.product 151 | pe = self.person 152 | 153 | pr = K.softmax(self.product) 154 | pe = K.softmax(self.person) 155 | 156 | m = K.dot(pr, K.transpose(pe)) 157 | m = (self.w3 * m) + self.b3 158 | m = K.relu(m, alpha=0.1) 159 | 160 | m = m * x 161 | 162 | return m 163 | 164 | # 100pc test accuracy 165 | def call_dot(self, x): 166 | pr = self.product 167 | pe = self.person 168 | 169 | m = K.dot(pr, K.transpose(pe)) 170 | m = m * x 171 | 172 | return m 173 | 174 | # Seen at 68% 1-accuracy test 175 | def call_dense(self, x): 176 | self.jitter(idx=[0,1], var=0.1) 177 | 178 | pr = self.product 179 | pe = self.person 180 | 181 | pr = K.softmax(pr) 182 | pe = K.softmax(pe) 183 | 184 | all_pairs = self.cartesian_product_matrix(pr, pe) 185 | flat = K.reshape(all_pairs, (self.product_count * self.person_count, self.style_width * 2)) 186 | 187 | m = K.dot(flat, self.w1) 188 | # m = K.bias_add(m, self.b1) 189 | m = K.relu(m, alpha=0.1) 190 | 191 | m = K.dropout(m, level=0.1) 192 | 193 | m = K.dot(m, self.w2) 194 | m = K.relu(m, alpha=0.1) 195 | 196 | m = K.reshape(m, (1, self.product_count, self.person_count)) 197 | masked = m * x 198 | return masked 199 | 200 | 201 | 202 | # 100pc test accuracy 203 | def call_dense_conv(self, x): 204 | self.jitter(idx=[0,1]) 205 | 206 | pr = self.product 207 | pe = self.person 208 | 209 | pr = K.softmax(pr) 210 | pe = K.softmax(pe) 211 | 212 | all_pairs = self.cartesian_product_matrix(pr, pe) 213 | 214 | flat = K.reshape(all_pairs, (self.product_count * self.person_count * self.style_width, 2)) 215 | m = K.dot(flat, self.wc1) 216 | m = K.tanh(m) 217 | 218 | m = K.reshape(m, (self.product_count * self.person_count, self.style_width)) 219 | m = K.dot(m, self.w2) 220 | m = K.relu(m, alpha=0.1) 221 | 222 | m = K.reshape(m, (1, self.product_count, self.person_count)) 223 | masked = m * x 224 | return masked 225 | 226 | 227 | 228 | def compute_output_shape(self, input_shape): 229 | return input_shape 230 | 231 | 232 | -------------------------------------------------------------------------------- /graph_ml/dataset.py: -------------------------------------------------------------------------------- 1 | 2 | from collections import Counter, namedtuple 3 | import random 4 | import pickle 5 | import os.path 6 | import hashlib 7 | import neo4j 8 | import math 9 | from typing import Callable, Generator, Tuple 10 | import logging 11 | import itertools 12 | from itertools import cycle 13 | import more_itertools 14 | from more_itertools import peekable 15 | 16 | import keras 17 | import numpy as np 18 | from keras.preprocessing import text 19 | from keras.utils import np_utils 20 | 21 | from .path import generate_output_path, generate_data_path 22 | from graph_io import * 23 | # from experiment import Experiment 24 | from .util import * 25 | from .dataset_helpers import * 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | class Dataset(object): 31 | 32 | 33 | # Applies a per-experiment recipe to Neo4j to get a dataset to train on 34 | # This performs all transformations in-memory - it is not very efficient 35 | @classmethod 36 | def get(cls, experiment): 37 | 38 | # TODO: delete this 39 | legacy_recipes = { 40 | 
'review_from_visible_style': Recipe( 41 | split=lambda row: Point(np.concatenate((row['style_preference'], row['style'])), row['score']) 42 | ), 43 | 'review_from_hidden_style_neighbor_conv': Recipe( 44 | split=DatasetHelpers.review_from_hidden_style_neighbor_conv(100), 45 | finalize_x=lambda x: {'person':np.array([i['person'] for i in x]), 'neighbors': np.array([i['neighbors'] for i in x])} 46 | ), 47 | 'style_from_neighbor_conv': Recipe( 48 | split=DatasetHelpers.style_from_neighbor(100) 49 | ), 50 | 'style_from_neighbor_rnn': Recipe( 51 | split=DatasetHelpers.style_from_neighbor(100) 52 | ) 53 | } 54 | 55 | try: 56 | recipe = legacy_recipes[experiment.name] 57 | except: 58 | # TODO: move all to this pattern 59 | recipe = getattr(DatasetHelpers, experiment.name)(experiment) 60 | 61 | 62 | return Dataset(experiment, recipe) 63 | 64 | 65 | 66 | # Split data into test/train set, organise it into a class 67 | def __init__(self, experiment, recipe): 68 | 69 | self.experiment = experiment 70 | self.recipe = recipe 71 | 72 | if experiment.params.random_seed is not None: 73 | random.seed(experiment.params.random_seed) 74 | 75 | if experiment.params.dataset_name is not None: 76 | dataset_name = experiment.params.dataset_name 77 | else: 78 | dataset_name = experiment.header.dataset_name 79 | 80 | query_params = QueryParams( 81 | golden=experiment.params.golden, 82 | dataset_name=dataset_name, 83 | experiment=experiment.name) 84 | 85 | query_params.update(QueryParams(**experiment.header.params)) 86 | 87 | # Calculate params for lazy data loading 88 | data_path_params = {i:query_params[i] for i in experiment.header.lazy_params} 89 | data_path_params["dataset_name"] = dataset_name 90 | 91 | dataset_file = generate_data_path(experiment, '.pkl', data_path_params) 92 | logger.info(f"Dataset file {dataset_file}") 93 | 94 | if os.path.isfile(dataset_file) and experiment.params.lazy: 95 | logger.info(f"Opening dataset pickle {dataset_file}") 96 | data = pickle.load(open(dataset_file, "rb")) 97 | 98 | else: 99 | logger.info("Querying data from database") 100 | with SimpleNodeClient() as client: 101 | cq = CypherQuery(experiment.header.cypher_query) 102 | data = recipe.query(client, cq, query_params) 103 | 104 | # Later shift to query-on-demand 105 | data = list(data) 106 | pickle.dump(data, open(dataset_file, "wb")) 107 | 108 | # We need to know total length of data, so for ease I've listed it here. 
109 | # I've used generators everywhere, so if it wasn't for Keras, this would 110 | # be memory efficient 111 | 112 | logger.info(f"Rows returned by Neo4j {len(data)}") 113 | list_data = list(recipe.transform(data)) 114 | total_data = len(list_data) 115 | logger.info(f"Number of rows of data: {total_data}") 116 | 117 | 118 | def repeat_infinitely(gen_fn): 119 | while True: 120 | for x in gen_fn(): 121 | yield x 122 | stream = repeat_infinitely(lambda: recipe.partition(recipe.transform(data))) 123 | 124 | def just(tag): 125 | return ( (i[1].x, i[1].y) for i in stream if i[0] == tag) 126 | 127 | def chunk(it, length): 128 | chunky = more_itertools.chunked(it, length) 129 | for i in chunky: 130 | xs = np.array([j[0] for j in i]) 131 | ys = np.array([j[1] for j in i]) 132 | yield (xs, ys) 133 | 134 | 135 | bs = experiment.params.batch_size 136 | 137 | self.train_generator = peekable(chunk(just("train"), bs)) 138 | self.validation_generator = peekable(chunk(just("validate"), bs)) 139 | self.test_generator = peekable(chunk(just("test"), bs)) 140 | 141 | self.generator = { 142 | "test": self.test_generator, 143 | "train": self.train_generator, 144 | "validate": self.validation_generator 145 | } 146 | 147 | f = self.train_generator.peek() 148 | # logger.info(f"First training item: x:{f[0].shape}, y:{f[1].shape}") 149 | 150 | # These are not exact counts since the data is randomly split at generation time 151 | self.validation_steps = math.ceil(total_data * 0.1 / experiment.params.batch_size) 152 | self.test_steps = math.ceil(total_data * 0.1 / experiment.params.batch_size) 153 | self.steps_per_epoch = math.ceil(total_data * 0.8 / experiment.params.batch_size) * int(experiment.header.params.get('repeat_batch', 1)) 154 | 155 | self.input_shape = self.train_generator.peek()[0][0].shape 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /graph_ml/dataset_helpers.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from collections import Counter, namedtuple 4 | import random 5 | import pickle 6 | import os.path 7 | import hashlib 8 | import neo4j 9 | import math 10 | from typing import Callable, Generator, Tuple 11 | import logging 12 | import itertools 13 | from itertools import cycle 14 | import more_itertools 15 | from more_itertools import peekable 16 | 17 | import keras 18 | import numpy as np 19 | from keras.preprocessing import text 20 | from keras.utils import np_utils 21 | 22 | from .path import generate_output_path, generate_data_path 23 | from graph_io import * 24 | # from experiment import Experiment 25 | from .util import * 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | 31 | class Point(object): 32 | def __init__(self, x, y): 33 | self.x = x 34 | self.y = y 35 | 36 | # This is weird, I know, re-write later when I'm making this more efficient 37 | def append(self, point): 38 | self.x.append(point.x) 39 | self.y.append(point.y) 40 | 41 | def __str__(self): 42 | return "{x:\n" + str(self.x) + ",\ny:\n" + str(self.y) + "}" 43 | 44 | def __repr__(self): 45 | return self.__str__() 46 | 47 | 48 | def noop(): 49 | pass 50 | 51 | RecordGenerator = Generator[neo4j.v1.Record, None, None] 52 | PointGenerator = Generator[Point, None, None] 53 | 54 | class Recipe: 55 | def __init__(self, 56 | transform:Callable[[RecordGenerator], PointGenerator] = None, 57 | query:Callable[[], RecordGenerator] = None, 58 | partition:Callable[[PointGenerator], Generator[Tuple[str, Point], None, 
None]] = None, 59 | split:Callable[[neo4j.v1.Record], Point] = None, 60 | finalize_x = None): 61 | 62 | self.transform = transform 63 | self.query = query 64 | self.partition = partition 65 | 66 | # TODO: migrate older experiments 67 | if transform is None: 68 | def legacy_transform(rows): 69 | for i in rows: 70 | p = split(i) 71 | p.x = finalize_x(p.x) if finalize_x else p.x 72 | yield p 73 | self.transform = legacy_transform 74 | 75 | if query is None: 76 | def default_query(client, cypher_query, query_params): 77 | return client.execute_cypher(cypher_query, query_params) 78 | 79 | self.query = default_query 80 | 81 | if partition is None: 82 | def default_partition(data): 83 | random.shuffle(data) 84 | c = 0 85 | for i in data: 86 | 87 | if c == 9: 88 | l = "test" 89 | elif c == 8: 90 | l = "validate" 91 | else: 92 | l = "train" 93 | 94 | c = (c + 1) % 10 95 | 96 | yield (l, i) 97 | self.partition = default_partition 98 | 99 | 100 | class DatasetHelpers(object): 101 | 102 | @staticmethod 103 | def ensure_length(arr, length): 104 | delta = length - arr.shape[0] 105 | if delta > 0: 106 | pad_shape = ((0,delta),) 107 | for i in range(len(arr.shape)-1): 108 | pad_shape += ((0, 0),) 109 | arr = np.pad(arr, pad_shape, 'constant', constant_values=0.0) 110 | elif delta < 0: 111 | arr = arr[:length] 112 | 113 | assert len(arr) == length, f"ensure_length failed to resize, {len(arr)} != {length}" 114 | 115 | return arr 116 | 117 | @staticmethod 118 | def path_map_style_preference_score(cls, path): 119 | other_person = path.nodes[0] 120 | other_review = path.nodes[1] 121 | return np.concatenate(( 122 | np.array(other_person.properties['style_preference']), 123 | [other_review.properties['score']] 124 | )) 125 | 126 | # Turn neighbors sub-graph into a sampled array of neighbours 127 | # @argument length What size of array should be returned. Use None for variable. If you request a fixed length, the first column of the feature is a 0.0/1.0 flag of where there is data or zeros in that feature row 128 | @classmethod 129 | def collect_neighbors(cls, row, key, path_map, length:int): 130 | subrows = [] 131 | for path in row[key]: 132 | subrows.append(path_map(path)) 133 | 134 | # Lets always shuffle to keep the network on its toes 135 | # If you use --random-seed you'll fix this to be the same each run 136 | np.random.shuffle(subrows) 137 | 138 | if length is not None: 139 | if len(subrows) > length: 140 | subrows = subrows[:length] 141 | 142 | subrows = np.pad(subrows, ((0,0), (1,0)), 'constant', constant_values=1.0) # add 'none' flag 143 | 144 | # pad out if too small 145 | # note if there are zero subrows, this won't know the width to make the zeros, so it'll be 1 wide and broadcast later 146 | if len(subrows) < length: 147 | delta = length - subrows.shape[0] 148 | subrows = np.pad(subrows, ((0,delta), (0, 0)), 'constant', constant_values=0.0) 149 | 150 | return subrows 151 | 152 | 153 | @classmethod 154 | def review_from_hidden_style_neighbor_conv(cls, length): 155 | def transform_row(row): 156 | neighbors = cls.collect_neighbors(row, 'neighbors', cls.path_map_style_preference_score) 157 | return Point({'person': np.array(row["style_preference"]), 'neighbors':neighbors}, row["score"]) 158 | return transform_row 159 | 160 | 161 | @classmethod 162 | def style_from_neighbor(cls, length): 163 | # Python you suck at developer productivity. 164 | # Seriously, coffeescript has all these things sorted out 165 | # Like no anonymous functions? Fuck you. 
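# transform_row closes over `length`: it flattens the product's review neighborhood into a
# fixed-size feature matrix via collect_neighbors (each row = reviewer style_preference plus
# review score, plus a presence flag) and pairs it with the product's hidden `style` property
# as the training target.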
166 | def transform_row(row): 167 | neighbors = cls.collect_neighbors(row, 'neighbors', cls.path_map_style_preference_score, length) 168 | return Point(neighbors, row["product"].properties["style"]) 169 | return transform_row 170 | 171 | 172 | @classmethod 173 | def review_from_all_hidden_simple_unroll(cls, experiment): 174 | def t(row): 175 | length = experiment.header.params["neighbor_count"] 176 | neighbors = np.array(row["neighbors"]) 177 | delta = length - neighbors.shape[0] 178 | 179 | if delta > 0: 180 | neighbors = np.pad(neighbors, ((0,delta), (0, 0)), 'constant', constant_values=0.0) 181 | 182 | return Point(neighbors, row["score"]) 183 | 184 | return Recipe(t) 185 | 186 | @staticmethod 187 | def review_from_all_hidden_random_walks(experiment): 188 | 189 | encode_label = { 190 | "NODE": [1,0,0,0,0], 191 | "PERSON": [0,1,0,0,0], 192 | "REVIEW": [0,0,1,0,0], 193 | "PRODUCT": [0,0,0,1,0], 194 | "LOOP": [0,0,0,0,1] 195 | } 196 | 197 | FakeNode = namedtuple('FakeNode', ['id', 'properties', 'labels']) 198 | loop_node = FakeNode(None, {}, set(['NODE', 'LOOP'])) 199 | 200 | def extract_label(l): 201 | return encode_label.get(list(set(l) - set('NODE'))[0], [1,0,0,0]) 202 | 203 | node_id_dict = {} 204 | 205 | def node_id_to_memory_addr(nid): 206 | 207 | if nid not in node_id_dict: 208 | node_id_dict[nid] = len(node_id_dict) % experiment.header.params['memory_size'] 209 | 210 | return node_id_dict[nid] 211 | 212 | def package_node(n, is_target=False): 213 | ms = experiment.header.params['memory_size'] 214 | 215 | if experiment.header.params["generate_address"]: 216 | address_trunc = node_id_to_memory_addr(n.id) 217 | address_one_hot = np.zeros(ms) 218 | address_one_hot[address_trunc] = 1.0 219 | else: 220 | address_one_hot = np.array([]) 221 | 222 | label = extract_label(n.labels) 223 | score = n.properties.get("score", -1.0) 224 | 225 | if random.random() < experiment.header.params["target_dropout"] or is_target: 226 | score = -1.0 227 | 228 | x = np.concatenate(([score, float(is_target)], label, address_one_hot)) 229 | 230 | return x 231 | 232 | 233 | def path_to_patch(node, path): 234 | ps = np.array([package_node(i, i.id == node.id) for i in path.nodes]) 235 | 236 | if path.nodes[0].id == path.nodes[-1].id: 237 | print("outputting loop_node for ", path.nodes[0].id, [i.id for i in path.nodes]) 238 | l = np.array([package_node(loop_node, False)]) 239 | np.append(ps, l, axis=0) 240 | 241 | ps = np.repeat(ps, 2, axis=0) 242 | 243 | patch_size = experiment.header.params["patch_size"] 244 | ps = DatasetHelpers.ensure_length(ps, patch_size) 245 | return ps 246 | 247 | 248 | def row_to_point(row): 249 | patch_size = experiment.header.params["patch_size"] 250 | seq_size = experiment.header.params["sequence_size"] 251 | 252 | neighbors = row["neighbors"] 253 | review = row["review"] 254 | 255 | x = np.array([path_to_patch(review, path) for path in neighbors]) 256 | x = DatasetHelpers.ensure_length(x, seq_size) 257 | # x = np.repeat(x, 3, axis=0) 258 | 259 | y = row["review"].properties.get("score", -1.0) 260 | # y = np.repeat([y], seq_size) 261 | # y = np.expand_dims(y, axis=-1) 262 | 263 | target_shape = (seq_size, patch_size, experiment.header.params["patch_width"]) 264 | assert x.shape == target_shape, f"{x.shape} != {target_shape}" 265 | 266 | return Point(x, y) 267 | 268 | def query(client, cypher_query, query_params): 269 | return client.execute_cypher_once_per_id( 270 | cypher_query, 271 | query_params, 272 | dataset_name=experiment.header.dataset_name, 273 | 
id_limit=experiment.header.params["id_limit"], 274 | id_type="REVIEW" 275 | ) 276 | 277 | def balance_classes(stream): 278 | # ugh arch pain 279 | # instead pass in an arg that is a callable stream generator 280 | 281 | classes = [0.0, 1.0] 282 | last = [None, None] 283 | 284 | # Over-sample 285 | # This is imperfectly balanced as it cold-starts without last values 286 | for i in stream: 287 | for index, c in enumerate(classes): 288 | if np.array([i.y]).flatten()[0] == c: 289 | last[index] = i 290 | yield i 291 | elif last[index] is not None: 292 | yield last[index] 293 | 294 | 295 | def transform(stream): 296 | # y_count = Counter() 297 | # y_count[str(y)] += 1 298 | # print(f"Counter of y values: {[(i, y_count[i] / len(list(y_count.elements())) * 100.0) for i in y_count]}") 299 | stream = (row_to_point(row) for row in stream) 300 | stream = balance_classes(stream) 301 | return stream 302 | 303 | return Recipe(transform=transform,query=query) 304 | 305 | @staticmethod 306 | def review_from_all_hidden_adj(experiment) -> Recipe: 307 | bs = experiment.params.batch_size 308 | person_product = {} 309 | 310 | reviews_per_person = Counter() 311 | reviews_per_product = Counter() 312 | 313 | pr_c = experiment.header.params["product_count"] 314 | pe_c = experiment.header.params["person_count"] 315 | 316 | shape = (pr_c, pe_c) 317 | unmasked_products=np.zeros(shape=(pr_c,)) 318 | unmasked_products[0] = 1 319 | unmasked_people=np.zeros(shape=(pe_c,)) 320 | cache = [] 321 | training_mask = np.zeros(shape) 322 | pause=[0] 323 | def gen_output(datas): 324 | for i in range(bs * experiment.header.params["batch_per_epoch"]): 325 | for partition, pt in datas.items(): 326 | if partition=="train": 327 | pe_flag = False 328 | pr_flag = False 329 | if pause[0] > 48: 330 | 331 | def do_product(): 332 | if not pr_flag: 333 | for x in range(pe_c): 334 | if unmasked_people[x] == 0 and any(pt.x[y][x] == 1 for y in range(pr_c) if unmasked_products[y] == 1): 335 | unmasked_people[x] = 1 336 | pe_flag = True 337 | break 338 | 339 | def do_person(): 340 | if not pe_flag: 341 | for y in range(pr_c): 342 | if unmasked_products[y] == 0 and any(pt.x[y][x] == 1 for x in range(pe_c) if unmasked_people[x] == 1): 343 | unmasked_products[y] = 1 344 | pr_flag = True 345 | break 346 | 347 | if random.random() > 0.5: 348 | do_product() 349 | else: 350 | do_person() 351 | 352 | if not pr_flag and not pe_flag: 353 | for x in range(pe_c): 354 | if unmasked_people[x] == 0: 355 | unmasked_people[x] = 1 356 | pe_flag = True 357 | break 358 | if not pe_flag: 359 | for y in range(pr_c): 360 | if unmasked_products[y] == 0: 361 | unmasked_products[y] = 1 362 | pr_flag = True 363 | break 364 | for x in range(pe_c): 365 | #TODO this is like a np.cross or something 366 | for y in range(pr_c): 367 | if unmasked_people[x] * unmasked_products[y] == 1: 368 | training_mask[y][x] = 1 369 | if not pe_flag and not pr_flag: 370 | assert np.sum(training_mask) == pr_c * pe_c 371 | print('all data') 372 | pause[0] = 0 373 | pause[0]+=1 374 | 375 | pt = Point(np.where(training_mask, pt.x, 0), np.where(training_mask, pt.y, 0)) 376 | #print(np.sum(pt.x)) 377 | #print(np.sum(pt.y)) 378 | yield (partition, pt) 379 | # yield Point(adj_con, adj_score) 380 | 381 | def transform(stream): 382 | if len(cache) == 1: 383 | return gen_output(cache[0]) 384 | 385 | data = list(stream) 386 | 387 | products = set() 388 | people = set() 389 | # Construct adjacency dict 390 | for i in data: 391 | if i["person_id"] not in person_product: 392 | person_product[i["person_id"]] = 
{} 393 | 394 | if len(people) < pe_c or i["person_id"] in people: 395 | if len(products) < pr_c or i["product_id"] in products: 396 | 397 | person_product[i["person_id"]][i["product_id"]] = i["score"] 398 | 399 | reviews_per_person[i["person_id"]] += 1 400 | reviews_per_product[i["product_id"]] += 1 401 | 402 | products.add(i["product_id"]) 403 | people.add(i["person_id"]) 404 | 405 | def exists(person, product): 406 | return 1.0 if person in person_product and product in person_product[person] else 0.0 407 | 408 | def score(person, product): 409 | return person_product.get(person, 0.0).get(product, 0.0) 410 | 411 | ppe = list(dict(reviews_per_person).values()) 412 | ppr = list(dict(reviews_per_product).values()) 413 | 414 | #print("Reviews per product: ", np.histogram(ppe) ) 415 | #print("Reviews per person: ", np.histogram(ppr) ) 416 | 417 | #logger.info(f"People returned {len(people)} of capacity {pe_c}") 418 | #logger.info(f"Products returned {len(products)} of capacity {pr_c}") 419 | 420 | people = sorted(list(people))[:pe_c] 421 | products = sorted(list(products))[:pr_c] 422 | 423 | def build(fn): 424 | return DatasetHelpers.ensure_length(np.array([ 425 | DatasetHelpers.ensure_length( 426 | np.array([fn(person, product) for person in people]) 427 | , pe_c) for product in products 428 | ]), pr_c) 429 | 430 | adj_score = build(score) 431 | adj_con = build(exists) 432 | 433 | # print("Connections:",adj_con) 434 | # print("Scores:",adj_score) 435 | 436 | assert_mtx_shape(adj_score, shape, "adj_score") 437 | assert_mtx_shape(adj_con, shape) 438 | 439 | mask_seed = np.random.randint(10, size=shape) 440 | masks = { 441 | "test": np.equal(mask_seed, 0), 442 | "train": np.greater(mask_seed, 1), 443 | "validate": np.equal(mask_seed, 1), 444 | "all": Point(adj_con, adj_score) 445 | } 446 | 447 | def gen_d(mask): 448 | return Point(np.where(mask, adj_con, 0), np.where(mask, adj_score, 0)) 449 | 450 | datas = { 451 | k: gen_d(v) 452 | for (k, v) in masks.items() 453 | } 454 | 455 | warm_up = False 456 | 457 | 458 | if warm_up: 459 | cache.append(datas) 460 | return gen_output(datas) 461 | 462 | else: 463 | for i in range(experiment.params.batch_size * experiment.header.params["batch_per_epoch"]): 464 | for partition, pt in datas.items(): 465 | yield (partition, pt) 466 | 467 | 468 | return Recipe(transform=transform, partition=lambda x:x) 469 | 470 | 471 | -------------------------------------------------------------------------------- /graph_ml/model.py: -------------------------------------------------------------------------------- 1 | 2 | import keras 3 | from keras.models import Sequential, Model 4 | from keras.layers import * 5 | import keras.backend as K 6 | 7 | import tensorflow as tf 8 | 9 | from .ntm import * 10 | from .adjacency_layer import Adjacency 11 | 12 | 13 | # Rainbow sprinkles for your activation function 14 | # Try to use all activation functions 15 | # @argument m: (?,N) tensor 16 | # @returns (?,N*5) tensor 17 | def PolyActivation(m): 18 | # wildcard of the day - let's do inception style activation because I've no idea which is best 19 | # and frequently I get great boosts from switching activation functions 20 | activations = ['tanh', 'sigmoid', 'softmax', 'softplus', 'relu'] 21 | 22 | # TODO: Add dense layer to resize back to original size 23 | # I cannot work out how to do that in Keras yet :/ 24 | return Concatenate()([ 25 | Activation(i)(m) for i in activations 26 | ]) 27 | 28 | 29 | # Choose activation function for me 30 | # More efficient than PolyActivation 31 | # 
@returns Same sized tensor as input 32 | def PolySwitchActivation(m): 33 | # will fail for shared nodes 34 | print(m.shape) 35 | 36 | if len(m.shape) != 3: 37 | # TODO: make this work in a sane way 38 | m = Reshape([i for i in m.shape.dims if i is not None] + [1])(m) # warning: assumes tensorflow 39 | 40 | activations = ['tanh', 'sigmoid', 'softmax', 'softplus', 'relu'] 41 | return add([ 42 | Conv1D(1,1)(Activation(i)(m)) for i in activations 43 | ]) 44 | 45 | class Model(object): 46 | 47 | @classmethod 48 | def generate(cls, experiment, dataset): 49 | params = experiment.params 50 | 51 | # TODO: Move this into Experiment header 52 | n_styles = 6 53 | n_sequence = 100 54 | 55 | bs = experiment.params.batch_size 56 | 57 | if experiment.name == "review_from_visible_style": 58 | model = Sequential([ 59 | Dense(8, 60 | input_shape=dataset.input_shape, 61 | activation='softmax'), 62 | Dense(1, activation='sigmoid'), 63 | ]) 64 | 65 | 66 | elif experiment.name == "review_from_hidden_style_neighbor_conv": 67 | neighbors = Input(shape=(n_sequence,n_styles*2,), dtype='float32', name='neighbors') 68 | person = Input(shape=(n_styles,), dtype='float32', name='person') 69 | 70 | m = cls.style_from_neighbors(neighbors, n_styles, n_sequence) 71 | m = Concatenate()([m, person]) 72 | m = Dense(n_styles*4)(m) 73 | m = PolyActivation(m) 74 | m = Dense(1, activation='sigmoid')(m) 75 | 76 | model = keras.models.Model(inputs=[person, neighbors], outputs=[m]) 77 | 78 | 79 | elif experiment.name == "style_from_neighbor_conv": 80 | neighbors = Input(shape=(n_sequence,n_styles+2,), dtype='float32', name='neighbors') 81 | m = cls.style_from_neighbors(neighbors, n_styles, n_sequence) 82 | 83 | model = keras.models.Model(inputs=[neighbors], outputs=[m]) 84 | 85 | 86 | elif experiment.name == "style_from_neighbor_rnn": 87 | neighbors = Input(shape=(n_sequence,n_styles+2,), dtype='float32', name='neighbors') 88 | m = LSTM(n_styles*4)(neighbors) 89 | m = Dense(n_styles)(m) 90 | m = Activation('sigmoid', name='final_activation')(m) 91 | 92 | model = keras.models.Model(inputs=[neighbors], outputs=[m]) 93 | 94 | 95 | elif experiment.name == "review_from_all_hidden_simple_unroll": 96 | thinking_width = 10 97 | 98 | neighbors = Input(shape=(experiment.header.params["neighbor_count"],4,), dtype='float32', name='neighbors') 99 | m = Conv1D(thinking_width, 1, activation='tanh')(neighbors) 100 | m = MaxPooling1D(experiment.header.params["neighbor_count"])(m) 101 | m = Reshape([thinking_width])(m) 102 | m = Dense(1)(m) 103 | m = Activation("sigmoid", name='final_activation')(m) 104 | 105 | model = keras.models.Model(inputs=[neighbors], outputs=[m]) 106 | 107 | 108 | elif experiment.name == 'review_from_all_hidden_random_walks': 109 | 110 | ss = experiment.header.params["sequence_size"] 111 | ps = experiment.header.params["patch_size"] 112 | pw = experiment.header.params["patch_width"] 113 | 114 | patch = Input(batch_shape=(bs,ss,ps,pw), dtype='float32', name="patch") 115 | # flat_patch = Reshape([ss*ps*pw])(patch) 116 | # score = Dense(experiment.header.params["working_width"]*2, activation="tanh")(flat_patch) 117 | # score = Dense(experiment.header.params["working_width"], activation="tanh")(flat_patch) 118 | 119 | # rnn = PatchNTM(experiment).build() 120 | # score = rnn(patch) 121 | 122 | # Data format 123 | # x = [x_path, x_path, x_path] 124 | # x_path = [x_node, x_node, x_node] 125 | # x_node = [label, score, is_head] 126 | 127 | # x = [ 128 | # [ 129 | # [label, score, is_head]:Node, 130 | # [label, score, is_head]:Node 131 | 
# ]:Path, 132 | # [ 133 | # [label, score, is_head]:Node, 134 | # [label, score, is_head]:Node 135 | # ]:Path 136 | # ]:Sequence 137 | 138 | # Convolve path-pattern 139 | channels = 8 140 | pattern_length = 8 141 | 142 | m = patch 143 | 144 | # Add channels for convolution 145 | m = Lambda(lambda x: K.expand_dims(x, axis=-1))(m) 146 | 147 | # Compute!! 148 | m = Conv3D(channels, (1, pattern_length, pw), activation='relu')(m) 149 | pattern_conv_out_size = ps - pattern_length + 1 150 | 151 | m = Reshape([ss * channels * pattern_conv_out_size])(m) 152 | m = Dense(4, activation="relu", name="score_dense")(m) 153 | score = Dense(1, activation="sigmoid", name="score_out")(m) 154 | 155 | model = keras.models.Model(inputs=[patch], outputs=[score]) 156 | 157 | 158 | elif experiment.name == 'review_from_all_hidden_adj': 159 | 160 | pr_c = experiment.header.params["product_count"] 161 | pe_c = experiment.header.params["person_count"] 162 | style_width = experiment.header.params["style_width"] 163 | 164 | adj_con = Input(batch_shape=(bs, pr_c, pe_c), dtype='float32', name="adj_con") 165 | features = Adjacency(pe_c, pr_c, style_width, name="hidden_to_adj")(adj_con) 166 | 167 | model = keras.models.Model(inputs=[adj_con], outputs=[features]) 168 | 169 | model.compile(loss=keras.losses.mean_squared_error, 170 | optimizer=keras.optimizers.Adam(lr=0.2, decay=0.01), 171 | metrics=['accuracy']) 172 | 173 | return model 174 | 175 | 176 | 177 | # Compile time! 178 | if experiment.header.target == float: 179 | model.compile(loss=keras.losses.mean_squared_error, 180 | optimizer=keras.optimizers.SGD(lr=0.3), 181 | metrics=['accuracy']) 182 | 183 | elif experiment.header.target == list: 184 | model.compile(loss='categorical_crossentropy', 185 | optimizer=keras.optimizers.SGD(lr=0.3), 186 | metrics=['accuracy']) 187 | 188 | 189 | 190 | return model 191 | 192 | @classmethod 193 | def style_from_neighbors(cls, neighbors, n_styles, n_sequence): 194 | m = Conv1D(n_styles, 1, activation='tanh')(neighbors) 195 | m = MaxPooling1D(n_sequence)(m) 196 | m = Reshape([n_styles])(m) 197 | m = Dense(n_styles)(m) 198 | m = Activation('softmax')(m) 199 | 200 | return m 201 | 202 | 203 | -------------------------------------------------------------------------------- /graph_ml/ntm.py: -------------------------------------------------------------------------------- 1 | 2 | import keras 3 | import keras.backend as K 4 | 5 | import tensorflow as tf 6 | 7 | from keras.models import Model 8 | from keras.layers import * 9 | from recurrentshop import RecurrentModel 10 | 11 | from .util import * 12 | 13 | class NTMBase(object): 14 | 15 | def __init__(self, experiment): 16 | self.experiment = experiment 17 | 18 | self.patch_size = experiment.header.params["patch_size"] 19 | self.patch_width = experiment.header.params["patch_width"] 20 | self.working_width = experiment.header.params["working_width"] 21 | self.word_size = self.experiment.header.params["word_size"] 22 | self.batch_size = self.experiment.params.batch_size 23 | self.memory_size = self.experiment.header.params["memory_size"] 24 | self.patch_data_width = self.patch_width - self.memory_size 25 | 26 | self.word_shape = [self.word_size] 27 | self.word_shape_batch = [self.batch_size, self.word_size] 28 | self.memory_shape = [self.memory_size, self.word_size] 29 | self.memory_shape_batch = [self.batch_size] + self.memory_shape 30 | 31 | 32 | def combine_nodes(self, patch, width): 33 | patch_data = Lambda(lambda x: x[:,:,0:self.patch_data_width:])(patch) 34 | 35 | n1 = Conv1D( 36 | 
filters=width, 37 | kernel_size=1, 38 | activation='tanh', 39 | kernel_initializer='random_uniform', 40 | bias_initializer='zeros', 41 | name="ConvPatch1")(patch_data) 42 | 43 | n2 = Conv1D( 44 | filters=width, 45 | kernel_size=1, 46 | activation='tanh', 47 | kernel_initializer='random_uniform', 48 | bias_initializer='zeros', 49 | name="ConvPatch2")(patch_data) 50 | 51 | n = multiply([n1, n2]) 52 | 53 | n = Conv1D( 54 | filters=width, 55 | kernel_size=1, 56 | activation='tanh', 57 | kernel_initializer='random_uniform', 58 | bias_initializer='zeros', 59 | name="ConvPatch3")(n) 60 | 61 | n = MaxPooling1D(self.patch_size)(n) 62 | n = Reshape([width])(n) 63 | return n 64 | 65 | def patch_extract(self, address, patch, slice_begin): 66 | extract_width = self.patch_width - (slice_begin % self.patch_width) 67 | 68 | address_repeated = Lambda(lambda x:K.repeat_elements(K.expand_dims(x, -1), extract_width, -1))(address) 69 | patch_slices = Lambda(lambda x: x[:,:,slice_begin::])(patch) 70 | assert_shape(patch_slices, [self.patch_size, extract_width]) 71 | 72 | rows = multiply([patch_slices, address_repeated]) 73 | row = Lambda(lambda x: K.sum(x,-2))(rows) 74 | assert_shape(row, [extract_width]) 75 | 76 | return row 77 | 78 | def resolve_address(self, address, patch): 79 | assert_shape(address, [self.patch_size]) 80 | assert_shape(patch, [self.patch_size, self.patch_width]) 81 | return self.patch_extract(address, patch, -self.memory_size) 82 | 83 | def read(self, memory, address): 84 | address_repeated = Lambda(lambda x:K.repeat_elements(K.expand_dims(x, -1), self.word_size, -1))(address) 85 | read_rows = multiply([memory, address_repeated]) 86 | read = Lambda(lambda x: K.sum(x,-2))(read_rows) 87 | 88 | assert_shape(read, [self.word_size]) 89 | 90 | return read 91 | 92 | def write(self, memory, address, write): 93 | assert_shape(memory, self.memory_shape) 94 | assert_shape(write, [self.word_size]) 95 | assert_shape(address, [self.memory_size]) 96 | 97 | address_expanded = expand_dims(address, -1) 98 | write = expand_dims(write, 1) 99 | write_e = dot([address_expanded, write], axes=[2,1], name="WriteExpanded") 100 | memory = add([memory, write_e], name="MemoryWrite") 101 | return memory 102 | 103 | def erase(self, memory, address, erase): 104 | assert_shape(memory, self.memory_shape) 105 | assert_shape(erase, [self.word_size]) 106 | assert_shape(address, [self.memory_size]) 107 | 108 | erase = expand_dims(erase, 1) 109 | address_expanded = expand_dims(address, -1) 110 | erase_e = dot([address_expanded, erase], axes=[2,1], name="EraseExpanded") 111 | assert_shape(erase_e, self.memory_shape) 112 | erase_mask = Lambda(lambda x: 1.0 - x)(erase_e) 113 | memory = multiply([memory, erase_mask]) 114 | return memory 115 | 116 | def generate_address(self, input_data, patch, name): 117 | address_ptr = Dense(self.patch_size, activation="softplus",name=name)(input_data) 118 | address = self.resolve_address(address_ptr, patch) 119 | return address 120 | 121 | 122 | class PatchNTM(NTMBase): 123 | 124 | def __init__(self, experiment): 125 | NTMBase.__init__(self, experiment) 126 | 127 | def build(self): 128 | 129 | patch = Input((self.patch_size, self.patch_width), name="InputPatch") 130 | memory_tm1 = Input(batch_shape=self.memory_shape_batch, name="Memory") 131 | memory_t = memory_tm1 132 | 133 | # conv = self.combine_nodes(patch, working_width) 134 | # first_node = Lambda(lambda x: x[:,:self.patch_data_width])(flat_patch) 135 | patch_without_memory_addr = Lambda(lambda x: x[:,:,:self.patch_data_width:])(patch) 136 
| flat_patch = Reshape([self.patch_size*self.patch_data_width])(patch_without_memory_addr) 137 | 138 | working_memory = Dense(self.working_width, activation='relu')(flat_patch) 139 | # conv = self.combine_nodes(patch, self.working_width) 140 | # working_memory = concatenate([working_memory, conv]) 141 | # working_memory = Dense(self.working_width, activation='relu')(working_memory) 142 | 143 | pre_memory = working_memory 144 | 145 | use_memory = False 146 | 147 | if use_memory: 148 | # ------- Memory operations --------- # 149 | 150 | primary_address = Lambda(lambda x: x[:,3,self.patch_data_width:])(patch) 151 | print(primary_address) 152 | 153 | address = self.generate_address(primary_address, patch, name="address_read1") 154 | read1 = self.read(memory_t, address) 155 | 156 | # Turn batch dimension from None to batch_size 157 | batched_working_memory = Lambda(lambda x: K.reshape(x, [self.batch_size, self.working_width]))(working_memory) 158 | batched_working_memory = concatenate([batched_working_memory, read1], batch_size=self.batch_size) 159 | 160 | batched_working_memory = Dense(self.working_width, activation='relu')(batched_working_memory) 161 | 162 | erase_word = Dense(self.word_size, name="DenseEraseWord", activation='relu')(batched_working_memory) 163 | # address = self.generate_address(batched_working_memory, patch, name="address_erase") 164 | erase_word = Lambda(lambda x: K.ones_like(x))(erase_word) 165 | memory_t = self.erase(memory_t, primary_address, erase_word) 166 | 167 | write_word = Dense(self.word_size, name="DenseWriteWord", activation='relu')(batched_working_memory) 168 | # address = self.generate_address(batched_working_memory, patch, name="address_write") 169 | memory_t = self.write(memory_t, primary_address, write_word) 170 | 171 | # address = self.generate_address(batched_working_memory, patch, name="address_read2") 172 | # read2 = self.read(memory_t, address) 173 | 174 | # working_memory = concatenate([batched_working_memory, read1]) 175 | working_memory = Dense(self.working_width, activation="relu")(batched_working_memory) 176 | 177 | 178 | return RecurrentModel( 179 | input=patch, 180 | output=working_memory, 181 | return_sequences=True, 182 | stateful=True, 183 | 184 | initial_states=[memory_tm1], 185 | final_states=[memory_t], 186 | state_initializer=[initializers.random_normal(stddev=1.0)] 187 | ) 188 | 189 | 190 | -------------------------------------------------------------------------------- /graph_ml/path.py: -------------------------------------------------------------------------------- 1 | 2 | import hashlib 3 | import os.path 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | def generate_path(experiment, prefix, suffix, extra=""): 9 | query = experiment.header.cypher_query 10 | m = hashlib.md5() 11 | 12 | m.update(query.encode('utf-8')) 13 | m.update(extra.encode('utf-8')) 14 | # logger.info(f"generate_path {prefix} {suffix} {query} {extra}") 15 | return os.path.join(prefix + '/' + experiment.name + '_' + m.hexdigest() + suffix) 16 | 17 | def generate_output_path(experiment, suffix): 18 | return generate_path(experiment, experiment.params.output_dir, suffix) 19 | 20 | def generate_data_path(experiment, suffix, query_params=None): 21 | return generate_path(experiment, experiment.params.data_dir, suffix, str(query_params)) 22 | -------------------------------------------------------------------------------- /graph_ml/train.py: -------------------------------------------------------------------------------- 1 | 2 | import os.path 3 | 
from datetime import datetime 4 | import logging 5 | from sklearn.metrics import classification_report 6 | import itertools 7 | 8 | import keras 9 | import numpy as np 10 | import keras.callbacks 11 | 12 | from .model import Model 13 | from .dataset import Dataset 14 | from .path import generate_output_path 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | class StopEarlyIfAbove(keras.callbacks.Callback): 19 | def __init__(self, monitor='val_acc', value=0.99, verbose=0, patience=3): 20 | super(keras.callbacks.Callback, self).__init__() 21 | self.monitor = monitor 22 | self.value = value 23 | self.verbose = verbose 24 | self.stopped_epoch = 0 25 | self.patience = patience 26 | 27 | def on_epoch_end(self, epoch, logs={}): 28 | current = logs.get(self.monitor) 29 | if current is None: 30 | logger.error("Early stopping requires %s available!" % self.monitor) 31 | exit() 32 | 33 | if current > self.value: 34 | self.patience -= 1 35 | if self.patience <= 0: 36 | self.stopped_epoch = epoch 37 | self.model.stop_training = True 38 | 39 | def on_train_end(self, logs=None): 40 | if self.stopped_epoch > 0 and self.verbose > 0: 41 | logger.info("Epoch {}: early stopping {} > {}".format(self.stopped_epoch+1, self.monitor, self.value)) 42 | 43 | 44 | class SpecialValidator(keras.callbacks.Callback): 45 | def __init__(self, experiment, dataset, model, verbose): 46 | self.experiment = experiment 47 | self.model = model 48 | self.dataset = dataset 49 | self.verbose = verbose 50 | super(keras.callbacks.Callback, self).__init__() 51 | 52 | 53 | def on_train_end(self, logs): 54 | self.test(self.verbose) 55 | 56 | def on_epoch_end(self, epoch, logs): 57 | self.test() 58 | 59 | def test(self, verbose=False): 60 | print() # Clear from epoch status bar 61 | for (label, genie) in self.dataset.generator.items(): 62 | # print(f"Prediction for {label}") 63 | 64 | row = genie.peek() 65 | y_true = row[1][0] 66 | x_test = row[0][0] 67 | 68 | y_pred = self.model.predict_generator( 69 | generator=genie, 70 | steps=1, 71 | workers=0, 72 | use_multiprocessing=False, 73 | ) 74 | y_pred = np.array(y_pred[0]) 75 | 76 | y_correct = np.isclose(y_pred, y_true, atol=0.1) 77 | y_zero = np.isclose(y_pred, 0, atol=0.1) 78 | 79 | # The bits that should be one 80 | y_true_set_and_in_mask = np.where(np.greater(y_true, 0.1), np.greater(x_test, 0.1), False) 81 | 82 | # The bits that should be one and were one 83 | y_masked = np.where(y_true_set_and_in_mask, y_correct, False) 84 | 85 | # The correct predictions for the input adj 86 | y_masked_david = np.where(np.greater(x_test, 0.1), y_correct, False) 87 | 88 | if verbose: 89 | print("y_pred: ", np.around(y_pred, 1)) 90 | print("y_correct: ", y_correct) 91 | # print(f"y_masked {np.count_nonzero(y_masked)} / {np.count_nonzero(y_correct)} / {np.count_nonzero(x_test)}") 92 | 93 | net_accuracy = round(np.count_nonzero(y_masked) / (np.count_nonzero(y_true_set_and_in_mask)+0.001) * 100, 3) 94 | net_accuracy_david = round(np.count_nonzero(y_masked_david) / (np.count_nonzero(x_test)+0.001) * 100, 3) 95 | gross_accuracy = round(np.count_nonzero(y_correct) / np.size(y_correct) * 100, 3) 96 | 97 | print(f"{label} 1-accuracy: {net_accuracy}% accuracy: {net_accuracy_david}%") 98 | # print() 99 | 100 | if label == "validate" and net_accuracy == 100: 101 | self.model.stop_training = True 102 | 103 | 104 | 105 | 106 | 107 | class Train(object): 108 | 109 | @staticmethod 110 | def run(experiment, dataset): 111 | 112 | params = experiment.params 113 | 114 | if params.random_seed is not None: 115 | 
np.random.seed(params.random_seed) 116 | 117 | logger.info("Generate model") 118 | 119 | model = Model.generate(experiment, dataset) 120 | params_file = generate_output_path(experiment, ".hdf5") 121 | 122 | if os.path.isfile(params_file) and params.load_weights: 123 | model.load_weights(params_file) 124 | 125 | callbacks = [ 126 | #StopEarlyIfAbove(verbose=params.verbose), 127 | SpecialValidator(experiment, dataset, model, params.print_weights), 128 | # keras.callbacks.ModelCheckpoint(params_file, verbose=params.verbose, save_best_only=True, monitor='val_loss', mode='auto', period=3), 129 | # keras.callbacks.TensorBoard(log_dir=generate_output_path(experiment, f"_log/{experiment.run_tag}/")), 130 | #keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.000000001, patience=8, verbose=0, mode='auto') 131 | ] 132 | 133 | # TODO: move to more general overriding mechanism 134 | # Perhaps unify os.environ, arguments, experiment parameters 135 | if params.epochs is not None: 136 | epochs = params.epochs 137 | else: 138 | epochs = experiment.header.params.get('epochs', 20) 139 | 140 | logger.info("Fit model") 141 | 142 | # Once I've worked out Python multithreading conflicts we can introduce workers > 0 143 | model.fit_generator( 144 | generator=dataset.train_generator, 145 | steps_per_epoch=dataset.steps_per_epoch, 146 | validation_data=dataset.validation_generator, 147 | validation_steps=dataset.validation_steps, 148 | 149 | epochs=epochs, 150 | verbose=params.verbose, 151 | 152 | workers=0, 153 | use_multiprocessing=False, 154 | shuffle=True, 155 | callbacks=callbacks 156 | ) 157 | 158 | logger.info("Evaluate model") 159 | 160 | score = model.evaluate_generator( 161 | generator=dataset.test_generator, 162 | steps=dataset.test_steps, 163 | workers=0, 164 | use_multiprocessing=False, 165 | ) 166 | 167 | 168 | if params.print_weights: 169 | for layer in model.layers: 170 | for var, weight in zip(layer.weights, layer.get_weights()): 171 | print(f"{var.name} {np.around(weight, decimals=1)}") 172 | 173 | 174 | return score 175 | 176 | 177 | -------------------------------------------------------------------------------- /graph_ml/util.py: -------------------------------------------------------------------------------- 1 | 2 | from keras.layers import Lambda 3 | import keras.backend as K 4 | 5 | # Take that keras 6 | from tensorflow import float32 7 | 8 | def assert_shape(tensor, shape, strict=False): 9 | if strict: 10 | assert hasattr(tensor, '_keras_shape'), f"{tensor.name} is missing _keras_shape" 11 | assert tensor.shape[1:] == shape, f"{tensor.name} is wrong shape, expected {shape} found {tensor.shape[1:]}" 12 | 13 | def assert_mtx_shape(mtx, shape, name="matrix"): 14 | assert mtx.shape == shape, f"{name} is wrong shape, expected {shape} found {mtx.shape}" 15 | 16 | def expand_dims(v, axis): 17 | return Lambda(lambda x: K.expand_dims(x,axis))(v) -------------------------------------------------------------------------------- /output/.gitignore: -------------------------------------------------------------------------------- 1 | * -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | python -m unittest discover test -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .test_memory_cell import test_memory_cell 
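For reference before the tests below: the NTMBase addressing ops in graph_ml/ntm.py reduce to a weighted sum and two outer products - read() dots the (one-hot or soft) address against the memory rows, write() adds the outer product of address and word, and erase() multiplies memory by one minus the outer product of address and erase word. A minimal NumPy sketch of the same arithmetic (illustrative only, not part of the repo; the Keras versions operate on batched tensors):

import numpy as np

def read(memory, address):
    # memory: (memory_size, word_size), address: (memory_size,)
    return address @ memory                                 # weighted sum of rows

def write(memory, address, word):
    return memory + np.outer(address, word)                 # add word into addressed slot(s)

def erase(memory, address, erase_word):
    return memory * (1.0 - np.outer(address, erase_word))   # blank out addressed slot(s)

memory = np.random.random((10, 4))
addr = np.eye(10)[3]                                        # one-hot address for slot 3
word = np.random.random(4)

# erase-then-write-then-read loops the word back, as test_memory_loopback checks
memory = write(erase(memory, addr, np.ones(4)), addr, word)
assert np.allclose(read(memory, addr), word)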
-------------------------------------------------------------------------------- /test/test_memory_cell.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import keras.backend as K 4 | from keras.utils.test_utils import keras_test 5 | from keras.models import Model 6 | from keras.layers import * 7 | 8 | from recurrentshop import RecurrentModel 9 | 10 | import numpy as np 11 | from numpy.testing import * 12 | 13 | import random 14 | from collections import namedtuple 15 | from tensorflow import float32 16 | from unittest import TestCase 17 | 18 | from graph_ml import Train, Dataset 19 | from graph_ml import NTMBase 20 | from experiment import Experiment, ExperimentHeader 21 | 22 | Args = namedtuple('DummyArgs', 'batch_size') 23 | 24 | 25 | class Tests(TestCase): 26 | 27 | @keras_test 28 | def test_memory_ops(self): 29 | 30 | memory_size = 10 31 | word_size = 4 32 | batch_size = 1 33 | 34 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":4, "patch_width":4}) 35 | experiment = Experiment("test_memory_cell", header, Args(batch_size)) 36 | 37 | # Initialise memory with zeros 38 | memory_initial = np.random.random((batch_size, memory_size, word_size)) 39 | memory_tm1 = K.constant(memory_initial, name="memory",dtype=float32) 40 | memory_t = memory_tm1 41 | 42 | # Write address is random int 43 | address_w = random.randint(0,memory_size - 1) 44 | address_one_hot_w = np.zeros([batch_size, memory_size]) 45 | address_one_hot_w[0][address_w] = 1.0 46 | t_address_w = K.constant(address_one_hot_w, name="address",dtype=float32) 47 | 48 | # Write random pattern 49 | write = np.random.random([batch_size, word_size]) 50 | t_write = K.constant(write, name="write") 51 | 52 | pb = NTMBase(experiment) 53 | memory_t = pb.write(memory_t, t_address_w, t_write) 54 | read = pb.read(memory_t, t_address_w) 55 | 56 | address_e = (address_w+1) % memory_size 57 | address_one_hot_e = np.zeros([batch_size, memory_size]) 58 | address_one_hot_e[0][address_e] = 1.0 59 | t_address_e = K.constant(address_one_hot_e, name="address",dtype=float32) 60 | 61 | t_erase = K.constant(np.ones([batch_size, word_size]),name="erase") 62 | memory_t = pb.erase(memory_t, t_address_e, t_erase) 63 | 64 | read_final = K.eval(read) 65 | memory_after_erase = K.eval(memory_t) 66 | 67 | write_expected = [write[0] + memory_initial[0][address_w]] 68 | 69 | for i in range(batch_size): 70 | for j in range(memory_size): 71 | if j == address_w: 72 | assert_allclose(memory_after_erase[i][j], write_expected[0]) 73 | elif j == address_e: 74 | assert_allclose(memory_after_erase[i][j], 0) 75 | else: 76 | assert_allclose(memory_after_erase[i][j], memory_initial[i][j]) 77 | 78 | assert_allclose(read_final, write_expected) 79 | 80 | 81 | @keras_test 82 | def test_memory_loopback(self): 83 | 84 | memory_size = 10 85 | word_size = 4 86 | batch_size = 1 87 | 88 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":4, "patch_width":4}) 89 | experiment = Experiment("test_memory_cell", header, Args(batch_size)) 90 | 91 | # Initialise memory with zeros 92 | memory_initial = np.random.random((batch_size, memory_size, word_size)) 93 | memory_tm1 = K.constant(memory_initial, name="memory",dtype=float32) 94 | memory_t = memory_tm1 95 | 96 | # Write address is random int 97 | address = random.randint(0,memory_size - 1) 98 | address_one_hot = np.zeros([batch_size, memory_size]) 99 | address_one_hot[0][address] = 1.0 100 | t_address = 
K.constant(address_one_hot, name="address",dtype=float32) 101 | 102 | # Write random pattern 103 | write = np.random.random([batch_size, word_size]) 104 | t_write = K.constant(write, name="write") 105 | t_erase = K.constant(np.ones([batch_size, word_size]),name="erase") 106 | 107 | pb = NTMBase(experiment) 108 | memory_t = pb.erase(memory_t, t_address, t_erase) 109 | memory_t = pb.write(memory_t, t_address, t_write) 110 | t_read = pb.read( memory_t, t_address) 111 | 112 | read_final = K.eval(t_read) 113 | 114 | assert_allclose(read_final, write) 115 | 116 | 117 | @keras_test 118 | def test_address_resolution(self): 119 | 120 | # Data setup 121 | memory_size = 20 122 | word_size = 4 123 | batch_size = 1 124 | patch_size = 10 125 | patch_width = memory_size + 5 126 | 127 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":patch_size, "patch_width":patch_width}) 128 | experiment = Experiment("test_memory_cell", header, Args(batch_size)) 129 | 130 | pointer = random.randint(0,patch_size - 1) 131 | pointer_one_hot = np.zeros([batch_size, patch_size]) 132 | pointer_one_hot[0][pointer] = 1.0 133 | 134 | patch = np.random.random([batch_size, patch_size, patch_width]) 135 | 136 | t_patch = K.constant(patch, dtype=float32, name="patch") 137 | t_pointer_one_hot = K.constant(pointer_one_hot, dtype=float32, name="pointer_one_hot") 138 | pb = NTMBase(experiment) 139 | resolved = K.eval(pb.resolve_address(t_pointer_one_hot, t_patch)) 140 | 141 | for i in range(batch_size): 142 | assert_almost_equal(resolved[i], patch[i][pointer][-memory_size::]) 143 | 144 | 145 | 146 | @keras_test 147 | def test_address_resolution_gradient(self): 148 | 149 | # Data setup 150 | memory_size = 20 151 | word_size = 4 152 | batch_size = 1 153 | patch_size = 10 154 | patch_width = memory_size + 5 155 | 156 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":patch_size, "patch_width":patch_width}) 157 | experiment = Experiment("test_memory_cell", header, Args(batch_size)) 158 | 159 | pb = NTMBase(experiment) 160 | 161 | ptr = Input((patch_size,), name="ptr") 162 | patch = Input((patch_size,patch_width), name="patch") 163 | memory = Input((memory_size, word_size), name="memory") 164 | 165 | resolved = pb.resolve_address(ptr, patch) 166 | read = pb.read(memory, resolved) 167 | 168 | out = Dense(3)(read) 169 | 170 | model = Model([ptr, patch, memory], out) 171 | model.compile(loss='mse', optimizer='sgd') 172 | 173 | model.fit({ 174 | "ptr": np.random.random((batch_size, patch_size)), 175 | "patch": np.random.random((batch_size, patch_size, patch_width)), 176 | "memory": np.random.random((batch_size, memory_size, word_size)), 177 | }, np.random.random((batch_size, 3))) 178 | 179 | 180 | model.predict({ 181 | "ptr": np.zeros((batch_size, patch_size)), 182 | "patch": np.zeros((batch_size, patch_size, patch_width)), 183 | "memory": np.zeros((batch_size, memory_size, word_size)), 184 | }) 185 | 186 | 187 | @keras_test 188 | def test_memory_gradient(self): 189 | 190 | # Data setup 191 | memory_size = 20 192 | word_size = 4 193 | batch_size = 1 194 | patch_size = 10 195 | patch_width = memory_size + 5 196 | 197 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":patch_size, "patch_width":patch_width}) 198 | experiment = Experiment("test_memory_cell", header, Args(batch_size)) 199 | 200 | pb = NTMBase(experiment) 201 | 202 | patch = Input((patch_size, patch_width), name="patch") 203 | memory_tm1 = 
Input((memory_size, word_size), name="memory") 204 | memory_t = memory_tm1 205 | 206 | flat_patch = Reshape((patch_size*patch_width,))(patch) 207 | 208 | write_word = Dense(word_size)(flat_patch) 209 | erase_word = Dense(word_size)(flat_patch) 210 | 211 | ptr = Dense(patch_size)(flat_patch) 212 | address = pb.resolve_address(ptr, patch) 213 | memory_t = pb.erase(memory_t, address, erase_word) 214 | 215 | ptr = Dense(patch_size)(flat_patch) 216 | address = pb.resolve_address(ptr, patch) 217 | memory_t = pb.write(memory_t, address, write_word) 218 | 219 | ptr = Dense(patch_size)(flat_patch) 220 | address = pb.resolve_address(ptr, patch) 221 | read = pb.read(memory_t, address) 222 | 223 | out = Dense(3)(read) 224 | 225 | model = Model([patch, memory_tm1], out) 226 | model.compile(loss='mse', optimizer='sgd') 227 | 228 | model.fit({ 229 | "patch": np.random.random((batch_size, patch_size, patch_width)), 230 | "memory": np.random.random((batch_size, memory_size, word_size)), 231 | }, np.random.random((batch_size, 3))) 232 | 233 | 234 | model.predict({ 235 | "patch": np.zeros((batch_size, patch_size, patch_width)), 236 | "memory": np.zeros((batch_size, memory_size, word_size)), 237 | }) 238 | 239 | 240 | 241 | 242 | @keras_test 243 | def test_memory_rnn_gradient(self): 244 | 245 | # Data setup 246 | memory_size = 20 247 | word_size = 4 248 | batch_size = 1 249 | patch_size = 10 250 | patch_width = memory_size + 5 251 | sequence_length = 10 252 | 253 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":patch_size, "patch_width":patch_width}) 254 | experiment = Experiment("test_memory_cell", header, Args(batch_size)) 255 | 256 | pb = NTMBase(experiment) 257 | 258 | patch = Input((patch_size, patch_width), name="patch") 259 | memory_tm1 = Input((memory_size, word_size), name="memory") 260 | memory_t = memory_tm1 261 | 262 | flat_patch = Reshape((patch_size*patch_width,))(patch) 263 | 264 | write_word = Dense(word_size)(flat_patch) 265 | erase_word = Dense(word_size)(flat_patch) 266 | 267 | ptr = Dense(patch_size)(flat_patch) 268 | address = pb.resolve_address(ptr, patch) 269 | memory_t = pb.erase(memory_t, address, erase_word) 270 | 271 | ptr = Dense(patch_size)(flat_patch) 272 | address = pb.resolve_address(ptr, patch) 273 | memory_t = pb.write(memory_t, address, write_word) 274 | 275 | ptr = Dense(patch_size)(flat_patch) 276 | address = pb.resolve_address(ptr, patch) 277 | read = pb.read(memory_t, address) 278 | 279 | out = Dense(3)(read) 280 | 281 | rnn = RecurrentModel(input=patch, output=out, initial_states=[memory_tm1], final_states=[memory_t]) 282 | a = Input((sequence_length, patch_size, patch_width), name="patch_seq") 283 | b = rnn(a) 284 | model = Model(a, b) 285 | model.compile(loss='mse', optimizer='sgd') 286 | 287 | model.fit({ 288 | "patch_seq": np.random.random((batch_size, sequence_length, patch_size, patch_width)), 289 | # "memory": np.random.random((batch_size, memory_size, word_size)), 290 | }, np.random.random((batch_size, 3))) 291 | 292 | 293 | model.predict({ 294 | "patch_seq": np.zeros((batch_size, sequence_length, patch_size, patch_width)), 295 | # "memory": np.zeros((batch_size, memory_size, word_size)), 296 | }) 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from experiment import 
Experiment 4 | 5 | if __name__ == '__main__': 6 | Experiment.run() 7 | 8 | --------------------------------------------------------------------------------
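As a reading aid for the tests above, here is a minimal NumPy sketch of the memory-op semantics that test/test_memory_cell.py asserts: additive write, multiplicative erase, weighted read, and pointer-based address resolution. It is reconstructed from the assertions alone, not taken from the repo's graph_ml/ntm.py (which operates on Keras tensors); the names mirror NTMBase, but the bodies are assumptions.

import numpy as np

# Assumed shapes, following the tests:
#   memory: (batch, memory_size, word_size)
#   address / pointer: (batch, memory_size) / (batch, patch_size), one-hot or soft weights
#   word / erase_word: (batch, word_size)
#   patch: (batch, patch_size, patch_width)

def write(memory, address, word):
    # Additive write: each row gains its address weight times the word, so a
    # one-hot address adds the word to exactly one row (matches write_expected
    # in test_memory_ops).
    return memory + address[:, :, None] * word[:, None, :]

def erase(memory, address, erase_word):
    # Multiplicative erase: the addressed row is scaled by (1 - erase_word);
    # an all-ones erase word zeroes that row, as test_memory_ops expects.
    return memory * (1.0 - address[:, :, None] * erase_word[:, None, :])

def read(memory, address):
    # Weighted sum over rows; with a one-hot address this returns one row.
    return np.einsum('bm,bmw->bw', address, memory)

def resolve_address(pointer, patch, memory_size):
    # Select one patch row with the pointer, then keep its trailing
    # memory_size entries, matching test_address_resolution.
    row = np.einsum('bp,bpw->bw', pointer, patch)
    return row[:, -memory_size:]

Under these definitions the loopback test holds: erasing a row with an all-ones erase word zeroes it, a subsequent write stores the word there, and reading the same address returns that word.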