├── .floydexpt
├── .floydignore
├── .gitignore
├── .idea
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── LICENSE
├── Pipfile
├── Pipfile.lock
├── README.md
├── bin
│   ├── floyd-run.sh
│   └── start_neo4j_locally.sh
├── config
│   ├── .gitignore
│   ├── __init__.py
│   ├── environment.py
│   └── overrides.py
├── data_sets
│   └── synthetic_review_prediction
│       ├── article_0
│       │   ├── __init__.py
│       │   ├── configure.py
│       │   └── generate.py
│       └── utils
│           └── dataset_writer.py
├── experiment
│   ├── __init__.py
│   ├── arguments.py
│   ├── directory.py
│   ├── experiment.py
│   └── experiment_header.py
├── floyd_requirements.txt
├── graph_ml
│   ├── __init__.py
│   ├── adjacency_layer.py
│   ├── dataset.py
│   ├── dataset_helpers.py
│   ├── model.py
│   ├── ntm.py
│   ├── path.py
│   ├── train.py
│   └── util.py
├── output
│   └── .gitignore
├── test.sh
├── test
│   ├── __init__.py
│   └── test_memory_cell.py
└── train.py
/.floydexpt:
--------------------------------------------------------------------------------
1 | {"family_id": "XaCDPUiGtasLwxhbKi4y7S", "name": "graph-investigations"}
--------------------------------------------------------------------------------
/.floydignore:
--------------------------------------------------------------------------------
1 |
2 | # Directories and files to ignore when uploading code to floyd
3 |
4 | .git
5 | .eggs
6 | eggs
7 | lib
8 | lib64
9 | parts
10 | sdist
11 | var
12 | *.pyc
13 | *.swp
14 | .DS_Store
15 | data
16 | output
17 | log
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *~
6 | # C extensions
7 | *.so
8 | .DS_Store
9 |
10 | # Distribution / packaging
11 | .Python
12 | env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | data/
29 | output/
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 |
60 | # Flask stuff:
61 | instance/
62 | .webassets-cache
63 |
64 | # Scrapy stuff:
65 | .scrapy
66 |
67 | # Sphinx documentation
68 | docs/_build/
69 |
70 | # PyBuilder
71 | target/
72 |
73 | # Jupyter Notebook
74 | .ipynb_checkpoints
75 |
76 | # pyenv
77 | .python-version
78 |
79 | # celery beat schedule file
80 | celerybeat-schedule
81 |
82 | # SageMath parsed files
83 | *.sage.py
84 |
85 | # dotenv
86 | .env
87 |
88 | # virtualenv
89 | .venv
90 | venv/
91 | ENV/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Octavian-ai
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 |
3 | url = "https://pypi.python.org/simple"
4 | verify_ssl = true
5 | name = "pypi"
6 |
7 |
8 | [packages]
9 |
10 | "neo4j-driver" = "*"
11 | tensorflow = "*"
12 | keras = "*"
13 | numpy = "*"
14 | lazy = "*"
15 | "h5py" = "*"
16 | colorama = "*"
17 | coloredlogs = "*"
18 | more-itertools = "*"
19 | recurrentshop = {git = "https://github.com/datalogai/recurrentshop.git"}
20 | generate-data = {git = "https://github.com/Octavian-ai/generate-data.git"}
21 | colored-traceback = "*"
22 | sklearn = "*"
23 | tqdm = "*"
24 | floyd-cli = "*"
25 |
26 |
27 | [dev-packages]
28 |
29 |
--------------------------------------------------------------------------------
/Pipfile.lock:
--------------------------------------------------------------------------------
1 | {
2 | "_meta": {
3 | "hash": {
4 | "sha256": "abe2e1e33a7a78d6c130b15bb5444b6c11496bc59b24304e35c505c2081a253b"
5 | },
6 | "host-environment-markers": {
7 | "implementation_name": "cpython",
8 | "implementation_version": "3.6.2",
9 | "os_name": "posix",
10 | "platform_machine": "x86_64",
11 | "platform_python_implementation": "CPython",
12 | "platform_release": "17.3.0",
13 | "platform_system": "Darwin",
14 | "platform_version": "Darwin Kernel Version 17.3.0: Thu Nov 9 18:09:22 PST 2017; root:xnu-4570.31.3~1/RELEASE_X86_64",
15 | "python_full_version": "3.6.2",
16 | "python_version": "3.6",
17 | "sys_platform": "darwin"
18 | },
19 | "pipfile-spec": 6,
20 | "requires": {},
21 | "sources": [
22 | {
23 | "name": "pypi",
24 | "url": "https://pypi.python.org/simple",
25 | "verify_ssl": true
26 | }
27 | ]
28 | },
29 | "default": {
30 | "args": {
31 | "hashes": [
32 | "sha256:a785b8d837625e9b61c39108532d95b85274acd679693b71ebb5156848fcf814"
33 | ],
34 | "version": "==0.1.0"
35 | },
36 | "backports.weakref": {
37 | "hashes": [
38 | "sha256:81bc9b51c0abc58edc76aefbbc68c62a787918ffe943a37947e162c3f8e19e82",
39 | "sha256:bc4170a29915f8b22c9e7c4939701859650f2eb84184aee80da329ac0b9825c2"
40 | ],
41 | "version": "==1.0.post1"
42 | },
43 | "bleach": {
44 | "hashes": [
45 | "sha256:e67f46adcec78dbc3c04462f3aba3213a673d5652eba2609ed1ef15492a44b8d",
46 | "sha256:978e758599b54cd3caa2e160d74102879b230ea8dc93871d0783721eef58bc65"
47 | ],
48 | "version": "==1.5.0"
49 | },
50 | "certifi": {
51 | "hashes": [
52 | "sha256:14131608ad2fd56836d33a71ee60fa1c82bc9d2c8d98b7bdbc631fe1b3cd1296",
53 | "sha256:edbc3f203427eef571f79a7692bb160a2b0f7ccaa31953e99bd17e307cf63f7d"
54 | ],
55 | "version": "==2018.1.18"
56 | },
57 | "chardet": {
58 | "hashes": [
59 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691",
60 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"
61 | ],
62 | "version": "==3.0.4"
63 | },
64 | "click": {
65 | "hashes": [
66 | "sha256:29f99fc6125fbc931b758dc053b3114e55c77a6e4c6c3a2674a2dc986016381d",
67 | "sha256:f15516df478d5a56180fbf80e68f206010e6d160fc39fa508b65e035fd75130b"
68 | ],
69 | "version": "==6.7"
70 | },
71 | "clint": {
72 | "hashes": [
73 | "sha256:05224c32b1075563d0b16d0015faaf9da43aa214e4a2140e51f08789e7a4c5aa"
74 | ],
75 | "version": "==0.5.1"
76 | },
77 | "colorama": {
78 | "hashes": [
79 | "sha256:463f8483208e921368c9f306094eb6f725c6ca42b0f97e313cb5d5512459feda",
80 | "sha256:48eb22f4f8461b1df5734a074b57042430fb06e1d61bd1e11b078c0fe6d7a1f1"
81 | ],
82 | "version": "==0.3.9"
83 | },
84 | "colored-traceback": {
85 | "hashes": [
86 | "sha256:f76c21a4b4c72e9e09763d4d1b234afc469c88693152a763ad6786467ef9e79f",
87 | "sha256:6da7ce2b1da869f6bb54c927b415b95727c4bb6d9a84c4615ea77d9872911b05"
88 | ],
89 | "version": "==0.3.0"
90 | },
91 | "coloredlogs": {
92 | "hashes": [
93 | "sha256:6bd7ceac109c3f2e138db8578396664b1067f32aca55d3280a57dbf05f1ada6c",
94 | "sha256:e3b19320bd21bde506444601a71397cf5215f040df06503013697c6261b05de9"
95 | ],
96 | "version": "==9.0"
97 | },
98 | "contextlib2": {
99 | "hashes": [
100 | "sha256:f5260a6e679d2ff42ec91ec5252f4eeffdcf21053db9113bd0a8e4d953769c00",
101 | "sha256:509f9419ee91cdd00ba34443217d5ca51f5a364a404e1dce9e8979cea969ca48"
102 | ],
103 | "markers": "python_version < '3.2'",
104 | "version": "==0.5.5"
105 | },
106 | "enum34": {
107 | "hashes": [
108 | "sha256:6bd0f6ad48ec2aa117d3d141940d484deccda84d4fcd884f5c3d93c23ecd8c79",
109 | "sha256:644837f692e5f550741432dd3f223bbb9852018674981b1664e5dc339387588a",
110 | "sha256:8ad8c4783bf61ded74527bffb48ed9b54166685e4230386a9ed9b1279e2df5b1",
111 | "sha256:2d81cbbe0e73112bdfe6ef8576f2238f2ba27dd0d55752a776c41d38b7da2850"
112 | ],
113 | "version": "==1.1.6"
114 | },
115 | "floyd-cli": {
116 | "hashes": [
117 | "sha256:0ecd7d42b91ab88b4e3e852f37c22f8ede849de96e5c0f3b9c20e4bd6fad5bbc"
118 | ],
119 | "version": "==0.10.31"
120 | },
121 | "funcsigs": {
122 | "hashes": [
123 | "sha256:330cc27ccbf7f1e992e69fef78261dc7c6569012cf397db8d3de0234e6c937ca",
124 | "sha256:a7bb0f2cf3a3fd1ab2732cb49eba4252c2af4240442415b4abce3b87022a8f50"
125 | ],
126 | "markers": "python_version < '3.3'",
127 | "version": "==1.0.2"
128 | },
129 | "futures": {
130 | "hashes": [
131 | "sha256:c4884a65654a7c45435063e14ae85280eb1f111d94e542396717ba9828c4337f",
132 | "sha256:51ecb45f0add83c806c68e4b06106f90db260585b25ef2abfcda0bd95c0132fd"
133 | ],
134 | "markers": "python_version < '3.2'",
135 | "version": "==3.1.1"
136 | },
137 | "generate-data": {
138 | "git": "https://github.com/Octavian-ai/generate-data.git"
139 | },
140 | "h5py": {
141 | "hashes": [
142 | "sha256:562045c57a2e47aca9c716ac8cd64448d4897c0f5fe456ab5a34b17c8b3907cb",
143 | "sha256:e1bfcfa2c425dc0f637d4edd858b94e400bbb5746dba324ace124d55fc21d3df",
144 | "sha256:9e0537058efea7547d976f9c00067f7193727bb41ce6b4733c52de35beaa46f5",
145 | "sha256:9d9fb861e10735c5c710fe18f34c69e470cf161a4ba38717b7dde21de2d33760",
146 | "sha256:2d137a1b2f529e58886b5865f6dec51cd96ea0671dd84cebc6dba5cd8c7d0a75",
147 | "sha256:2ccb4f405059314829ebad1859d2c68e133a9d13ca7c3cc7a298a76a438fd09c",
148 | "sha256:52204972a02032d6a427addd37a24a22a2b97d4bce0850c84a6995db9c91926c",
149 | "sha256:1be9cd57e74b24f836d0d2c34ae376ff2df704f40aa8815aa9113b5a860d467f",
150 | "sha256:2258fca3533a3276fd86e9196326786f408a95748ac707c010fff265edf60342",
151 | "sha256:66609c48f8841357ced4291b7c9009518bb6e6fec449d91eb46aa417b6f5f4cf",
152 | "sha256:4a6e6cd8668fa453864f4f9e243460dcc2d41e79d14516b84f4ba74ebcc5b222",
153 | "sha256:a314e5e98037ece52ad0b88b4e0d788ca554935268f3e9d293ca9bcd18611b42",
154 | "sha256:478efa37b84a56061af5fcd286678331e873e216f6c5987cd31f9666edc2f157",
155 | "sha256:2b91c9117f2e7a2ef924bec41ac77e57567bec6731773373bf78eb4387b39a2a",
156 | "sha256:07ddea6bb649a257fc57ccae359a36d691b2ef8b9617971ae7d6f74ef6f67cad",
157 | "sha256:bb990d8663dbeee22ce44135ffd65ab38bd23d6a689722a653cfbf2d18d46688",
158 | "sha256:e78f09a44fc9256b84c9df98edf7b6ead3b3da2e12bf2d1e00384960a6a78a1a",
159 | "sha256:40dd37cbf24ca3b935a8d6eb8960ec5d0381219f82317bdc40aa9e08b3fcc143",
160 | "sha256:1fad9aa32835230de77b31edd6980b7c202de7bb7d8384d1bcb47b5dd32c8c7c",
161 | "sha256:537a60879485e5ce484ab4350c7bd8b3da4b531f9f82ef0a18780beabde98c90",
162 | "sha256:c050791989cd9979fe57a770d4e323b2e67ef95800e89e7dc6ad3652b8ccd86f",
163 | "sha256:b7e1c42367513108c3615cf1a24a9d366fd93eb9d2d92085bafb3011b785e8a9",
164 | "sha256:180a688311e826ff6ae6d3bda9b5c292b90b28787525ddfcb10a29d5ddcae2cc"
165 | ],
166 | "version": "==2.7.1"
167 | },
168 | "html5lib": {
169 | "hashes": [
170 | "sha256:2612a191a8d5842bfa057e41ba50bbb9dcb722419d2408c78cff4758d0754868"
171 | ],
172 | "version": "==0.9999999"
173 | },
174 | "humanfriendly": {
175 | "hashes": [
176 | "sha256:587b16ce804bec8e3cbb8c420decea051b38e3d895272b2c1e38fc69b4286b1c",
177 | "sha256:d0e74171b87318a94b99520e4f0c5651e944b5f11d696c46be3330bb82b85300"
178 | ],
179 | "version": "==4.8"
180 | },
181 | "idna": {
182 | "hashes": [
183 | "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4",
184 | "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f"
185 | ],
186 | "version": "==2.6"
187 | },
188 | "keras": {
189 | "hashes": [
190 | "sha256:7b1116bad7fb497758cfaffcd180e9adc2904be7deec2d9164543955e9973d0b",
191 | "sha256:7ca3a381523bad40a6922e88951a316664cb088fd01cea07e5ec8ada3327e3c7"
192 | ],
193 | "version": "==2.1.3"
194 | },
195 | "lazy": {
196 | "hashes": [
197 | "sha256:c80a77bf7106ba7b27378759900cfefef38271088dc63b014bcfe610c8e68e3d"
198 | ],
199 | "version": "==1.3"
200 | },
201 | "markdown": {
202 | "hashes": [
203 | "sha256:9ba587db9daee7ec761cfc656272be6aabe2ed300fece21208e4aab2e457bc8f",
204 | "sha256:a856869c7ff079ad84a3e19cd87a64998350c2b94e9e08e44270faef33400f81"
205 | ],
206 | "version": "==2.6.11"
207 | },
208 | "marshmallow": {
209 | "hashes": [
210 | "sha256:8740ada95f47fa19f905772aa4932dc5512226a90c30da5672d6d6bf3dd791a7",
211 | "sha256:d3f31fe7be2106b1d783cbd0765ef4e1c6615505514695f33082805f929dd584"
212 | ],
213 | "version": "==2.15.0"
214 | },
215 | "mock": {
216 | "hashes": [
217 | "sha256:5ce3c71c5545b472da17b72268978914d0252980348636840bd34a00b5cc96c1",
218 | "sha256:b158b6df76edd239b8208d481dc46b6afd45a846b7812ff0ce58971cf5bc8bba"
219 | ],
220 | "version": "==2.0.0"
221 | },
222 | "monotonic": {
223 | "hashes": [
224 | "sha256:0bcd2b14e3b7ee7cfde796e408176ceffa01d89646f2e532964ef2aae0c9fa3e",
225 | "sha256:a02611d5b518cd4051bf22d21bd0ae55b3a03f2d2993a19b6c90d9d168691f84"
226 | ],
227 | "markers": "python_version == '2.6' or python_version == '2.7' or python_version == '3.0' or python_version == '3.1' or python_version == '3.2'",
228 | "version": "==1.4"
229 | },
230 | "more-itertools": {
231 | "hashes": [
232 | "sha256:11a625025954c20145b37ff6309cd54e39ca94f72f6bb9576d1195db6fa2442e",
233 | "sha256:0dd8f72eeab0d2c3bd489025bb2f6a1b8342f9b198f6fc37b52d15cfa4531fea",
234 | "sha256:c9ce7eccdcb901a2c75d326ea134e0886abfbea5f93e91cc95de9507c0816c44"
235 | ],
236 | "version": "==4.1.0"
237 | },
238 | "neo4j-driver": {
239 | "hashes": [
240 | "sha256:a25c9b67e63403b6ca8114d18bee581d2cff032cdc89c68970a4be8cd30585d0"
241 | ],
242 | "version": "==1.5.3"
243 | },
244 | "numpy": {
245 | "hashes": [
246 | "sha256:428cd3c0b197cf857671353d8c85833193921af9fafcc169a1f29c7185833d50",
247 | "sha256:a476e437d73e5754aa66e1e75840d0163119c3911b7361f4cd06985212a3c3fb",
248 | "sha256:289ff717138cd9aa133adcbd3c3e284458b9c8230db4d42b39083a3407370317",
249 | "sha256:c5eccb4bf96dbb2436c61bb3c2658139e779679b6ae0d04c5e268e6608b58053",
250 | "sha256:75471acf298d455b035226cc609a92aee42c4bb6aa71def85f77fa2c2b646b61",
251 | "sha256:5c54fb98ecf42da59ed93736d1c071842482b18657eb16ba6e466bd873e1b923",
252 | "sha256:9ddf384ac3aacb72e122a8207775cc29727cbd9c531ee1a4b95754f24f42f7f3",
253 | "sha256:781d3197da49c421a07f250750de70a52c42af08ca02a2f7bdb571c0625ae7eb",
254 | "sha256:93b26d6c06a22e64d56aaca32aaaffd27a4143db0ac2f21a048f0b571f2bfc55",
255 | "sha256:b2547f57d05ba59df4289493254f29f4c9082d255f1f97b7e286f40f453e33a1",
256 | "sha256:eef6af1c752eef538a96018ef9bdf8e37bbf28aab50a1436501a4aa47a6467df",
257 | "sha256:ff8a4b2c3ac831964f529a2da506c28d002562b230261ae5c16885f5f53d2e75",
258 | "sha256:194074058c22a4066e1b6a4ea432486ee468d24ab16f13630c1030409e6b8666",
259 | "sha256:4e13f1a848fde960dea33702770265837c72b796a6a3eaac7528cfe75ddefadd",
260 | "sha256:91101216d72749df63968d86611b549438fb18af2c63849c01f9a897516133c7",
261 | "sha256:97507349abb7d1f6b76b877258defe8720833881dc7e7fd052bac90c88587387",
262 | "sha256:1479b46b6040b5c689831496354c8859c456b152d37315673a0c18720b41223b",
263 | "sha256:98b1ac79c160e36093d7914244e40ee1e7164223e795aa2c71dcce367554e646",
264 | "sha256:24bbec9a199f938eab75de8390f410969bc33c218e5430fa1ae9401b00865255",
265 | "sha256:7880f412543e96548374a4bb1d75e4cdb8cad80f3a101ed0f8d0e0428f719c1c",
266 | "sha256:6112f152b76a28c450bbf665da11757078a724a90330112f5b7ea2d6b6cefd67",
267 | "sha256:7c5276763646480143d5f3a6c2acb2885460c765051a1baf4d5070f63d05010f",
268 | "sha256:3de643935b212307b420248018323a44ec51987a336d1d747c1322afc3c099fb"
269 | ],
270 | "version": "==1.14.0"
271 | },
272 | "pathlib2": {
273 | "hashes": [
274 | "sha256:db3e43032d23787d3e9aec8c7ef1e0d2c3c589d5f303477661ebda2ca6d4bfba",
275 | "sha256:d32550b75a818b289bd4c1f96b60c89957811da205afcceab75bc8b4857ea5b3"
276 | ],
277 | "version": "==2.3.0"
278 | },
279 | "pbr": {
280 | "hashes": [
281 | "sha256:60c25b7dfd054ef9bb0ae327af949dd4676aa09ac3a9471cdc871d8a9213f9ac",
282 | "sha256:05f61c71aaefc02d8e37c0a3eeb9815ff526ea28b3b76324769e6158d7f95be1"
283 | ],
284 | "version": "==3.1.1"
285 | },
286 | "protobuf": {
287 | "hashes": [
288 | "sha256:11788df3e176f44e0375fe6361342d7258a457b346504ea259a21b77ffc18a90",
289 | "sha256:50c24f0d00b7efb3a72ae638ddc118e713cfe8cef40527afe24f7ebcb878e46d",
290 | "sha256:41661f9a442eba2f1967f15333ebe9ecc7e7c51bcbaa2972303ad33a4ca0168e",
291 | "sha256:06ec363b74bceb7d018f2171e0892f03ab6816530e2b0f77d725a58264551e48",
292 | "sha256:b20f861b55efd8206428c13e017cc8e2c34b40b2a714446eb202bbf0ff7597a6",
293 | "sha256:c1f9c36004a7ae6f1ce4a23f06070f6b07f57495f251851aa15cc4da16d08378",
294 | "sha256:4d2e665410b0a278d2eb2c0a529ca2366bb325eb2ae34e189a826b71fb1b28cd",
295 | "sha256:95b78959572de7d7fafa3acb718ed71f482932ddddddbd29ba8319c10639d863"
296 | ],
297 | "version": "==3.5.1"
298 | },
299 | "pygments": {
300 | "hashes": [
301 | "sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d",
302 | "sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc"
303 | ],
304 | "version": "==2.2.0"
305 | },
306 | "pytz": {
307 | "hashes": [
308 | "sha256:80af0f3008046b9975242012a985f04c5df1f01eed4ec1633d56cc47a75a6a48",
309 | "sha256:feb2365914948b8620347784b6b6da356f31c9d03560259070b2f30cff3d469d",
310 | "sha256:59707844a9825589878236ff2f4e0dc9958511b7ffaae94dc615da07d4a68d33",
311 | "sha256:d0ef5ef55ed3d37854320d4926b04a4cb42a2e88f71da9ddfdacfde8e364f027",
312 | "sha256:c41c62827ce9cafacd6f2f7018e4f83a6f1986e87bfd000b8cfbd4ab5da95f1a",
313 | "sha256:8cc90340159b5d7ced6f2ba77694d946fc975b09f1a51d93f3ce3bb399396f94",
314 | "sha256:dd2e4ca6ce3785c8dd342d1853dd9052b19290d5bf66060846e5dc6b8d6667f7",
315 | "sha256:699d18a2a56f19ee5698ab1123bbcc1d269d061996aeb1eda6d89248d3542b82",
316 | "sha256:fae4cffc040921b8a2d60c6cf0b5d662c1190fe54d718271db4eb17d44a185b7"
317 | ],
318 | "version": "==2017.3"
319 | },
320 | "pyyaml": {
321 | "hashes": [
322 | "sha256:3262c96a1ca437e7e4763e2843746588a965426550f3797a79fca9c6199c431f",
323 | "sha256:16b20e970597e051997d90dc2cddc713a2876c47e3d92d59ee198700c5427736",
324 | "sha256:e863072cdf4c72eebf179342c94e6989c67185842d9997960b3e69290b2fa269",
325 | "sha256:bc6bced57f826ca7cb5125a10b23fd0f2fff3b7c4701d64c439a300ce665fff8",
326 | "sha256:c01b880ec30b5a6e6aa67b09a2fe3fb30473008c85cd6a67359a1b15ed6d83a4",
327 | "sha256:827dc04b8fa7d07c44de11fabbc888e627fa8293b695e0f99cb544fdfa1bf0d1",
328 | "sha256:592766c6303207a20efc445587778322d7f73b161bd994f227adaa341ba212ab",
329 | "sha256:5f84523c076ad14ff5e6c037fe1c89a7f73a3e04cf0377cb4d017014976433f3",
330 | "sha256:0c507b7f74b3d2dd4d1322ec8a94794927305ab4cebbe89cc47fe5e81541e6e8",
331 | "sha256:b4c423ab23291d3945ac61346feeb9a0dc4184999ede5e7c43e1ffb975130ae6",
332 | "sha256:ca233c64c6e40eaa6c66ef97058cdc80e8d0157a443655baa1b2966e812807ca",
333 | "sha256:4474f8ea030b5127225b8894d626bb66c01cda098d47a2b0d3429b6700af9fd8",
334 | "sha256:326420cbb492172dec84b0f65c80942de6cedb5233c413dd824483989c000608",
335 | "sha256:5ac82e411044fb129bae5cfbeb3ba626acb2af31a8d17d175004b70862a741a7"
336 | ],
337 | "version": "==3.12"
338 | },
339 | "raven": {
340 | "hashes": [
341 | "sha256:0adae40e004dfe2181d1f2883aa3d4ca1cf16dbe449ae4b445b011c6eb220a90",
342 | "sha256:84da75114739191bdf2388f296ffd6177e83567a7fbaf2701e034ad6026e4f3b"
343 | ],
344 | "version": "==6.5.0"
345 | },
346 | "recurrentshop": {
347 | "git": "https://github.com/datalogai/recurrentshop.git"
348 | },
349 | "requests": {
350 | "hashes": [
351 | "sha256:6a1b267aa90cac58ac3a765d067950e7dbbf75b1da07e895d1f594193a40a38b",
352 | "sha256:9c443e7324ba5b85070c4a818ade28bfabedf16ea10206da1132edaa6dda237e"
353 | ],
354 | "version": "==2.18.4"
355 | },
356 | "requests-toolbelt": {
357 | "hashes": [
358 | "sha256:42c9c170abc2cacb78b8ab23ac957945c7716249206f90874651971a4acff237",
359 | "sha256:f6a531936c6fa4c6cfce1b9c10d5c4f498d16528d2a54a22ca00011205a187b5"
360 | ],
361 | "version": "==0.8.0"
362 | },
363 | "scandir": {
364 | "hashes": [
365 | "sha256:913d0d04f3ea8f38a52a38e930a08deacd3643d71875a0751a5c01e006102998",
366 | "sha256:eb9d4a55bbeb0473a9c7d3ff81e12d44f0ad86daff48b02a95e2398c87ff1a00",
367 | "sha256:2b28d118b372de8950f85b65d8ddfd43643f139a5b721281dd6532bed6b8321c",
368 | "sha256:f14476800cfdd6809d5130840f78ca3c08aa25544113e2b33a0b2fe914583d69",
369 | "sha256:6db5aadb667bb709cc23921203e9c27f08225506a9b84b7ebe2b645dee47a4dd",
370 | "sha256:8129fe7b9211d080457e0ff87397d85bb9be6ebb482b6be6ad9700059ac2e516",
371 | "sha256:8fe782abf9314f2733c09d2191c1b3047475218ddbae90052b5c0f1a4215d5e2",
372 | "sha256:a93b6cc872eeccdc91b4c1c1e510820bee17f79c9455064fb8d3b73b51e52024",
373 | "sha256:9851e782da220073093da68b3451e3c33b10f84eca2aec17a24661c7c63357a2",
374 | "sha256:937d27e367af994afd3792904b794a82645ea9616dd336f5030e0b50e527eb57",
375 | "sha256:e0278a2d4bc6c0569aedbe66bf26c8ab5b2b08378b3289de49257f23ac624338"
376 | ],
377 | "markers": "python_version < '3.5'",
378 | "version": "==1.6"
379 | },
380 | "scikit-learn": {
381 | "hashes": [
382 | "sha256:3775cca4ce3f94508bb7c8a6b113044b78c16b0a30a5c169ddeb6b9fe57a8a72",
383 | "sha256:873245b03361710f47c5410a050dc56ee8ae97b9f8dcc6e3a81521ca2b64ad10",
384 | "sha256:370919e3148253fd6552496c33a1e3d78290a336fc8d1b9349d9e9770fae6ec0",
385 | "sha256:ce78bf4d10bd7e28807c36c6d2ab25a9934aaf80906ad987622a5e45627d91a2",
386 | "sha256:ba3fd442ae1a46830789b3578867daaf2c8409dcca6bf192e30e85beeabbfc2f",
387 | "sha256:a21cf8217e31a9e8e32c559246e05e6909981816152406945ae2e3e244dfcc1f",
388 | "sha256:e54a3dd1fe1f8124de90b93c48d120e6da2ea8df29b6895325df01ddc1bd8e26",
389 | "sha256:f9abae483f4d52acd6f660addb1b67e35dc5748655250af479de2ea6aefc6df0",
390 | "sha256:5c9ff456d67ef9094e5ea272fff2be05d399a47fc30c6c8ed653b94bdf787bd1",
391 | "sha256:871669cdb5b3481650fe3adff46eb97c455e30ecdc307eaf382ef90d4e2570ab",
392 | "sha256:d4da369614e55540c7e830ccdd17ab4fe5412ff8e803a4906d3ece393e2e3a63",
393 | "sha256:42f3c5bd893ed73bf47ccccf04dfb98fae743f397d688bb58c2238c0e6ec15d2",
394 | "sha256:95b155ef6bf829ddfba6026f100ba8e4218b7171ecab97b2163bc9e8d206848f",
395 | "sha256:72c194c5092e921d6107a8de8a5adae58c35bbc54e030ba624b6f02fd823bb21",
396 | "sha256:f528c4b2bba652cf116f5cccf36f4db95a7f9cbfcd1ee549c4e8d0f8628783b5",
397 | "sha256:d384e6f9a055b7a43492f9d27779adb717eb5dcf78b0603b01d0f070a608d241",
398 | "sha256:ee8c3b1898c728b6e5b5659c233f547700a1fea13ce876b6fe7d3434c70cc0e0",
399 | "sha256:56cfa19c31edf62e6414da0a337efee37a4af488b135640e67238786b9be6ab3",
400 | "sha256:5db9e68a384ce80a17fc449d4d5d9b45025fe17cf468429599bf404eccb51049",
401 | "sha256:8b17fc29554c5c98d88142f895516a5bec2b6b61daa815e1193a64c868ad53d2",
402 | "sha256:13136c6e4f6b808569f7f59299d439b2cd718f85d72ea14b5b6077d44ebc7d17",
403 | "sha256:ddc1eb10138ae93c136cc4b5945d3977f302b5d693592a4731b2805a7d7f2a74",
404 | "sha256:5ca0ad32ee04abe0d4ba02c8d89d501b4e5e0304bdf4d45c2e9875a735b323a0",
405 | "sha256:6e0899953611d0c47c0d49c5950082ab016b38811fced91cd2dcc889dd94f50a",
406 | "sha256:b2a10e2f9b73de10d8486f7a23549093436062b69139158802910a0f154aa53b",
407 | "sha256:a58746d4f389ea7df1d908dba8b52f709835f91c342f459a3ade5424330c69d1",
408 | "sha256:fdc39e89bd3466befb76dfc0c258d4ccad159df974954a87de3be5759172a067"
409 | ],
410 | "version": "==0.19.1"
411 | },
412 | "scipy": {
413 | "hashes": [
414 | "sha256:70e6fc3f2f52c9152f05e27eb9bd8543cb862cacb71f8521a571e4ffb837f450",
415 | "sha256:08041e5336fcd57defcc78650b44b3df652eff3e3a801638d894e50494fb630d",
416 | "sha256:ff8b6637d8d2c074ed67f3d57513e62f94747c6f1210f43e60ad3d8e93a424e4",
417 | "sha256:5964dba6a3c0be226d44d2520de8fb4ba1501768bad57eec687d36d3f53b6254",
418 | "sha256:bf36f3485e7b7291c36330a93bbfd4f5e8db23bbe4ea46c37b2839fef463f4e2",
419 | "sha256:e3a5673c105eab802fdecb77f102d877352e201df9328698a265b7f57546b34b",
420 | "sha256:cd23894e1cc6eaa00e6807b6b12e4ca66d5ff092986c9c3eb01e97f24e2d6462",
421 | "sha256:23a7238279ae94e088396b8b05a9795ef598dc79c5cd1adb91ad1ff87c7514fd",
422 | "sha256:3b66d5e40152175bca75cbbfd1eb5c108c50de9ae5625923f1c4f8f51cbe2dea",
423 | "sha256:fa17be6c66985931d3a391f61a6ba97c902585cf26020aa3eb24604115732d22",
424 | "sha256:d84df0bc86bbdd49f0a6b6bad5cd62ccb02a3bfe546bf79263de44ae081bcd7b",
425 | "sha256:912499ddb521b7ac6287ac4ccf5f296a83d38996c2d04f43c9e62a91f7b420aa",
426 | "sha256:889602ead28054a15e8c26e1a6b8420d5a4fa777cfeb3ec98cfa52b9f317d153",
427 | "sha256:5774adb6047983489bc81edaa72cd132e665e5680f0b2cf8ea28cd3b99e65d39",
428 | "sha256:01c7040a83eb4e020ab729488637dcadef54cb728b035b76668ab92a72515d60",
429 | "sha256:046705c604c6f1d63cad3e89677c0618b7abb40ed09a4c241c671a2d8e5128a9",
430 | "sha256:1f58fbd59e8d9652759df0d137832ff2a325ed708c173cba20c86589d811c210",
431 | "sha256:424500b2fe573d30de6dea927076c01acaadb3efb3d1f40340e8cc37151ccf27",
432 | "sha256:97123a25216616723083942eb595f47fee18da6b637a88b803de5f078009003c",
433 | "sha256:a79b99b8b5af9a63312bd053bbb7bdb7710e6bbb9cc81617f9f6b9b1e49c72f8",
434 | "sha256:9bd193686fd837472bdb6425486cb234ed0a4db76b930c141cc8d095ab213c8d",
435 | "sha256:a9e479648aab5f36330da94f351ebbfe79acb4e6f5e6ac6aeddc9291eb096839",
436 | "sha256:87ea1f11a0e9ec08c264dc64551d501fa307289460705f6fccd84cbfc7926d10"
437 | ],
438 | "version": "==1.0.0"
439 | },
440 | "six": {
441 | "hashes": [
442 | "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb",
443 | "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9"
444 | ],
445 | "version": "==1.11.0"
446 | },
447 | "sklearn": {
448 | "hashes": [
449 | "sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31"
450 | ],
451 | "version": "==0.0"
452 | },
453 | "tabulate": {
454 | "hashes": [
455 | "sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2"
456 | ],
457 | "version": "==0.8.2"
458 | },
459 | "tensorflow": {
460 | "hashes": [
461 | "sha256:f9c03acc5d26ac903e177fb904ceb797632830c5a0fae5c8b49d688a748337db",
462 | "sha256:c6d798da0002778f38e3b097acd7a620c89ff060fa3823c054113885b2472173",
463 | "sha256:975cbdeb016c3f14ad44f4919260e279918fba08c4bb3d7172ae4bf1aa612292",
464 | "sha256:62e3884a1d7824f20a172ae2861aab50b1802989e85a971f9dfaf61444226856",
465 | "sha256:9e6681a4b1e46936dbcc56ac213f61633979f6f348319658431181ffc3c1936c",
466 | "sha256:e43641ac5bbfc8a0d37fb8b78657f664856fe83b1ab7acf298f57780e6fbf2de",
467 | "sha256:cceb8439975ea508ffd19a312d7ff83149ab81d7e8a88685852bbea4ded98736",
468 | "sha256:bf51429bc11ab4561b5d124c08a5ee6476519d33b5970338586767563a02adc4",
469 | "sha256:ee96a38a3ba3c53e1cdd8cc2af59d5f378b7992e63c54fba9605c963b209e814",
470 | "sha256:233d66bfad2287c61434384ec315bbf37b2f551beda2e0d37a8c24a0f2ed3896"
471 | ],
472 | "version": "==1.4.1"
473 | },
474 | "tensorflow-tensorboard": {
475 | "hashes": [
476 | "sha256:4ff1c16faa8189c921b57ccb5f05ea1e19c276d59de7dcae3d846a6267a132d0",
477 | "sha256:6684571c711e07b3aae25dd91cb4b106738d71acfce385b9d359ab14374ac518"
478 | ],
479 | "version": "==0.4.0"
480 | },
481 | "tqdm": {
482 | "hashes": [
483 | "sha256:4c041f8019f7be65b8028ddde9a836f7ccc51c4637f1ff2ba9b5813d38d19d5a",
484 | "sha256:df32e6f127dc0ccbc675eadb33f749abbcb8f174c5cb9ec49c0cdb73aa737377"
485 | ],
486 | "version": "==4.19.5"
487 | },
488 | "urllib3": {
489 | "hashes": [
490 | "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
491 | "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
492 | ],
493 | "version": "==1.22"
494 | },
495 | "werkzeug": {
496 | "hashes": [
497 | "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b",
498 | "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c"
499 | ],
500 | "version": "==0.14.1"
501 | },
502 | "wheel": {
503 | "hashes": [
504 | "sha256:e721e53864f084f956f40f96124a74da0631ac13fbbd1ba99e8e2b5e9cafdf64",
505 | "sha256:9515fe0a94e823fd90b08d22de45d7bde57c90edce705b22f5e1ecf7e1b653c8"
506 | ],
507 | "version": "==0.30.0"
508 | }
509 | },
510 | "develop": {}
511 | }
512 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Join our Discord >> https://discord.gg/a2Z82Te
2 |
3 | # Review prediction
4 |
5 | ## Introduction
6 |
7 | The aim of this experiment is to investigate the performance of
8 | 1) different NN approaches
9 | 2) different graph representations of the same data
10 |
11 | on a simple synthetic prediction task.
12 |
13 | ## The Task
14 |
15 | We model personalised recommendations as a system containing _people_, _products_ and _reviews_. In our system every product has a _style_ and each person has a _style preference_. _People_ can make _reviews_ of products. In our system the _review score_ will be a function _Y(...)_ of the person's _style preference_ and the product's _style_. We call this function the _opinion function_ i.e.:
16 |
17 | _review_score_ = _Y(product_style, person_style_preference)_
18 |
19 | We will generate data using this model. We will then use this synthetic data to investigate how effective various ML approaches are at learning the behaviour of this system.
20 |
21 |
22 | If necessary we can change the opinion function _Y(...)_ to increase or decrease the difficulty of the task.
23 |
24 | ## The Synthetic Data
25 |
26 | The synthetic data for this task can be varied in several ways:
27 |
28 | 1) Change which information is hidden e.g. we could hide _product_style_, _style_preference_ or both.
29 | 1) Change the representation of the key properties e.g. reviews/styles and preferences could be boolean, categorical, continuous scalars or even multi-dimensional vectors.
30 | 1) Change how the data is represented as a graph e.g. reviews could be nodes in their own right, or they could be edges with properties; product_style could be a property on a product node, or a separate node connected to a product node by a _HAS_STYLE_ relationship (edge).
31 | 1) Add additional meaningless or semi-meaningless information to the training data.
32 |
33 | We will generate different data sets to qualitatively investigate different ML approaches on the same basic system.
34 |
35 |
36 | ## Evaluation Tasks
37 |
38 | We are interested in four different evaluation tasks depending on whether the person or product is included in the training set or not:
39 |
40 | - **new product == unknown** at training time i.e. not in training set or validation set
41 | - **new person == unknown** at training time i.e. not in training set or validation set
42 | - **existing product == known** at training time i.e. present in training set
43 | - **existing person == known** at training time i.e. present in training set
44 |
45 | The evaluation tasks we are interested in ask: how well can you predict the person's review, given:
46 |
47 | 1) new product and new person
48 | 1) existing product and new person
49 | 1) new product and existing person
50 | 1) existing product and existing person
51 |
52 |
53 | ## Approach
54 |
55 | Although we have a synthetic system for which we can generate more data, we want to get into good habits for working with "real" data. So we will attempt to blind the ML system to the fact that we are working with synthetic data and not rely on our ability to generate more information at will.
56 |
57 | It will be the responsibility of the ML part of the system to split the data into Test / Train and Validation sets. However, for each data set that we generate we will keep back a small portion to make up a "golden" test set which is only to be used at the very end of our investigation. This is to perform a final test of the ML predictor, one for which we haven't had the opportunity to optimise the meta-parameters.
58 |
59 | Because of the four different evaluation tasks it will be necessary for us to keep back four different golden test sets, of a large enough size to test the system regardless of the test/training split. We will keep the following volumes of golden test data:
60 |
61 | 1) INDEPENDENT: A completely independent data set containing 1000 reviews
62 | 2) NEW_PEOPLE: new people + their reviews of existing products containing approx 2000 reviews
63 | 3) NEW_PRODUCTS: new products + reviews of them by existing people containing approx 2000 reviews
64 | 4) EXISTING: 2000 additional reviews between existing people and products.
65 |
66 |
67 |
68 | # The Data Sets
69 |
70 | ## Data Set 1: A simple binary preference system
71 |
72 | Products have a binary style and people have a binary preference.
73 |
74 | - All variables will be 'public' in the data set
75 |
76 |
77 | ### Product Style
78 | - _product_style_ will be categorical with two mutually exclusive elements (A and B).
79 | - The distribution of product styles will be uniform i.e. Approx 50% of products will have style A and 50% will have style B.
80 |
81 |
82 | ### Style Preference
83 | - _person_style_preference_ will be categorical with two mutually exclusive elements (likes_A_dislikes_B | likes_B_dislikes_A ).
84 | - The distribution of style preferences will be uniform i.e. Approx 50% of people will like style A and 50% will like style B.
85 |
86 |
87 | ### Reviews and Opinion Function
88 | - _review_score_ will be boolean (1 for a positive review and 0 for a negative review)
89 | - Each person will have made either 1 or 2 reviews. The mean number of reviews-per-person will be approx 1.5 i.e. approx 50% will have made 2 reviews and 50% will have made 1 review.
90 | - _review_score_ is the dot product of the _product_style_ and _person_style_preference_ normalised to the range of 0 to 1
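For the binary data set this opinion function can be written down directly. Below is a minimal sketch; the one-hot encoding and names are assumptions for illustration, and the repo's own `opinion_function` (imported in `data_sets/synthetic_review_prediction/article_0/configure.py`) may differ:

```python
import numpy as np

# Assumed one-hot encodings of the two mutually exclusive styles / preferences.
PRODUCT_STYLES = {"A": np.array([1.0, 0.0]), "B": np.array([0.0, 1.0])}
STYLE_PREFERENCES = {"likes_A_dislikes_B": np.array([1.0, 0.0]),
                     "likes_B_dislikes_A": np.array([0.0, 1.0])}

def opinion(product_style: str, style_preference: str) -> float:
    # review_score = dot(product_style, style_preference); already in [0, 1] for one-hot vectors
    return float(np.dot(PRODUCT_STYLES[product_style], STYLE_PREFERENCES[style_preference]))

assert opinion("A", "likes_A_dislikes_B") == 1.0   # matching style   -> positive review
assert opinion("A", "likes_B_dislikes_A") == 0.0   # mismatched style -> negative review
```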
91 |
92 | Note: having people with 0 reviews would be useless since you cannot train or validate/test using them.
93 |
94 | Note: fixing the number of reviews-per-person would restrict the graph structure too much and open up the problem to approaches that we aren't interested in right now.
95 |
96 |
97 | ### Entity Ratios and Data Set Size
98 |
99 | I basically made these up. Intuitively the reviews-per-product and reviews-per-person parameters affect how much we can infer about people/product hidden variables. I like the idea of those figures being very different so we can see how systems cope with that distinction.
100 |
101 | - _people_:_products_ = 50:1
102 | - _people_:_reviews_ = 1:1.5
103 | - _reviews_:_products_ = 75:1
104 |
105 | Data set size: 12000 reviews / 160 products / 8000 people
106 |
107 | n.b. because we assign the reviews randomly some products may not have reviews, but it is relatively unlikely.
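The ratios and the data set size above are mutually consistent; a throwaway arithmetic check (not part of the repo):

```python
people, products, reviews = 8000, 160, 12000

assert people / products == 50     # people : products  = 50 : 1
assert reviews / people == 1.5     # people : reviews   = 1 : 1.5
assert reviews / products == 75    # reviews : products = 75 : 1
```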
108 |
109 | ### Graph Schema
110 |
111 | PERSON(id: , style_preference: A|B, is_golden: True|False) -- WROTE(is_golden: True|False) --> REVIEW(id: , score: 1|0, is_golden: True|False) -- OF(is_golden: True|False) --> PRODUCT(id: , style: A|B, is_golden: True|False)
112 |
113 | ### Data generation algorithm
114 |
115 | 1) Instantiate all products for the public data set and write them to Neo, keeping an array of the ids.
116 | 1) Iteratively instantiate people; for each person, decide (probabilistically) how many reviews they will have made.
117 | 1) For each review that the person has to make, randomly choose a product to review (without replacement).
118 | 1) Calculate the review score and submit the person + their reviews to Neo.
119 | 1) Read the data back out of Neo and validate the entity ratios.
120 | 1) Create the golden test sets:
121 |    - NEW_PEOPLE: create 2000/reviews_per_person new people + their reviews of randomly selected (with replacement) existing products.
122 |    - NEW_PRODUCTS: create 2000/reviews_per_product new products, and have randomly selected (with replacement) people review them.
123 |    - EXISTING: randomly pick 2000 people (with replacement) and have each of them review a randomly selected (with replacement) product.
124 |    - INDEPENDENT is easy, but best left till last to avoid confusion - just repeat the basic data generation from scratch.
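A compact sketch of steps 1-4 above; the `writer` and `opinion` helpers here are hypothetical placeholders, and the real implementation lives under `data_sets/synthetic_review_prediction/`:

```python
import random

def generate_public_data(writer, n_products, n_people, opinion):
    # Step 1: instantiate all public products, keeping their ids.
    product_ids = [writer.create_product(style=random.choice(["A", "B"]))
                   for _ in range(n_products)]

    # Step 2: iteratively instantiate people and decide how many reviews each will have made.
    for _ in range(n_people):
        preference = random.choice(["likes_A_dislikes_B", "likes_B_dislikes_A"])
        person_id = writer.create_person(style_preference=preference)
        n_reviews = random.choice([1, 2])  # mean of ~1.5 reviews per person

        # Step 3: pick products to review without replacement, then
        # Step 4: calculate each review score and submit it.
        for product_id in random.sample(product_ids, n_reviews):
            score = opinion(writer.product_style(product_id), preference)
            writer.create_review(person_id, product_id, score)
```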
125 |
126 |
127 |
--------------------------------------------------------------------------------
/bin/floyd-run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | floyd run \
4 | --data davidmack/datasets/graph_experiments/1:/data \
5 | --env tensorflow-1.4 \
6 | --gpu \
7 | --tensorboard \
8 | --message "adj dense with dropout" \
9 | "ENVIRONMENT=floyd python train.py \
10 | --output-dir /output \
11 | --data-dir /data/ \
12 | --epochs 100"
--------------------------------------------------------------------------------
/bin/start_neo4j_locally.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | CONTAINER_ID=$(docker run -d -e NEO4J_dbms_memory_heap_max__size=2000m --publish=7474:7474 --publish=7687:7687 --volume=$(pwd)/data/neo4j:/data neo4j:3.2.7)
4 | sleep 10
5 | docker run -it --net host neo4j:3.2.7 bin/cypher-shell -u neo4j -p neo4j "CALL dbms.changePassword('local neo hates security!')"
6 |
7 | echo "Neo4j running locally. To stop it: docker kill ${CONTAINER_ID}"
--------------------------------------------------------------------------------
/config/.gitignore:
--------------------------------------------------------------------------------
1 | local*
--------------------------------------------------------------------------------
/config/__init__.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from .environment import Environment
3 |
4 | # There are too many layers and too many files to this config system
5 | default_values = {
6 | 'neo4j_url': 'bolt://localhost',
7 | 'neo4j_user': 'neo4j',
8 | 'neo4j_password': 'local neo hates security!'
9 | }
10 |
11 | environment_box = Environment(None)
12 |
13 |
14 | def set_environment(environment_name):
15 | environment_box.name = environment_name
16 |
17 |
18 | def get(config_variable_name):
19 | # don't execute code in overrides till necessary
20 | from .overrides import overrides
21 | return overrides[environment_box.name].get(config_variable_name, default_values[config_variable_name])
22 |
23 |
24 | class Config(object):
25 | @property
26 | def neo4j_url(self):
27 | return get('neo4j_url')
28 |
29 | @property
30 | def neo4j_user(self):
31 | return get('neo4j_user')
32 |
33 | @property
34 | def neo4j_password(self):
35 | return get('neo4j_password')
36 |
37 |
38 | config: Config = Config()
39 |
40 | import os
41 |
42 | if 'ENVIRONMENT' not in os.environ:
43 | raise Exception("You must set an ENVIRONMENT variable. Sorry, I am very opinionated that we should not have a default value because it will mask misconfiguration issues later.")
44 | set_environment(os.environ['ENVIRONMENT'])
--------------------------------------------------------------------------------
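A usage sketch for the module above (editorial illustration, not code from the repo; it assumes `ENVIRONMENT` is exported before the process starts, e.g. `ENVIRONMENT=local python train.py`):

```python
import os
os.environ.setdefault("ENVIRONMENT", "local")  # normally set in the shell, shown here only for illustration

from config import config

# Each property resolves lazily: overrides.py is consulted for the active
# environment first, then default_values is used as the fallback.
print(config.neo4j_url)   # 'bolt://localhost' for the 'local' environment
print(config.neo4j_user)  # 'neo4j'
```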
/config/environment.py:
--------------------------------------------------------------------------------
1 | class Environment(object):
2 | def __init__(self, name):
3 | self.name = name
--------------------------------------------------------------------------------
/config/overrides.py:
--------------------------------------------------------------------------------
1 | import json
2 | overrides = dict()
3 |
4 | # There are too many layers and too many files to this config system
5 | overrides.update(**{
6 | 'remote': {
7 | 'neo4j_url': 'bolt://796bafef-staging.databases.neo4j.io',
8 | 'neo4j_user': 'readonly',
9 | 'neo4j_password': '0s3DGA6Zq'
10 | },
11 | 'floyd': { # Todo: implement me?
12 | 'neo4j_url': 'bolt://796bafef-staging.databases.neo4j.io',
13 | 'neo4j_user': 'readonly',
14 | 'neo4j_password': '0s3DGA6Zq'
15 | },
16 | 'local': { # Just uses defaults
17 |
18 | }
19 | })
20 |
21 | try:
22 |     with open('./config/local_overrides.json') as f:
23 |         overrides.update(json.load(f))
24 | except FileNotFoundError:
25 |     pass  # local_overrides.json is optional (it is gitignored via config/.gitignore)
--------------------------------------------------------------------------------
/data_sets/synthetic_review_prediction/article_0/__init__.py:
--------------------------------------------------------------------------------
1 | from .configure import DATASET_NAME, create_data_set_properties
2 | from .generate import run as _run
3 |
4 |
5 | def run(client):
6 | print(DATASET_NAME)
7 | return _run(client, create_data_set_properties())
8 |
--------------------------------------------------------------------------------
/data_sets/synthetic_review_prediction/article_0/configure.py:
--------------------------------------------------------------------------------
1 | from ..meta_classes import DataSetProperties
2 | from ..meta_classes.data_set_properties import PersonStyleWeightDistribution, PersonStyleWeight, ProductStyleWeight
3 | from ..utils import WeightedOption, Distribution
4 | from ..classes import PersonStylePreferenceEnum, ProductStyleEnum, Style
5 | from ..experiment_1.opinion_function import opinion_function
6 | from ..experiment_1.style_functions import person_style_function, product_style_function
7 | from graph_io.classes.dataset_name import DatasetName
8 |
9 | DATASET_NAME = DatasetName('article_0')
10 |
11 |
12 | def create_data_set_properties() -> DataSetProperties:
13 | N_STYLES = 2
14 | styles = [Style(str(i)) for i in range(N_STYLES)]
15 |
16 | for style in styles:
17 | ProductStyleEnum.register('LIKES_STYLE_'+style.value, style)
18 | PersonStylePreferenceEnum.register('HAS_STYLE_'+style.value, style)
19 |
20 | data_set_properties = DataSetProperties(
21 | dataset_name=DATASET_NAME,
22 | n_reviews=20000,
23 | reviews_per_product=10,
24 | reviews_per_person_distribution=[
25 | WeightedOption[int](1, 0.25),
26 | WeightedOption[int](2, 0.25),
27 | WeightedOption[int](3, 0.25),
28 | WeightedOption[int](4, 0.25)
29 | ],
30 | person_styles_distribution=PersonStyleWeightDistribution([
31 | PersonStyleWeight(x, 1) for x in PersonStylePreferenceEnum.iterate()
32 | ]),
33 | product_styles_distribution=Distribution[ProductStyleWeight, ProductStyleEnum]([
34 | ProductStyleWeight(x, 1) for x in ProductStyleEnum.iterate()
35 | ]),
36 | opinion_function=opinion_function,
37 | person_style_function=person_style_function,
38 | product_style_function=product_style_function,
39 | n_companies=0,
40 | person_company_number_of_relationships_distribution=[]
41 | )
42 |
43 | return data_set_properties
44 |
--------------------------------------------------------------------------------
/data_sets/synthetic_review_prediction/article_0/generate.py:
--------------------------------------------------------------------------------
1 | from ..classes import PersonWroteReview, ReviewOfProduct, IsGoldenFlag
2 | import random
3 |
4 | from ..meta_classes import DataSetProperties
5 | from ..experiment_1.simple_data_set import SimpleDataSet
6 | from ..utils import DatasetWriter
7 | from graph_io import QueryParams, CypherQuery
8 |
9 |
10 | def run(client, data_set_properties: DataSetProperties):
11 |
12 | with DatasetWriter(client, data_set_properties.dataset_name, {"is_golden",""}) as writer:
13 |
14 | writer.nuke_dataset()
15 |
16 | data_set: SimpleDataSet = SimpleDataSet(data_set_properties)
17 |
18 | def create_indexes():
19 | client.execute_cypher_write(CypherQuery("CREATE INDEX ON :NODE(id)"), QueryParams())
20 | #client.execute_cypher_write(CypherQuery("CREATE INDEX ON :NODE(id, dataset_name)"), QueryParams())
21 | pass
22 |
23 | create_indexes()
24 |
25 | for i, product in enumerate(data_set.generate_public_products()):
26 | writer.create_node_if_not_exists(product, {"style"})
27 |
28 | for i, person in enumerate(data_set.generate_public_people()):
29 | writer.create_node_if_not_exists(person, {"style_preference"})
30 |
31 | for review in data_set.generate_reviews(person):
32 | review.test = random.random() <= 0.1
33 | writer.create_node_if_not_exists(review, {"score", "test"})
34 | writer.create_edge_if_not_exists(PersonWroteReview(review.by_person, review.id, IsGoldenFlag(False)), set())
35 | writer.create_edge_if_not_exists(ReviewOfProduct(review.id, review.of_product, IsGoldenFlag(False)), set())
36 |
37 |
38 |
--------------------------------------------------------------------------------
/data_sets/synthetic_review_prediction/utils/dataset_writer.py:
--------------------------------------------------------------------------------
1 | from graph_io import SimpleNodeClient, CypherQuery, QueryParams
2 | from ..classes import GraphNode, GraphEdge, IsGoldenFlag
3 | from graph_io.classes.dataset_name import DatasetName
4 | from typing import Set, AnyStr
5 | from multiprocessing.pool import ThreadPool
6 | from multiprocessing.queues import Queue
7 | from uuid import UUID
8 |
9 |
10 | class DatasetWriter(object):
11 | ADDITIONAL_NODE_PROPERTIES: Set[AnyStr] = {'id'}
12 |
13 | def __init__(self,
14 | client: SimpleNodeClient,
15 | dataset_name: DatasetName,
16 | properties_to_ignore: Set[str] = set()
17 | ):
18 | self.properties_to_ignore = properties_to_ignore
19 | self.dataset_name = dataset_name
20 | self._client = client
21 | self.pool = ThreadPool(1)
22 |
23 | def __enter__(self):
24 | # TODO: do query batching with a buffer etc. to increase performance
25 | return self
26 |
27 | def __exit__(self, exc_type, exc_val, exc_tb):
28 | self._client.run_batch()
29 | # TODO: on non error exits wait until the buffer has all flushed
30 | pass
31 |
32 | def nuke_dataset(self):
33 | query = CypherQuery("MATCH (n:NODE {dataset_name: $dataset_name}) DETACH DELETE n")
34 | self._client.execute_cypher_write(query, QueryParams(dataset_name=self.dataset_name))
35 |
36 | def create_node_if_not_exists(self, node: GraphNode, properties: Set[AnyStr]): # TODO: define properties on the node entity itself?
37 | properties = properties.union(self.ADDITIONAL_NODE_PROPERTIES)
38 |
39 | query_params = self._get_properties_for_query(node, properties)
40 |
41 | create_query = CypherQuery(f"MERGE (n:{node.label_string} {query_params.query_string} )")
42 |
43 | result = self._client.add_to_batch(create_query, query_params)
44 | # TODO: check that result wasn't an error
45 |
46 | print("merged node", query_params._params, result)
47 |
48 | def create_edge_if_not_exists(self, edge: GraphEdge, properties: Set[AnyStr]):
49 | _from = edge._from
50 | _to = edge._to
51 |
52 | query_params = self._get_properties_for_query(edge, properties)
53 |
54 | match = f"MATCH (from:{_from.label_string} {{ id: $from_id }}), (to:{_to.label_string} {{ id: $to_id }})"
55 | merge = f"MERGE (from)-[r:{edge.relationship} {query_params.query_string} ]->(to)"
56 |
57 | create_query = CypherQuery(match + "\n" + merge)
58 | query_params = query_params.union(QueryParams(from_id=str(_from.id.value), to_id=str(_to.id.value)))
59 |
60 | result = self._client.add_to_batch(create_query, query_params)
61 | print("merged edge", query_params._params, result)
62 |
63 | def _get_properties_for_query(self, node, properties, prefix=None):
64 | properties.add('is_golden')
65 |
66 | properties_dict = {
67 | name if not prefix else f"{prefix}_{name}": getattr(node, name) for name in properties if name not in self.properties_to_ignore
68 | }
69 |
70 | query_params = QueryParams(dataset_name=self.dataset_name, **properties_dict)
71 | return query_params
72 |
--------------------------------------------------------------------------------
/experiment/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .experiment import Experiment
3 | from .experiment_header import ExperimentHeader
4 | from .directory import directory
--------------------------------------------------------------------------------
/experiment/arguments.py:
--------------------------------------------------------------------------------
1 |
2 | import argparse
3 |
4 | from .directory import directory, default_experiment
5 |
6 | class Arguments(object):
7 | def parse():
8 |
9 | parser = argparse.ArgumentParser()
10 |
11 | parser.add_argument('--experiment', type=str, default=default_experiment, choices=directory.keys())
12 | parser.add_argument('--dataset-name', type=str, default=None)
13 |
14 |
15 | parser.add_argument('--batch_size', type=int, default=32)
16 | parser.add_argument('--epochs', type=int, default=None)
17 | parser.add_argument('--random-seed', type=int, default=None)
18 | parser.add_argument('--verbose', type=int, default=1)
19 |
20 | parser.add_argument('--golden', action='store_true')
21 | parser.add_argument('--not-lazy', dest='lazy', action='store_false')
22 | parser.add_argument('--no-say', dest='say_result', action='store_false')
23 | parser.add_argument('--load-weights', action='store_true')
24 | parser.add_argument('--print-weights', action='store_true')
25 | parser.add_argument('--custom-test', action='store_true')
26 |
27 | parser.add_argument('--output-dir', type=str, default="./output")
28 | parser.add_argument('--data-dir', type=str, default="./data")
29 |
30 | return parser.parse_args()
31 |
--------------------------------------------------------------------------------
/experiment/directory.py:
--------------------------------------------------------------------------------
1 | from data_sets import *
2 | from basic_types import NanoType
3 |
4 | from .experiment_header import ExperimentHeader
5 |
6 | shared_query = {
7 | "product_and_product_subgraph": """
8 | MATCH p=
9 | (a:PERSON {is_golden:{golden}, dataset_name:{dataset_name}})
10 | -[:WROTE {is_golden:{golden}, dataset_name:{dataset_name}}]->
11 | (b:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}})
12 | -[:OF {is_golden:{golden}, dataset_name:{dataset_name}}]->
13 | (product:PRODUCT {is_golden:{golden}, dataset_name:{dataset_name}})
14 |
15 | WITH
16 | product,
17 | COLLECT(p) as neighbors
18 |
19 | RETURN
20 | product,
21 | neighbors
22 |
23 | """
24 |
25 | }
26 |
27 | directory = {
28 | "review_from_visible_style": ExperimentHeader(
29 | """
30 | A simple baseline experiment.
31 |
32 | From a person's style preference and a product's style, predict review score.
33 |
34 | review_score = dot(style_preference, product_style)
35 | """,
36 | EXPERIMENT_2_DATASET,
37 | """MATCH p=
38 | (a:PERSON {is_golden:{golden}, dataset_name:{dataset_name}})
39 | -[:WROTE {is_golden:{golden}, dataset_name:{dataset_name}}]->
40 | (b:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}})
41 | -[:OF {is_golden:{golden}, dataset_name:{dataset_name}}]->
42 | (c:PRODUCT {is_golden:{golden}, dataset_name:{dataset_name}})
43 | RETURN a.style_preference AS style_preference, c.style AS style, b.score AS score
44 | """,
45 | float
46 | ),
47 |
48 |
49 | "review_from_hidden_style_neighbor_conv": ExperimentHeader(
50 | """
51 | A simple experiment requiring the ML system to aggregate information from a sub-graph
52 |
53 | Predict a person's score for a product, given a person's style preference and the product
54 |
55 | This needs to be able to take in the review graph for a product
56 | and infer the product's style based on the style_preference and scores other people gave the product.
57 |
58 | Plan for the network (assume one-hot encoding for categorical variables):
59 |
60 | For a product (product):
61 | For a person (person):
62 |
63 | - get array of N other people's reviews: [other_person.style_preference, score] x N
64 | - Apply 1d_convolution output: [product_style] x N
65 | - Apply average across N, output: [product_style]
66 | - Apply softmax, output: [product_style]
67 | - Concat with person, output: [product_style, person.style_preference]
68 | - Apply dense layer, activation sigmoid, output: [score]
69 |
70 | - Train that!
71 |
72 | """,
73 | EXPERIMENT_2_DATASET,
74 | """
75 | MATCH (a:PERSON)
76 | -[e1:WROTE ]->
77 | (b:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}})
78 | -[e2:OF ]->
79 | (c:PRODUCT),
80 | others=
81 | (other_person:PERSON {is_golden:{golden}, dataset_name:{dataset_name}})
82 | -[:WROTE {is_golden:{golden}, dataset_name:{dataset_name}}]->
83 | (other_review:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}})
84 | -[:OF {is_golden:{golden}, dataset_name:{dataset_name}}]->
85 | (c)
86 | WHERE other_person<>a AND other_review<>b
87 | WITH
88 | a,b,c,
89 | e1,e2,
90 | COLLECT(others) as neighbors
91 | WHERE a.dataset_name={dataset_name} AND a.is_golden={golden}
92 | AND b.dataset_name={dataset_name} AND b.is_golden={golden}
93 | AND c.dataset_name={dataset_name} AND c.is_golden={golden}
94 | AND e1.dataset_name={dataset_name} AND e1.is_golden={golden}
95 | AND e2.dataset_name={dataset_name} AND e2.is_golden={golden}
96 | RETURN
97 | a.style_preference AS style_preference,
98 | b.score AS score,
99 | neighbors
100 |
101 | """,
102 | float
103 | ),
104 |
105 | "review_from_all_hidden_simple_unroll": ExperimentHeader(
106 | """
107 | # Objective
108 |
109 | Learn a function `score(input_person, input_product)` that gives a product review
110 | given a person and a product.
111 |
112 | ## Input format
113 |
114 | People, reviews and products are essentially anonymous and defined by their relationship
115 | to each-other.
116 |
117 | Our network needs to take in a portion of the graph then output the predicted score.
118 |
119 | The graph is transformed and formatted in a consistent fashion, allowing the network
120 | to understand which person and product is being input.
121 |
122 | # Solution
123 |
124 | Allow the network to find look-a-likes by generating an array of person-product-person-product-person chains.
125 |
126 | E.g. if me and my lookalike both liked product X, then we'll agree on product Y.
127 |
128 | This has a limitation: it can only successfully predict a score if there happens to be someone
129 | with the same style_preference who has reviewed a product you have also reviewed.
130 |
131 | """,
132 | EXPERIMENT_4_DATASET,
133 | """
134 | MATCH g=(input_person:PERSON)
135 | -[:WROTE]->
136 | (target_review:REVIEW {dataset_name:{dataset_name}})
137 | -[:OF]->
138 | (input_product:PRODUCT)
139 | <-[:OF]-
140 | (review1:REVIEW)
141 | <-[:WROTE]-
142 | (person2:PERSON)
143 | -[:WROTE]->
144 | (review2:REVIEW)
145 | -[:OF]->
146 | (product2:PRODUCT)
147 | <-[:OF]-
148 | (review3:REVIEW)
149 | <-[:WROTE]-
150 | (input_person)
151 |
152 | WHERE
153 | input_person<>person2
154 | AND input_product<>product2
155 |
156 | RETURN
157 | target_review.score as score,
158 | COLLECT([1.0, review1.score, review2.score, review3.score])[0..50] as neighbors,
159 |
160 | // These two need to be here otherwise the query implicitly groups by score
161 | input_product.id,
162 | input_person.id
163 |
164 | """,
165 | float,
166 | {
167 | "neighbor_count":50
168 | }
169 | ),
170 |
171 | "review_from_all_hidden_random_walks": ExperimentHeader(
172 | """
173 | Let's try to do a RNN that operates on pieces of the graph
174 | Generate random walks.
175 |
176 | This is a great problem because it requires the network to find a specific
177 | shape of subgraph in order to answer the question.
178 |
179 | It needs to find a loop, with 1s on the review scores, like such:
180 |
181 | (REVIEW=1) --> (PRODUCT) <-- (REVIEW=1) <-- (PERSON_B)
182 | ↑ |
183 | | ↓
184 | (PERSON_A) --> (THE_REVIEW) --> (PRODUCT) <-- (REVIEW=1)
185 |
186 |
187 | # Idea
188 |
189 | What if the parameters define a shape the network wants to look for?
190 |
191 | That's the solution to this problem and could be useful for other problems,
192 | particularly since the magic of neural networks lets you define a noise-resilient
193 | function, and an ensemble of shapes.
194 |
195 | Let:
196 |
197 | const string_length = 9
198 | pattern:List[part] = |----|-----|-----|----| ==> Convolve 1D with path
199 | path:List[part] = (a)-->(b)-->(c)-->(d)
200 | part = (type, parameter_values, is_target) | Loop | None
201 | target_type = "REVIEW"
202 |
203 | ## Algorithm
204 |
205 | 1) For each node of type=target_type:
206 | 1.a) Generate all paths s.t. |path| <= string_length
207 | 1.b) If a path is cyclic it should have a 'Loop' element after the nodes
208 | 2) Feed to network ([path, ..., path], target_review_score)
209 | 3) Network performs 1D convolution of each path with pattern kernel (The overflow of the kernel should wrap around the input path)
210 | 4) Network performs a 1D convolution on those outputs
211 | 5) Network sums those values
212 | 6) Network applies a dense layer, thus outputting y_prediction
213 |
214 |
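    | A minimal numpy sketch of the circular convolution in step 3 (illustrative only;
    | the Keras model in model.py approximates this with a Conv3D over patches):
    |
    | ```python
    | import numpy as np
    |
    | def circular_conv(path, pattern):
    |     # path: (length, width) node features, pattern: (k, width) kernel;
    |     # the kernel's overflow wraps around to the start of the path
    |     k = pattern.shape[0]
    |     wrapped = np.concatenate([path, path[:k - 1]], axis=0)
    |     return np.array([np.sum(wrapped[i:i + k] * pattern) for i in range(len(path))])
    | ```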
215 | """,
216 | EXPERIMENT_4_DATASET,
217 | """
218 | MATCH p=
219 | (review:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}})
220 | -[*8]-
221 | (otherB)
222 | WHERE review.id={id}
223 | WITH
224 | review,
225 | COLLECT(p)[0..600] as neighbors
226 | RETURN
227 | review,
228 | neighbors
229 | """,
230 | float,
231 | {
232 | "generate_address": False,
233 | "target_dropout": 0.0,
234 | "memory_size": 1000,
235 | "word_size": 4,
236 | "sequence_size": 600,
237 | "patch_width": 7,
238 | "patch_size": 20,
239 | "epochs": 20,
240 | "repeat_batch": 1,
241 | "working_width": 64,
242 | "id_limit": 32 * 10
243 | },
244 | ["id_limit"]
245 | ),
246 |
247 | "review_from_all_hidden_adj": ExperimentHeader(
248 | """
249 | Try the following:
250 | - variable pr represents PRODUCT style vectors
251 | - variable pe represents PERSON preference vectors
252 | - x = adj matrix of PRODUCT-REVIEW-PERSON
253 | - y = adj matrix of same with REVIEW.score as the weights
254 | - Use optimizer to optimize the style/pref vectors such that: Dot(MatMul(pr, T(pe)), x) = y
255 |
256 |
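    | A minimal numpy sketch of that objective (illustrative only; the real model wraps
    | this in the Adjacency layer and lets Keras optimise pr and pe):
    |
    | ```python
    | import numpy as np
    |
    | def objective(pr, pe, x, y):
    |     # pr: (product_count, style_width), pe: (person_count, style_width)
    |     # x:  (product_count, person_count) 0/1 adjacency, y: same shape with scores
    |     predicted = (pr @ pe.T) * x   # only keep entries where a review actually exists
    |     return np.mean((predicted - y) ** 2)
    | ```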
257 | """,
258 | EXPERIMENT_5_DATASET,
259 | """
260 | MATCH p=
261 | (person:PERSON) -->
262 | (review:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}}) -->
263 | (product:PRODUCT)
264 | RETURN
265 | person.id as person_id, review.score as score, product.id as product_id
266 | """,
267 | "adj_equals",
268 | {
269 | "product_count": 160, # total 160
270 | "person_count": 1200, # total 1200
271 | "style_width": 12,
272 | "epochs": 10000,
273 | "batch_per_epoch": 10
274 | }
275 | ),
276 |
277 | "style_from_neighbor_conv": ExperimentHeader(
278 | """
279 | A precursor to review_from_hidden_style_neighbor_conv
280 |
281 | This experiment seeks to see if we can efficiently determine a product's style
282 | given its set of reviews and the style_preference of each reviewer.
283 |
284 | This should be easy!!
285 |
286 | """,
287 | EXPERIMENT_2_DATASET,
288 | shared_query["product_and_product_subgraph"],
289 | list,
290 | ),
291 |
292 | "style_from_neighbor_rnn": ExperimentHeader(
293 | """ The same as style_from_neighbor_conv but using an RNN instead of convolution """,
294 | EXPERIMENT_2_DATASET,
295 | shared_query["product_and_product_subgraph"],
296 | list
297 | )
298 |
299 | }
300 |
301 | default_experiment = "review_from_all_hidden_adj"
302 |
303 |
304 |
--------------------------------------------------------------------------------
/experiment/experiment.py:
--------------------------------------------------------------------------------
1 |
2 | from datetime import datetime
3 | from colorama import init, Fore, Style
4 | import logging
5 | import coloredlogs
6 | import colored_traceback.auto
7 | import os
8 |
9 | from graph_ml import Train, Dataset
10 | from .arguments import Arguments
11 | from .directory import directory
12 |
13 |
14 | init()
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | class Experiment(object):
19 | def __init__(self, name, header, params):
20 | self.name = name
21 | self.header = header
22 | self.params = params
23 | self.run_tag = str(datetime.now())
24 |
25 | @classmethod
26 | def run(cls):
27 |
28 | params = Arguments.parse()
29 |
30 | if params.verbose > 0:
31 | coloredlogs.install(level='INFO', logger=logging.getLogger("experiment"))
32 | coloredlogs.install(level='INFO', logger=logging.getLogger("graph_ml"))
33 | coloredlogs.install(level='INFO', logger=logging.getLogger("graph_io"))
34 |
35 | experiment = Experiment(params.experiment, directory[params.experiment], params)
36 |
37 | print(Fore.GREEN)
38 | print("#######################################################################")
39 | print(f"📟 Running experiment {experiment.name} {experiment.run_tag}")
40 | print("#######################################################################")
41 | print(Style.RESET_ALL)
42 |
43 | dataset = Dataset.get(experiment)
44 | score = Train.run(experiment, dataset)
45 |
46 | print(Fore.YELLOW)
47 | print("#######################################################################")
48 | print("Experiment results")
49 | print(f"{experiment.name} test loss {round(score[0],6)}")
50 | print(f"{experiment.name} test accuracy {round(score[1]*100, 2)}%")
51 | print("#######################################################################")
52 | print(Style.RESET_ALL)
53 |
54 | # t = '-title {!r}'.format(title)
55 | # s = '-subtitle {!r}'.format(subtitle)
56 | # m = '-message {!r}'.format(message)
57 | os.system(f"terminal-notifier -message 'test accuracy {round(score[1]*100)}% loss {round(score[0],2)}' -title Octavian")
58 |
59 | if params.say_result:
60 | os.system(f"say test accuracy {round(score[1]*100)} percent")
61 |
62 |
--------------------------------------------------------------------------------
/experiment/experiment_header.py:
--------------------------------------------------------------------------------
1 |
2 | from typing import List
3 | from graph_io.classes import DatasetName
4 |
5 | class ExperimentHeader(object):
6 | def __init__(self, doc="", dataset_name: DatasetName=None, cypher_query=None, target=None, params=None, lazy_params:List[str]=None):
7 | # Default params/lazy_params to None rather than {} / [] so that the
8 | # mutable defaults are not shared between ExperimentHeader instances
9 | self.dataset_name = dataset_name
10 | self.doc = doc
11 | self.cypher_query = cypher_query
12 | self.target = target
13 | self.params = params if params is not None else {}
14 | self.lazy_params = lazy_params if lazy_params is not None else []
--------------------------------------------------------------------------------
/floyd_requirements.txt:
--------------------------------------------------------------------------------
1 | neo4j-driver
2 | lazy
3 | h5py
4 | colorama
5 | coloredlogs
6 | more-itertools
7 | git+git://github.com/datalogai/recurrentshop.git#egg=recurrentshop
8 | colored-traceback
9 | sklearn
10 | tqdm
--------------------------------------------------------------------------------
/graph_ml/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .train import Train
3 | from .dataset import Dataset
4 | from .ntm import NTMBase
5 |
--------------------------------------------------------------------------------
/graph_ml/adjacency_layer.py:
--------------------------------------------------------------------------------
1 | from keras import backend as K
2 | import tensorflow as tf
3 | from keras.engine.topology import Layer, Input
4 | from keras import regularizers, initializers, layers, activations
5 | from functools import partial
6 | import numpy as np
7 |
8 | class PD(regularizers.Regularizer):
9 | def __init__(self, a=0.0001, b=0.0, axis=-1):
10 | self.a = K.cast_to_floatx(a)
11 | self.b = K.cast_to_floatx(b)
12 |
13 | self.axis = axis
14 |
15 | def __call__(self, x):
16 | sum_to_one = K.abs(1.0 - K.sum(K.abs(x), axis=self.axis))
17 | different_by_one = K.abs(1.0 - K.abs(x[:,0] - x[:,1]))
18 | core = self.a * sum_to_one + self.b * different_by_one
19 |
20 | return K.sum(core)
21 |
22 | def get_config(self):
23 | return {'a': float(self.a), 'b': float(self.b), 'axis': self.axis}
24 |
25 |
26 | class Clip(regularizers.Regularizer):
27 | def __init__(self, max=1):
28 | self.max = max
29 |
30 | def __call__(self, x):
31 | return K.sum(K.relu(K.abs(x) - self.max))  # a regularizer must return a scalar penalty; penalise weights outside [-max, max]
32 |
33 | def get_config(self):
34 | return {'max': float(self.max)}
35 |
36 |
37 | class Adjacency(Layer):
38 |
39 | def __init__(self, person_count, product_count, style_width, **kwargs):
40 | self.person_count = person_count
41 | self.product_count = product_count
42 | self.style_width = style_width
43 | self.dense1 = layers.Dense(units=(style_width), activation=activations.softplus, use_bias=False, kernel_regularizer=Clip())
44 | #self.dense2 = layers.(units=(1), activation=activations.linear)
45 | self.dense3 = layers.Dense(units=1, activation=partial(activations.relu, alpha=0.1), use_bias=False, kernel_regularizer=Clip())
46 | super(Adjacency, self).__init__(**kwargs)
47 |
48 | def __call__(self, inputs, **kwargs):
49 | self.batch_size = inputs.shape[0]
50 | product_ct = inputs.shape[1]
51 | person_ct = inputs.shape[2]
52 | my_batch = product_ct * person_ct
53 |
54 | self.inner_input = Input(batch_shape=(product_ct, person_ct, 2, self.style_width), dtype='float32', name="inner_d0")
55 | self.reshaped_to_look_like_a_batch = K.reshape(self.inner_input, (product_ct * person_ct, 2 * self.style_width))
56 | self.dense1_called = self.dense1(self.reshaped_to_look_like_a_batch)
57 | #self.dense2_called = self.dense2(self.dense1_called)
58 | self.dense3_called = self.dense3(self.dense1_called)
59 | self.reshaped_to_look_like_adj_mat = K.reshape(self.dense3_called, (product_ct, person_ct, 1))
60 | return super(Adjacency, self).__call__(inputs, **kwargs)
61 |
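    | # Pairs every row of `a` with every row of `b`: the result has shape
    | # (|a|, |b|, 1, 2 * width), which call_dense later flattens into one row
    | # per (product, person) pair.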
62 | def cartesian_product_matrix(self, a, b):
63 | tile_a = tf.tile(tf.expand_dims(a, 1), [1, tf.shape(b)[0], 1])
64 | tile_a = tf.expand_dims(tile_a, 2)
65 |
66 | tile_b = tf.tile(tf.expand_dims(b, 0), [tf.shape(a)[0], 1, 1])
67 | tile_b = tf.expand_dims(tile_b, 2)
68 |
69 | cartesian_product = tf.concat([tile_a, tile_b], axis=-1)
70 |
71 | return cartesian_product
72 |
73 |
74 |
75 | def build(self, input_shape):
76 | # Create a trainable weight variable for this layer.
77 | self.person = self.add_weight(name='people',
78 | shape=(self.person_count, self.style_width),
79 | initializer='uniform',
80 | # initializer='ones',
81 | # regularizer=PD(),
82 | trainable=True)
83 |
84 | self.product = self.add_weight(name='product',
85 | shape=(self.product_count, self.style_width),
86 | initializer='uniform',
87 | # initializer='ones',
88 | # regularizer=PD(),
89 | trainable=True)
90 |
91 |
92 | # self.wc1 = self.add_weight(name='w1',
93 | # shape=(2, 1),
94 | # initializer='glorot_uniform',
95 | # trainable=True)
96 |
97 | # self.b1 = self.add_weight(name='b1',
98 | # shape=(1, ),
99 | # initializer='zero',
100 | # trainable=True)
101 |
102 | self.w1 = self.add_weight(name='w1',
103 | shape=(2 * self.style_width,
104 | self.style_width),
105 | initializer='glorot_uniform',
106 | trainable=True)
107 |
108 | # self.b1 = self.add_weight(name='b1',
109 | # shape=(self.style_width, ),
110 | # initializer='zero',
111 | # trainable=True)
112 |
113 | self.w2 = self.add_weight(name='w2',
114 | shape=(self.style_width, 1),
115 | initializer='glorot_uniform',
116 | trainable=True)
117 |
118 | # self.b2 = self.add_weight(name='b2',
119 | # shape=(1, ),
120 | # initializer='zero',
121 | # trainable=True)
122 |
123 |
124 | # self.b3 = self.add_weight(name='b2',
125 | # shape=(1,),
126 | # initializer='zero',
127 | # trainable=True)
128 |
129 | # self.w3 = self.add_weight(name='m2',
130 | # shape=(1,),
131 | # initializer='one',
132 | # trainable=True)
133 |
134 |
135 | super(Adjacency, self).build(input_shape) # Be sure to call this somewhere!
136 |
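    | # Adds Gaussian noise to the person/product embedding weights between calls,
    | # presumably to help the optimiser escape local minima (used by call_dense).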
137 | def jitter(self, idx=[0,1], var=0.2):
138 | wts = self.get_weights()
139 |
140 | for i in idx:
141 | wts[i] += np.random.normal(0, var, wts[i].shape)
142 |
143 | self.set_weights(wts)
144 |
145 | def call(self, x):
146 | return self.call_dense(x)
147 |
148 | # 100pc test accuracy (note: requires the w3/b3 weights currently commented out in build())
149 | def call_dot_softmax(self, x):
150 | pr = self.product
151 | pe = self.person
152 |
153 | pr = K.softmax(self.product)
154 | pe = K.softmax(self.person)
155 |
156 | m = K.dot(pr, K.transpose(pe))
157 | m = (self.w3 * m) + self.b3
158 | m = K.relu(m, alpha=0.1)
159 |
160 | m = m * x
161 |
162 | return m
163 |
164 | # 100pc test accuracy
165 | def call_dot(self, x):
166 | pr = self.product
167 | pe = self.person
168 |
169 | m = K.dot(pr, K.transpose(pe))
170 | m = m * x
171 |
172 | return m
173 |
174 | # Seen at 68% 1-accuracy test
175 | def call_dense(self, x):
176 | self.jitter(idx=[0,1], var=0.1)
177 |
178 | pr = self.product
179 | pe = self.person
180 |
181 | pr = K.softmax(pr)
182 | pe = K.softmax(pe)
183 |
184 | all_pairs = self.cartesian_product_matrix(pr, pe)
185 | flat = K.reshape(all_pairs, (self.product_count * self.person_count, self.style_width * 2))
186 |
187 | m = K.dot(flat, self.w1)
188 | # m = K.bias_add(m, self.b1)
189 | m = K.relu(m, alpha=0.1)
190 |
191 | m = K.dropout(m, level=0.1)
192 |
193 | m = K.dot(m, self.w2)
194 | m = K.relu(m, alpha=0.1)
195 |
196 | m = K.reshape(m, (1, self.product_count, self.person_count))
197 | masked = m * x
198 | return masked
199 |
200 |
201 |
202 | # 100pc test accuracy (note: requires the wc1 weight currently commented out in build())
203 | def call_dense_conv(self, x):
204 | self.jitter(idx=[0,1])
205 |
206 | pr = self.product
207 | pe = self.person
208 |
209 | pr = K.softmax(pr)
210 | pe = K.softmax(pe)
211 |
212 | all_pairs = self.cartesian_product_matrix(pr, pe)
213 |
214 | flat = K.reshape(all_pairs, (self.product_count * self.person_count * self.style_width, 2))
215 | m = K.dot(flat, self.wc1)
216 | m = K.tanh(m)
217 |
218 | m = K.reshape(m, (self.product_count * self.person_count, self.style_width))
219 | m = K.dot(m, self.w2)
220 | m = K.relu(m, alpha=0.1)
221 |
222 | m = K.reshape(m, (1, self.product_count, self.person_count))
223 | masked = m * x
224 | return masked
225 |
226 |
227 |
228 | def compute_output_shape(self, input_shape):
229 | return input_shape
230 |
231 |
232 |
--------------------------------------------------------------------------------
/graph_ml/dataset.py:
--------------------------------------------------------------------------------
1 |
2 | from collections import Counter, namedtuple
3 | import random
4 | import pickle
5 | import os.path
6 | import hashlib
7 | import neo4j
8 | import math
9 | from typing import Callable, Generator, Tuple
10 | import logging
11 | import itertools
12 | from itertools import cycle
13 | import more_itertools
14 | from more_itertools import peekable
15 |
16 | import keras
17 | import numpy as np
18 | from keras.preprocessing import text
19 | from keras.utils import np_utils
20 |
21 | from .path import generate_output_path, generate_data_path
22 | from graph_io import *
23 | # from experiment import Experiment
24 | from .util import *
25 | from .dataset_helpers import *
26 |
27 | logger = logging.getLogger(__name__)
28 |
29 |
30 | class Dataset(object):
31 |
32 |
33 | # Applies a per-experiment recipe to Neo4j to get a dataset to train on
34 | # This performs all transformations in-memory - it is not very efficient
35 | @classmethod
36 | def get(cls, experiment):
37 |
38 | # TODO: delete this
39 | legacy_recipes = {
40 | 'review_from_visible_style': Recipe(
41 | split=lambda row: Point(np.concatenate((row['style_preference'], row['style'])), row['score'])
42 | ),
43 | 'review_from_hidden_style_neighbor_conv': Recipe(
44 | split=DatasetHelpers.review_from_hidden_style_neighbor_conv(100),
45 | finalize_x=lambda x: {'person':np.array([i['person'] for i in x]), 'neighbors': np.array([i['neighbors'] for i in x])}
46 | ),
47 | 'style_from_neighbor_conv': Recipe(
48 | split=DatasetHelpers.style_from_neighbor(100)
49 | ),
50 | 'style_from_neighbor_rnn': Recipe(
51 | split=DatasetHelpers.style_from_neighbor(100)
52 | )
53 | }
54 |
55 | try:
56 | recipe = legacy_recipes[experiment.name]
57 | except KeyError:
58 | # TODO: move all to this pattern
59 | recipe = getattr(DatasetHelpers, experiment.name)(experiment)
60 |
61 |
62 | return Dataset(experiment, recipe)
63 |
64 |
65 |
66 | # Split data into test/train set, organise it into a class
67 | def __init__(self, experiment, recipe):
68 |
69 | self.experiment = experiment
70 | self.recipe = recipe
71 |
72 | if experiment.params.random_seed is not None:
73 | random.seed(experiment.params.random_seed)
74 |
75 | if experiment.params.dataset_name is not None:
76 | dataset_name = experiment.params.dataset_name
77 | else:
78 | dataset_name = experiment.header.dataset_name
79 |
80 | query_params = QueryParams(
81 | golden=experiment.params.golden,
82 | dataset_name=dataset_name,
83 | experiment=experiment.name)
84 |
85 | query_params.update(QueryParams(**experiment.header.params))
86 |
87 | # Calculate params for lazy data loading
88 | data_path_params = {i:query_params[i] for i in experiment.header.lazy_params}
89 | data_path_params["dataset_name"] = dataset_name
90 |
91 | dataset_file = generate_data_path(experiment, '.pkl', data_path_params)
92 | logger.info(f"Dataset file {dataset_file}")
93 |
94 | if os.path.isfile(dataset_file) and experiment.params.lazy:
95 | logger.info(f"Opening dataset pickle {dataset_file}")
96 | data = pickle.load(open(dataset_file, "rb"))
97 |
98 | else:
99 | logger.info("Querying data from database")
100 | with SimpleNodeClient() as client:
101 | cq = CypherQuery(experiment.header.cypher_query)
102 | data = recipe.query(client, cq, query_params)
103 |
104 | # Later shift to query-on-demand
105 | data = list(data)
106 | pickle.dump(data, open(dataset_file, "wb"))
107 |
108 | # We need to know the total length of the data, so for ease it's turned into a list here.
109 | # I've used generators everywhere, so if it wasn't for Keras, this would
110 | # be memory efficient
111 |
112 | logger.info(f"Rows returned by Neo4j {len(data)}")
113 | list_data = list(recipe.transform(data))
114 | total_data = len(list_data)
115 | logger.info(f"Number of rows of data: {total_data}")
116 |
117 |
118 | def repeat_infinitely(gen_fn):
119 | while True:
120 | for x in gen_fn():
121 | yield x
122 | stream = repeat_infinitely(lambda: recipe.partition(recipe.transform(data)))
123 |
124 | def just(tag):
125 | return ( (i[1].x, i[1].y) for i in stream if i[0] == tag)
126 |
127 | def chunk(it, length):
128 | chunky = more_itertools.chunked(it, length)
129 | for i in chunky:
130 | xs = np.array([j[0] for j in i])
131 | ys = np.array([j[1] for j in i])
132 | yield (xs, ys)
133 |
134 |
135 | bs = experiment.params.batch_size
136 |
137 | self.train_generator = peekable(chunk(just("train"), bs))
138 | self.validation_generator = peekable(chunk(just("validate"), bs))
139 | self.test_generator = peekable(chunk(just("test"), bs))
140 |
141 | self.generator = {
142 | "test": self.test_generator,
143 | "train": self.train_generator,
144 | "validate": self.validation_generator
145 | }
146 |
147 | f = self.train_generator.peek()
148 | # logger.info(f"First training item: x:{f[0].shape}, y:{f[1].shape}")
149 |
150 | # These are not exact counts since the data is randomly split at generation time
151 | self.validation_steps = math.ceil(total_data * 0.1 / experiment.params.batch_size)
152 | self.test_steps = math.ceil(total_data * 0.1 / experiment.params.batch_size)
153 | self.steps_per_epoch = math.ceil(total_data * 0.8 / experiment.params.batch_size) * int(experiment.header.params.get('repeat_batch', 1))
154 |
155 | self.input_shape = self.train_generator.peek()[0][0].shape
156 |
157 |
158 |
159 |
160 |
161 |
--------------------------------------------------------------------------------
/graph_ml/dataset_helpers.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from collections import Counter, namedtuple
4 | import random
5 | import pickle
6 | import os.path
7 | import hashlib
8 | import neo4j
9 | import math
10 | from typing import Callable, Generator, Tuple
11 | import logging
12 | import itertools
13 | from itertools import cycle
14 | import more_itertools
15 | from more_itertools import peekable
16 |
17 | import keras
18 | import numpy as np
19 | from keras.preprocessing import text
20 | from keras.utils import np_utils
21 |
22 | from .path import generate_output_path, generate_data_path
23 | from graph_io import *
24 | # from experiment import Experiment
25 | from .util import *
26 |
27 | logger = logging.getLogger(__name__)
28 |
29 |
30 |
31 | class Point(object):
32 | def __init__(self, x, y):
33 | self.x = x
34 | self.y = y
35 |
36 | # This is weird, I know, re-write later when I'm making this more efficient
37 | def append(self, point):
38 | self.x.append(point.x)
39 | self.y.append(point.y)
40 |
41 | def __str__(self):
42 | return "{x:\n" + str(self.x) + ",\ny:\n" + str(self.y) + "}"
43 |
44 | def __repr__(self):
45 | return self.__str__()
46 |
47 |
48 | def noop():
49 | pass
50 |
51 | RecordGenerator = Generator[neo4j.v1.Record, None, None]
52 | PointGenerator = Generator[Point, None, None]
53 |
54 | class Recipe:
55 | def __init__(self,
56 | transform:Callable[[RecordGenerator], PointGenerator] = None,
57 | query:Callable[[], RecordGenerator] = None,
58 | partition:Callable[[PointGenerator], Generator[Tuple[str, Point], None, None]] = None,
59 | split:Callable[[neo4j.v1.Record], Point] = None,
60 | finalize_x = None):
61 |
62 | self.transform = transform
63 | self.query = query
64 | self.partition = partition
65 |
66 | # TODO: migrate older experiments
67 | if transform is None:
68 | def legacy_transform(rows):
69 | for i in rows:
70 | p = split(i)
71 | p.x = finalize_x(p.x) if finalize_x else p.x
72 | yield p
73 | self.transform = legacy_transform
74 |
75 | if query is None:
76 | def default_query(client, cypher_query, query_params):
77 | return client.execute_cypher(cypher_query, query_params)
78 |
79 | self.query = default_query
80 |
81 | if partition is None:
82 | def default_partition(data):
83 | data = list(data); random.shuffle(data)  # materialise the (possibly generator) stream so it can be shuffled
84 | c = 0
85 | for i in data:
86 |
87 | if c == 9:
88 | l = "test"
89 | elif c == 8:
90 | l = "validate"
91 | else:
92 | l = "train"
93 |
94 | c = (c + 1) % 10
95 |
96 | yield (l, i)
97 | self.partition = default_partition
98 |
99 |
100 | class DatasetHelpers(object):
101 |
102 | @staticmethod
103 | def ensure_length(arr, length):
104 | delta = length - arr.shape[0]
105 | if delta > 0:
106 | pad_shape = ((0,delta),)
107 | for i in range(len(arr.shape)-1):
108 | pad_shape += ((0, 0),)
109 | arr = np.pad(arr, pad_shape, 'constant', constant_values=0.0)
110 | elif delta < 0:
111 | arr = arr[:length]
112 |
113 | assert len(arr) == length, f"ensure_length failed to resize, {len(arr)} != {length}"
114 |
115 | return arr
116 |
117 | @classmethod
118 | def path_map_style_preference_score(cls, path):
119 | other_person = path.nodes[0]
120 | other_review = path.nodes[1]
121 | return np.concatenate((
122 | np.array(other_person.properties['style_preference']),
123 | [other_review.properties['score']]
124 | ))
125 |
126 | # Turn neighbors sub-graph into a sampled array of neighbours
127 | # @argument length  Size of the returned array; use None for variable length. If a fixed length is requested, the first column of each feature row is a 0.0/1.0 flag marking whether that row holds data or zero padding
128 | @classmethod
129 | def collect_neighbors(cls, row, key, path_map, length:int):
130 | subrows = []
131 | for path in row[key]:
132 | subrows.append(path_map(path))
133 |
134 | # Lets always shuffle to keep the network on its toes
135 | # If you use --random-seed you'll fix this to be the same each run
136 | np.random.shuffle(subrows)
137 |
138 | if length is not None:
139 | if len(subrows) > length:
140 | subrows = subrows[:length]
141 |
142 | subrows = np.pad(subrows, ((0,0), (1,0)), 'constant', constant_values=1.0) # add 'none' flag
143 |
144 | # pad out if too small
145 | # note if there are zero subrows, this won't know the width to make the zeros, so it'll be 1 wide and broadcast later
146 | if len(subrows) < length:
147 | delta = length - subrows.shape[0]
148 | subrows = np.pad(subrows, ((0,delta), (0, 0)), 'constant', constant_values=0.0)
149 |
150 | return subrows
151 |
152 |
153 | @classmethod
154 | def review_from_hidden_style_neighbor_conv(cls, length):
155 | def transform_row(row):
156 | neighbors = cls.collect_neighbors(row, 'neighbors', cls.path_map_style_preference_score, length)
157 | return Point({'person': np.array(row["style_preference"]), 'neighbors':neighbors}, row["score"])
158 | return transform_row
159 |
160 |
161 | @classmethod
162 | def style_from_neighbor(cls, length):
163 | # Python lambdas are limited to a single expression, so we use a
164 | # named inner function as the per-row transform and close over
165 | # `length` instead of an anonymous multi-line function
166 | def transform_row(row):
167 | neighbors = cls.collect_neighbors(row, 'neighbors', cls.path_map_style_preference_score, length)
168 | return Point(neighbors, row["product"].properties["style"])
169 | return transform_row
170 |
171 |
172 | @classmethod
173 | def review_from_all_hidden_simple_unroll(cls, experiment):
174 | def t(row):
175 | length = experiment.header.params["neighbor_count"]
176 | neighbors = np.array(row["neighbors"])
177 | delta = length - neighbors.shape[0]
178 |
179 | if delta > 0:
180 | neighbors = np.pad(neighbors, ((0,delta), (0, 0)), 'constant', constant_values=0.0)
181 |
182 | return Point(neighbors, row["score"])
183 |
184 | return Recipe(t)
185 |
186 | @staticmethod
187 | def review_from_all_hidden_random_walks(experiment):
188 |
189 | encode_label = {
190 | "NODE": [1,0,0,0,0],
191 | "PERSON": [0,1,0,0,0],
192 | "REVIEW": [0,0,1,0,0],
193 | "PRODUCT": [0,0,0,1,0],
194 | "LOOP": [0,0,0,0,1]
195 | }
196 |
197 | FakeNode = namedtuple('FakeNode', ['id', 'properties', 'labels'])
198 | loop_node = FakeNode(None, {}, set(['NODE', 'LOOP']))
199 |
200 | def extract_label(l):
201 | return encode_label.get(next(iter(set(l) - {'NODE'}), 'NODE'), encode_label["NODE"])  # strip the generic NODE label; fall back to its encoding
202 |
203 | node_id_dict = {}
204 |
205 | def node_id_to_memory_addr(nid):
206 |
207 | if nid not in node_id_dict:
208 | node_id_dict[nid] = len(node_id_dict) % experiment.header.params['memory_size']
209 |
210 | return node_id_dict[nid]
211 |
212 | def package_node(n, is_target=False):
213 | ms = experiment.header.params['memory_size']
214 |
215 | if experiment.header.params["generate_address"]:
216 | address_trunc = node_id_to_memory_addr(n.id)
217 | address_one_hot = np.zeros(ms)
218 | address_one_hot[address_trunc] = 1.0
219 | else:
220 | address_one_hot = np.array([])
221 |
222 | label = extract_label(n.labels)
223 | score = n.properties.get("score", -1.0)
224 |
225 | if random.random() < experiment.header.params["target_dropout"] or is_target:
226 | score = -1.0
227 |
228 | x = np.concatenate(([score, float(is_target)], label, address_one_hot))
229 |
230 | return x
231 |
232 |
233 | def path_to_patch(node, path):
234 | ps = np.array([package_node(i, i.id == node.id) for i in path.nodes])
235 |
236 | if path.nodes[0].id == path.nodes[-1].id:
237 | print("outputting loop_node for ", path.nodes[0].id, [i.id for i in path.nodes])
238 | l = np.array([package_node(loop_node, False)])
239 | ps = np.append(ps, l, axis=0)  # np.append returns a new array, so keep the result
240 |
241 | ps = np.repeat(ps, 2, axis=0)
242 |
243 | patch_size = experiment.header.params["patch_size"]
244 | ps = DatasetHelpers.ensure_length(ps, patch_size)
245 | return ps
246 |
247 |
248 | def row_to_point(row):
249 | patch_size = experiment.header.params["patch_size"]
250 | seq_size = experiment.header.params["sequence_size"]
251 |
252 | neighbors = row["neighbors"]
253 | review = row["review"]
254 |
255 | x = np.array([path_to_patch(review, path) for path in neighbors])
256 | x = DatasetHelpers.ensure_length(x, seq_size)
257 | # x = np.repeat(x, 3, axis=0)
258 |
259 | y = row["review"].properties.get("score", -1.0)
260 | # y = np.repeat([y], seq_size)
261 | # y = np.expand_dims(y, axis=-1)
262 |
263 | target_shape = (seq_size, patch_size, experiment.header.params["patch_width"])
264 | assert x.shape == target_shape, f"{x.shape} != {target_shape}"
265 |
266 | return Point(x, y)
267 |
268 | def query(client, cypher_query, query_params):
269 | return client.execute_cypher_once_per_id(
270 | cypher_query,
271 | query_params,
272 | dataset_name=experiment.header.dataset_name,
273 | id_limit=experiment.header.params["id_limit"],
274 | id_type="REVIEW"
275 | )
276 |
277 | def balance_classes(stream):
278 | # ugh arch pain
279 | # instead pass in an arg that is a callable stream generator
280 |
281 | classes = [0.0, 1.0]
282 | last = [None, None]
283 |
284 | # Over-sample
285 | # This is imperfectly balanced as it cold-starts without last values
286 | for i in stream:
287 | for index, c in enumerate(classes):
288 | if np.array([i.y]).flatten()[0] == c:
289 | last[index] = i
290 | yield i
291 | elif last[index] is not None:
292 | yield last[index]
293 |
294 |
295 | def transform(stream):
296 | # y_count = Counter()
297 | # y_count[str(y)] += 1
298 | # print(f"Counter of y values: {[(i, y_count[i] / len(list(y_count.elements())) * 100.0) for i in y_count]}")
299 | stream = (row_to_point(row) for row in stream)
300 | stream = balance_classes(stream)
301 | return stream
302 |
303 | return Recipe(transform=transform,query=query)
304 |
305 | @staticmethod
306 | def review_from_all_hidden_adj(experiment) -> Recipe:
307 | bs = experiment.params.batch_size
308 | person_product = {}
309 |
310 | reviews_per_person = Counter()
311 | reviews_per_product = Counter()
312 |
313 | pr_c = experiment.header.params["product_count"]
314 | pe_c = experiment.header.params["person_count"]
315 |
316 | shape = (pr_c, pe_c)
317 | unmasked_products=np.zeros(shape=(pr_c,))
318 | unmasked_products[0] = 1
319 | unmasked_people=np.zeros(shape=(pe_c,))
320 | cache = []
321 | training_mask = np.zeros(shape)
322 | pause=[0]
323 | def gen_output(datas):
324 | for i in range(bs * experiment.header.params["batch_per_epoch"]):
325 | for partition, pt in datas.items():
326 | if partition=="train":
327 | pe_flag = False
328 | pr_flag = False
329 | if pause[0] > 48:
330 |
331 | def do_product():
332 | if not pr_flag:
333 | for x in range(pe_c):
334 | if unmasked_people[x] == 0 and any(pt.x[y][x] == 1 for y in range(pr_c) if unmasked_products[y] == 1):
335 | unmasked_people[x] = 1
336 | pe_flag = True
337 | break
338 |
339 | def do_person():
340 | if not pe_flag:
341 | for y in range(pr_c):
342 | if unmasked_products[y] == 0 and any(pt.x[y][x] == 1 for x in range(pe_c) if unmasked_people[x] == 1):
343 | unmasked_products[y] = 1
344 | pr_flag = True
345 | break
346 |
347 | if random.random() > 0.5:
348 | do_product()
349 | else:
350 | do_person()
351 |
352 | if not pr_flag and not pe_flag:
353 | for x in range(pe_c):
354 | if unmasked_people[x] == 0:
355 | unmasked_people[x] = 1
356 | pe_flag = True
357 | break
358 | if not pe_flag:
359 | for y in range(pr_c):
360 | if unmasked_products[y] == 0:
361 | unmasked_products[y] = 1
362 | pr_flag = True
363 | break
364 | for x in range(pe_c):
365 | #TODO this is like a np.cross or something
366 | for y in range(pr_c):
367 | if unmasked_people[x] * unmasked_products[y] == 1:
368 | training_mask[y][x] = 1
369 | if not pe_flag and not pr_flag:
370 | assert np.sum(training_mask) == pr_c * pe_c
371 | print('all data')
372 | pause[0] = 0
373 | pause[0]+=1
374 |
375 | pt = Point(np.where(training_mask, pt.x, 0), np.where(training_mask, pt.y, 0))
376 | #print(np.sum(pt.x))
377 | #print(np.sum(pt.y))
378 | yield (partition, pt)
379 | # yield Point(adj_con, adj_score)
380 |
381 | def transform(stream):
382 | if len(cache) == 1:
383 | return gen_output(cache[0])
384 |
385 | data = list(stream)
386 |
387 | products = set()
388 | people = set()
389 | # Construct adjacency dict
390 | for i in data:
391 | if i["person_id"] not in person_product:
392 | person_product[i["person_id"]] = {}
393 |
394 | if len(people) < pe_c or i["person_id"] in people:
395 | if len(products) < pr_c or i["product_id"] in products:
396 |
397 | person_product[i["person_id"]][i["product_id"]] = i["score"]
398 |
399 | reviews_per_person[i["person_id"]] += 1
400 | reviews_per_product[i["product_id"]] += 1
401 |
402 | products.add(i["product_id"])
403 | people.add(i["person_id"])
404 |
405 | def exists(person, product):
406 | return 1.0 if person in person_product and product in person_product[person] else 0.0
407 |
408 | def score(person, product):
409 | return person_product.get(person, {}).get(product, 0.0)
410 |
411 | ppe = list(dict(reviews_per_person).values())
412 | ppr = list(dict(reviews_per_product).values())
413 |
414 | #print("Reviews per product: ", np.histogram(ppe) )
415 | #print("Reviews per person: ", np.histogram(ppr) )
416 |
417 | #logger.info(f"People returned {len(people)} of capacity {pe_c}")
418 | #logger.info(f"Products returned {len(products)} of capacity {pr_c}")
419 |
420 | people = sorted(list(people))[:pe_c]
421 | products = sorted(list(products))[:pr_c]
422 |
423 | def build(fn):
424 | return DatasetHelpers.ensure_length(np.array([
425 | DatasetHelpers.ensure_length(
426 | np.array([fn(person, product) for person in people])
427 | , pe_c) for product in products
428 | ]), pr_c)
429 |
430 | adj_score = build(score)
431 | adj_con = build(exists)
432 |
433 | # print("Connections:",adj_con)
434 | # print("Scores:",adj_score)
435 |
436 | assert_mtx_shape(adj_score, shape, "adj_score")
437 | assert_mtx_shape(adj_con, shape)
438 |
439 | mask_seed = np.random.randint(10, size=shape)
440 | masks = {
441 | "test": np.equal(mask_seed, 0),
442 | "train": np.greater(mask_seed, 1),
443 | "validate": np.equal(mask_seed, 1),
444 | "all": Point(adj_con, adj_score)
445 | }
446 |
447 | def gen_d(mask):
448 | return Point(np.where(mask, adj_con, 0), np.where(mask, adj_score, 0))
449 |
450 | datas = {
451 | k: gen_d(v)
452 | for (k, v) in masks.items()
453 | }
454 |
455 | warm_up = False
456 |
457 |
458 | if warm_up:
459 | cache.append(datas)
460 | return gen_output(datas)
461 |
462 | else:
463 | for i in range(experiment.params.batch_size * experiment.header.params["batch_per_epoch"]):
464 | for partition, pt in datas.items():
465 | yield (partition, pt)
466 |
467 |
468 | return Recipe(transform=transform, partition=lambda x:x)
469 |
470 |
471 |
--------------------------------------------------------------------------------
/graph_ml/model.py:
--------------------------------------------------------------------------------
1 |
2 | import keras
3 | from keras.models import Sequential, Model
4 | from keras.layers import *
5 | import keras.backend as K
6 |
7 | import tensorflow as tf
8 |
9 | from .ntm import *
10 | from .adjacency_layer import Adjacency
11 |
12 |
13 | # Rainbow sprinkles for your activation function
14 | # Try to use all activation functions
15 | # @argument m: (?,N) tensor
16 | # @returns (?,N*5) tensor
17 | def PolyActivation(m):
18 | # wildcard of the day - let's do inception style activation because I've no idea which is best
19 | # and frequently I get great boosts from switching activation functions
20 | activations = ['tanh', 'sigmoid', 'softmax', 'softplus', 'relu']
21 |
22 | # TODO: Add dense layer to resize back to original size
23 | # I cannot work out how to do that in Keras yet :/
24 | return Concatenate()([
25 | Activation(i)(m) for i in activations
26 | ])
27 |
28 |
29 | # Choose activation function for me
30 | # More efficient than PolyActivation
31 | # @returns Same sized tensor as input
32 | def PolySwitchActivation(m):
33 | # will fail for shared nodes
34 | print(m.shape)
35 |
36 | if len(m.shape) != 3:
37 | # TODO: make this work in a sane way
38 | m = Reshape([i for i in m.shape.dims if i is not None] + [1])(m) # warning: assumes tensorflow
39 |
40 | activations = ['tanh', 'sigmoid', 'softmax', 'softplus', 'relu']
41 | return add([
42 | Conv1D(1,1)(Activation(i)(m)) for i in activations
43 | ])
44 |
45 | class Model(object):
46 |
47 | @classmethod
48 | def generate(cls, experiment, dataset):
49 | params = experiment.params
50 |
51 | # TODO: Move this into Experiment header
52 | n_styles = 6
53 | n_sequence = 100
54 |
55 | bs = experiment.params.batch_size
56 |
57 | if experiment.name == "review_from_visible_style":
58 | model = Sequential([
59 | Dense(8,
60 | input_shape=dataset.input_shape,
61 | activation='softmax'),
62 | Dense(1, activation='sigmoid'),
63 | ])
64 |
65 |
66 | elif experiment.name == "review_from_hidden_style_neighbor_conv":
67 | neighbors = Input(shape=(n_sequence,n_styles*2,), dtype='float32', name='neighbors')
68 | person = Input(shape=(n_styles,), dtype='float32', name='person')
69 |
70 | m = cls.style_from_neighbors(neighbors, n_styles, n_sequence)
71 | m = Concatenate()([m, person])
72 | m = Dense(n_styles*4)(m)
73 | m = PolyActivation(m)
74 | m = Dense(1, activation='sigmoid')(m)
75 |
76 | model = keras.models.Model(inputs=[person, neighbors], outputs=[m])
77 |
78 |
79 | elif experiment.name == "style_from_neighbor_conv":
80 | neighbors = Input(shape=(n_sequence,n_styles+2,), dtype='float32', name='neighbors')
81 | m = cls.style_from_neighbors(neighbors, n_styles, n_sequence)
82 |
83 | model = keras.models.Model(inputs=[neighbors], outputs=[m])
84 |
85 |
86 | elif experiment.name == "style_from_neighbor_rnn":
87 | neighbors = Input(shape=(n_sequence,n_styles+2,), dtype='float32', name='neighbors')
88 | m = LSTM(n_styles*4)(neighbors)
89 | m = Dense(n_styles)(m)
90 | m = Activation('sigmoid', name='final_activation')(m)
91 |
92 | model = keras.models.Model(inputs=[neighbors], outputs=[m])
93 |
94 |
95 | elif experiment.name == "review_from_all_hidden_simple_unroll":
96 | thinking_width = 10
97 |
98 | neighbors = Input(shape=(experiment.header.params["neighbor_count"],4,), dtype='float32', name='neighbors')
99 | m = Conv1D(thinking_width, 1, activation='tanh')(neighbors)
100 | m = MaxPooling1D(experiment.header.params["neighbor_count"])(m)
101 | m = Reshape([thinking_width])(m)
102 | m = Dense(1)(m)
103 | m = Activation("sigmoid", name='final_activation')(m)
104 |
105 | model = keras.models.Model(inputs=[neighbors], outputs=[m])
106 |
107 |
108 | elif experiment.name == 'review_from_all_hidden_random_walks':
109 |
110 | ss = experiment.header.params["sequence_size"]
111 | ps = experiment.header.params["patch_size"]
112 | pw = experiment.header.params["patch_width"]
113 |
114 | patch = Input(batch_shape=(bs,ss,ps,pw), dtype='float32', name="patch")
115 | # flat_patch = Reshape([ss*ps*pw])(patch)
116 | # score = Dense(experiment.header.params["working_width"]*2, activation="tanh")(flat_patch)
117 | # score = Dense(experiment.header.params["working_width"], activation="tanh")(flat_patch)
118 |
119 | # rnn = PatchNTM(experiment).build()
120 | # score = rnn(patch)
121 |
122 | # Data format
123 | # x = [x_path, x_path, x_path]
124 | # x_path = [x_node, x_node, x_node]
125 | # x_node = [score, is_target, label_one_hot..., address_one_hot...]  (see DatasetHelpers.package_node)
126 |
127 | # x = [
128 | # [
129 | # [score, is_target, label...]:Node,
130 | # [score, is_target, label...]:Node
131 | # ]:Path,
132 | # [
133 | # [score, is_target, label...]:Node,
134 | # [score, is_target, label...]:Node
135 | # ]:Path
136 | # ]:Sequence
137 |
138 | # Convolve path-pattern
139 | channels = 8
140 | pattern_length = 8
141 |
142 | m = patch
143 |
144 | # Add channels for convolution
145 | m = Lambda(lambda x: K.expand_dims(x, axis=-1))(m)
146 |
147 | # Compute!!
148 | m = Conv3D(channels, (1, pattern_length, pw), activation='relu')(m)
149 | pattern_conv_out_size = ps - pattern_length + 1
150 |
151 | m = Reshape([ss * channels * pattern_conv_out_size])(m)
152 | m = Dense(4, activation="relu", name="score_dense")(m)
153 | score = Dense(1, activation="sigmoid", name="score_out")(m)
154 |
155 | model = keras.models.Model(inputs=[patch], outputs=[score])
156 |
157 |
158 | elif experiment.name == 'review_from_all_hidden_adj':
159 |
160 | pr_c = experiment.header.params["product_count"]
161 | pe_c = experiment.header.params["person_count"]
162 | style_width = experiment.header.params["style_width"]
163 |
164 | adj_con = Input(batch_shape=(bs, pr_c, pe_c), dtype='float32', name="adj_con")
165 | features = Adjacency(pe_c, pr_c, style_width, name="hidden_to_adj")(adj_con)
166 |
167 | model = keras.models.Model(inputs=[adj_con], outputs=[features])
168 |
169 | model.compile(loss=keras.losses.mean_squared_error,
170 | optimizer=keras.optimizers.Adam(lr=0.2, decay=0.01),
171 | metrics=['accuracy'])
172 |
173 | return model
174 |
175 |
176 |
177 | # Compile time!
178 | if experiment.header.target == float:
179 | model.compile(loss=keras.losses.mean_squared_error,
180 | optimizer=keras.optimizers.SGD(lr=0.3),
181 | metrics=['accuracy'])
182 |
183 | elif experiment.header.target == list:
184 | model.compile(loss='categorical_crossentropy',
185 | optimizer=keras.optimizers.SGD(lr=0.3),
186 | metrics=['accuracy'])
187 |
188 |
189 |
190 | return model
191 |
192 | @classmethod
193 | def style_from_neighbors(cls, neighbors, n_styles, n_sequence):
194 | m = Conv1D(n_styles, 1, activation='tanh')(neighbors)
195 | m = MaxPooling1D(n_sequence)(m)
196 | m = Reshape([n_styles])(m)
197 | m = Dense(n_styles)(m)
198 | m = Activation('softmax')(m)
199 |
200 | return m
201 |
202 |
203 |
--------------------------------------------------------------------------------
/graph_ml/ntm.py:
--------------------------------------------------------------------------------
1 |
2 | import keras
3 | import keras.backend as K
4 |
5 | import tensorflow as tf
6 |
7 | from keras.models import Model
8 | from keras.layers import *
9 | from recurrentshop import RecurrentModel
10 |
11 | from .util import *
12 |
13 | class NTMBase(object):
14 |
15 | def __init__(self, experiment):
16 | self.experiment = experiment
17 |
18 | self.patch_size = experiment.header.params["patch_size"]
19 | self.patch_width = experiment.header.params["patch_width"]
20 | self.working_width = experiment.header.params["working_width"]
21 | self.word_size = self.experiment.header.params["word_size"]
22 | self.batch_size = self.experiment.params.batch_size
23 | self.memory_size = self.experiment.header.params["memory_size"]
24 | self.patch_data_width = self.patch_width - self.memory_size
25 |
26 | self.word_shape = [self.word_size]
27 | self.word_shape_batch = [self.batch_size, self.word_size]
28 | self.memory_shape = [self.memory_size, self.word_size]
29 | self.memory_shape_batch = [self.batch_size] + self.memory_shape
30 |
31 |
32 | def combine_nodes(self, patch, width):
33 | patch_data = Lambda(lambda x: x[:,:,0:self.patch_data_width:])(patch)
34 |
35 | n1 = Conv1D(
36 | filters=width,
37 | kernel_size=1,
38 | activation='tanh',
39 | kernel_initializer='random_uniform',
40 | bias_initializer='zeros',
41 | name="ConvPatch1")(patch_data)
42 |
43 | n2 = Conv1D(
44 | filters=width,
45 | kernel_size=1,
46 | activation='tanh',
47 | kernel_initializer='random_uniform',
48 | bias_initializer='zeros',
49 | name="ConvPatch2")(patch_data)
50 |
51 | n = multiply([n1, n2])
52 |
53 | n = Conv1D(
54 | filters=width,
55 | kernel_size=1,
56 | activation='tanh',
57 | kernel_initializer='random_uniform',
58 | bias_initializer='zeros',
59 | name="ConvPatch3")(n)
60 |
61 | n = MaxPooling1D(self.patch_size)(n)
62 | n = Reshape([width])(n)
63 | return n
64 |
65 | def patch_extract(self, address, patch, slice_begin):
66 | extract_width = self.patch_width - (slice_begin % self.patch_width)
67 |
68 | address_repeated = Lambda(lambda x:K.repeat_elements(K.expand_dims(x, -1), extract_width, -1))(address)
69 | patch_slices = Lambda(lambda x: x[:,:,slice_begin::])(patch)
70 | assert_shape(patch_slices, [self.patch_size, extract_width])
71 |
72 | rows = multiply([patch_slices, address_repeated])
73 | row = Lambda(lambda x: K.sum(x,-2))(rows)
74 | assert_shape(row, [extract_width])
75 |
76 | return row
77 |
78 | def resolve_address(self, address, patch):
79 | assert_shape(address, [self.patch_size])
80 | assert_shape(patch, [self.patch_size, self.patch_width])
81 | return self.patch_extract(address, patch, -self.memory_size)
82 |
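    | # Content read: a weighted sum of memory rows, weighted by the (soft one-hot) address.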
83 | def read(self, memory, address):
84 | address_repeated = Lambda(lambda x:K.repeat_elements(K.expand_dims(x, -1), self.word_size, -1))(address)
85 | read_rows = multiply([memory, address_repeated])
86 | read = Lambda(lambda x: K.sum(x,-2))(read_rows)
87 |
88 | assert_shape(read, [self.word_size])
89 |
90 | return read
91 |
92 | def write(self, memory, address, write):
93 | assert_shape(memory, self.memory_shape)
94 | assert_shape(write, [self.word_size])
95 | assert_shape(address, [self.memory_size])
96 |
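    | # Additive write: the outer product of address and word is added onto memory,
    | # so a one-hot address writes the word into exactly one row.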
97 | address_expanded = expand_dims(address, -1)
98 | write = expand_dims(write, 1)
99 | write_e = dot([address_expanded, write], axes=[2,1], name="WriteExpanded")
100 | memory = add([memory, write_e], name="MemoryWrite")
101 | return memory
102 |
103 | def erase(self, memory, address, erase):
104 | assert_shape(memory, self.memory_shape)
105 | assert_shape(erase, [self.word_size])
106 | assert_shape(address, [self.memory_size])
107 |
108 | erase = expand_dims(erase, 1)
109 | address_expanded = expand_dims(address, -1)
110 | erase_e = dot([address_expanded, erase], axes=[2,1], name="EraseExpanded")
111 | assert_shape(erase_e, self.memory_shape)
112 | erase_mask = Lambda(lambda x: 1.0 - x)(erase_e)
113 | memory = multiply([memory, erase_mask])
114 | return memory
115 |
116 | def generate_address(self, input_data, patch, name):
117 | address_ptr = Dense(self.patch_size, activation="softplus",name=name)(input_data)
118 | address = self.resolve_address(address_ptr, patch)
119 | return address
120 |
121 |
122 | class PatchNTM(NTMBase):
123 |
124 | def __init__(self, experiment):
125 | NTMBase.__init__(self, experiment)
126 |
127 | def build(self):
128 |
129 | patch = Input((self.patch_size, self.patch_width), name="InputPatch")
130 | memory_tm1 = Input(batch_shape=self.memory_shape_batch, name="Memory")
131 | memory_t = memory_tm1
132 |
133 | # conv = self.combine_nodes(patch, working_width)
134 | # first_node = Lambda(lambda x: x[:,:self.patch_data_width])(flat_patch)
135 | patch_without_memory_addr = Lambda(lambda x: x[:,:,:self.patch_data_width:])(patch)
136 | flat_patch = Reshape([self.patch_size*self.patch_data_width])(patch_without_memory_addr)
137 |
138 | working_memory = Dense(self.working_width, activation='relu')(flat_patch)
139 | # conv = self.combine_nodes(patch, self.working_width)
140 | # working_memory = concatenate([working_memory, conv])
141 | # working_memory = Dense(self.working_width, activation='relu')(working_memory)
142 |
143 | pre_memory = working_memory
144 |
145 | use_memory = False
146 |
147 | if use_memory:
148 | # ------- Memory operations --------- #
149 |
150 | primary_address = Lambda(lambda x: x[:,3,self.patch_data_width:])(patch)
151 | print(primary_address)
152 |
153 | address = self.generate_address(primary_address, patch, name="address_read1")
154 | read1 = self.read(memory_t, address)
155 |
156 | # Turn batch dimension from None to batch_size
157 | batched_working_memory = Lambda(lambda x: K.reshape(x, [self.batch_size, self.working_width]))(working_memory)
158 | batched_working_memory = concatenate([batched_working_memory, read1], batch_size=self.batch_size)
159 |
160 | batched_working_memory = Dense(self.working_width, activation='relu')(batched_working_memory)
161 |
162 | erase_word = Dense(self.word_size, name="DenseEraseWord", activation='relu')(batched_working_memory)
163 | # address = self.generate_address(batched_working_memory, patch, name="address_erase")
164 | erase_word = Lambda(lambda x: K.ones_like(x))(erase_word)
165 | memory_t = self.erase(memory_t, primary_address, erase_word)
166 |
167 | write_word = Dense(self.word_size, name="DenseWriteWord", activation='relu')(batched_working_memory)
168 | # address = self.generate_address(batched_working_memory, patch, name="address_write")
169 | memory_t = self.write(memory_t, primary_address, write_word)
170 |
171 | # address = self.generate_address(batched_working_memory, patch, name="address_read2")
172 | # read2 = self.read(memory_t, address)
173 |
174 | # working_memory = concatenate([batched_working_memory, read1])
175 | working_memory = Dense(self.working_width, activation="relu")(batched_working_memory)
176 |
177 |
178 | return RecurrentModel(
179 | input=patch,
180 | output=working_memory,
181 | return_sequences=True,
182 | stateful=True,
183 |
184 | initial_states=[memory_tm1],
185 | final_states=[memory_t],
186 | state_initializer=[initializers.random_normal(stddev=1.0)]
187 | )
188 |
189 |
190 |
--------------------------------------------------------------------------------
/graph_ml/path.py:
--------------------------------------------------------------------------------
1 |
2 | import hashlib
3 | import os.path
4 | import logging
5 |
6 | logger = logging.getLogger(__name__)
7 |
8 | def generate_path(experiment, prefix, suffix, extra=""):
9 | query = experiment.header.cypher_query
10 | m = hashlib.md5()
11 |
12 | m.update(query.encode('utf-8'))
13 | m.update(extra.encode('utf-8'))
14 | # logger.info(f"generate_path {prefix} {suffix} {query} {extra}")
15 | return os.path.join(prefix + '/' + experiment.name + '_' + m.hexdigest() + suffix)
16 |
17 | def generate_output_path(experiment, suffix):
18 | return generate_path(experiment, experiment.params.output_dir, suffix)
19 |
20 | def generate_data_path(experiment, suffix, query_params=None):
21 | return generate_path(experiment, experiment.params.data_dir, suffix, str(query_params))
22 |
--------------------------------------------------------------------------------
/graph_ml/train.py:
--------------------------------------------------------------------------------
1 |
2 | import os.path
3 | from datetime import datetime
4 | import logging
5 | from sklearn.metrics import classification_report
6 | import itertools
7 |
8 | import keras
9 | import numpy as np
10 | import keras.callbacks
11 |
12 | from .model import Model
13 | from .dataset import Dataset
14 | from .path import generate_output_path
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 | class StopEarlyIfAbove(keras.callbacks.Callback):
19 | def __init__(self, monitor='val_acc', value=0.99, verbose=0, patience=3):
20 | super(keras.callbacks.Callback, self).__init__()
21 | self.monitor = monitor
22 | self.value = value
23 | self.verbose = verbose
24 | self.stopped_epoch = 0
25 | self.patience = patience
26 |
27 | def on_epoch_end(self, epoch, logs={}):
28 | current = logs.get(self.monitor)
29 | if current is None:
30 | logger.error("Early stopping requires %s available!" % self.monitor)
31 | exit()
32 |
33 | if current > self.value:
34 | self.patience -= 1
35 | if self.patience <= 0:
36 | self.stopped_epoch = epoch
37 | self.model.stop_training = True
38 |
39 | def on_train_end(self, logs=None):
40 | if self.stopped_epoch > 0 and self.verbose > 0:
41 | logger.info("Epoch {}: early stopping {} > {}".format(self.stopped_epoch+1, self.monitor, self.value))
42 |
43 |
44 | class SpecialValidator(keras.callbacks.Callback):
45 | def __init__(self, experiment, dataset, model, verbose):
46 | self.experiment = experiment
47 | self.model = model
48 | self.dataset = dataset
49 | self.verbose = verbose
50 | super(keras.callbacks.Callback, self).__init__()
51 |
52 |
53 | def on_train_end(self, logs):
54 | self.test(self.verbose)
55 |
56 | def on_epoch_end(self, epoch, logs):
57 | self.test()
58 |
59 | def test(self, verbose=False):
60 | print() # Clear from epoch status bar
61 | for (label, genie) in self.dataset.generator.items():
62 | # print(f"Prediction for {label}")
63 |
64 | row = genie.peek()
65 | y_true = row[1][0]
66 | x_test = row[0][0]
67 |
68 | y_pred = self.model.predict_generator(
69 | generator=genie,
70 | steps=1,
71 | workers=0,
72 | use_multiprocessing=False,
73 | )
74 | y_pred = np.array(y_pred[0])
75 |
76 | y_correct = np.isclose(y_pred, y_true, atol=0.1)
77 | y_zero = np.isclose(y_pred, 0, atol=0.1)
78 |
79 | # The bits that should be one
80 | y_true_set_and_in_mask = np.where(np.greater(y_true, 0.1), np.greater(x_test, 0.1), False)
81 |
82 | # The bits that should be one and were one
83 | y_masked = np.where(y_true_set_and_in_mask, y_correct, False)
84 |
85 | # The correct predictions for the input adj
86 | y_masked_david = np.where(np.greater(x_test, 0.1), y_correct, False)
87 |
88 | if verbose:
89 | print("y_pred: ", np.around(y_pred, 1))
90 | print("y_correct: ", y_correct)
91 | # print(f"y_masked {np.count_nonzero(y_masked)} / {np.count_nonzero(y_correct)} / {np.count_nonzero(x_test)}")
92 |
93 | net_accuracy = round(np.count_nonzero(y_masked) / (np.count_nonzero(y_true_set_and_in_mask)+0.001) * 100, 3)
94 | net_accuracy_david = round(np.count_nonzero(y_masked_david) / (np.count_nonzero(x_test)+0.001) * 100, 3)
95 | gross_accuracy = round(np.count_nonzero(y_correct) / np.size(y_correct) * 100, 3)
96 |
97 | print(f"{label} 1-accuracy: {net_accuracy}% accuracy: {net_accuracy_david}%")
98 | # print()
99 |
100 | if label == "validate" and net_accuracy == 100:
101 | self.model.stop_training = True
102 |
103 |
104 |
105 |
106 |
107 | class Train(object):
108 |
109 | @staticmethod
110 | def run(experiment, dataset):
111 |
112 | params = experiment.params
113 |
114 | if params.random_seed is not None:
115 | np.random.seed(params.random_seed)
116 |
117 | logger.info("Generate model")
118 |
119 | model = Model.generate(experiment, dataset)
120 | params_file = generate_output_path(experiment, ".hdf5")
121 |
122 | if os.path.isfile(params_file) and params.load_weights:
123 | model.load_weights(params_file)
124 |
125 | callbacks = [
126 | #StopEarlyIfAbove(verbose=params.verbose),
127 | SpecialValidator(experiment, dataset, model, params.print_weights),
128 | # keras.callbacks.ModelCheckpoint(params_file, verbose=params.verbose, save_best_only=True, monitor='val_loss', mode='auto', period=3),
129 | # keras.callbacks.TensorBoard(log_dir=generate_output_path(experiment, f"_log/{experiment.run_tag}/")),
130 | #keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.000000001, patience=8, verbose=0, mode='auto')
131 | ]
132 |
133 | # TODO: move to more general overriding mechanism
134 | # Perhaps unify os.environ, arguments, experiment parameters
135 | if params.epochs is not None:
136 | epochs = params.epochs
137 | else:
138 | epochs = experiment.header.params.get('epochs', 20)
139 |
140 | logger.info("Fit model")
141 |
142 | # Once I've worked out Python multithreading conflicts we can introduce workers > 0
143 | model.fit_generator(
144 | generator=dataset.train_generator,
145 | steps_per_epoch=dataset.steps_per_epoch,
146 | validation_data=dataset.validation_generator,
147 | validation_steps=dataset.validation_steps,
148 |
149 | epochs=epochs,
150 | verbose=params.verbose,
151 |
152 | workers=0,
153 | use_multiprocessing=False,
154 | shuffle=True,
155 | callbacks=callbacks
156 | )
157 |
158 | logger.info("Evaluate model")
159 |
160 | score = model.evaluate_generator(
161 | generator=dataset.test_generator,
162 | steps=dataset.test_steps,
163 | workers=0,
164 | use_multiprocessing=False,
165 | )
166 |
167 |
168 | if params.print_weights:
169 | for layer in model.layers:
170 | for var, weight in zip(layer.weights, layer.get_weights()):
171 | print(f"{var.name} {np.around(weight, decimals=1)}")
172 |
173 |
174 | return score
175 |
176 |
177 |
--------------------------------------------------------------------------------
/graph_ml/util.py:
--------------------------------------------------------------------------------
1 |
2 | from keras.layers import Lambda
3 | import keras.backend as K
4 |
5 | # Take that keras
6 | from tensorflow import float32
7 |
8 | def assert_shape(tensor, shape, strict=False):
9 | if strict:
10 | assert hasattr(tensor, '_keras_shape'), f"{tensor.name} is missing _keras_shape"
11 | assert tensor.shape[1:] == shape, f"{tensor.name} is wrong shape, expected {shape} found {tensor.shape[1:]}"
12 |
13 | def assert_mtx_shape(mtx, shape, name="matrix"):
14 | assert mtx.shape == shape, f"{name} is wrong shape, expected {shape} found {mtx.shape}"
15 |
16 | def expand_dims(v, axis):
17 | return Lambda(lambda x: K.expand_dims(x,axis))(v)
--------------------------------------------------------------------------------
/output/.gitignore:
--------------------------------------------------------------------------------
1 | *
--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | python -m unittest discover test
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .test_memory_cell import Tests
--------------------------------------------------------------------------------
/test/test_memory_cell.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import keras.backend as K
4 | from keras.utils.test_utils import keras_test
5 | from keras.models import Model
6 | from keras.layers import *
7 |
8 | from recurrentshop import RecurrentModel
9 |
10 | import numpy as np
11 | from numpy.testing import *
12 |
13 | import random
14 | from collections import namedtuple
15 | from tensorflow import float32
16 | from unittest import TestCase
17 |
18 | from graph_ml import Train, Dataset
19 | from graph_ml import NTMBase
20 | from experiment import Experiment, ExperimentHeader
21 |
22 | Args = namedtuple('DummyArgs', 'batch_size')
23 |
24 |
25 | class Tests(TestCase):
26 |
27 | @keras_test
28 | def test_memory_ops(self):
29 |
30 | memory_size = 10
31 | word_size = 4
32 | batch_size = 1
33 |
34 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":4, "patch_width":4})
35 | experiment = Experiment("test_memory_cell", header, Args(batch_size))
36 |
37 | # Initialise memory with random values
38 | memory_initial = np.random.random((batch_size, memory_size, word_size))
39 | memory_tm1 = K.constant(memory_initial, name="memory",dtype=float32)
40 | memory_t = memory_tm1
41 |
42 | # Write address is a random int, one-hot encoded
43 | address_w = random.randint(0,memory_size - 1)
44 | address_one_hot_w = np.zeros([batch_size, memory_size])
45 | address_one_hot_w[0][address_w] = 1.0
46 | t_address_w = K.constant(address_one_hot_w, name="address",dtype=float32)
47 |
48 | # Write random pattern
49 | write = np.random.random([batch_size, word_size])
50 | t_write = K.constant(write, name="write")
51 |
52 | pb = NTMBase(experiment)
53 | memory_t = pb.write(memory_t, t_address_w, t_write)
54 | read = pb.read(memory_t, t_address_w)
55 |
56 | address_e = (address_w+1) % memory_size  # erase a different address from the one just written
57 | address_one_hot_e = np.zeros([batch_size, memory_size])
58 | address_one_hot_e[0][address_e] = 1.0
59 | t_address_e = K.constant(address_one_hot_e, name="address",dtype=float32)
60 |
61 | t_erase = K.constant(np.ones([batch_size, word_size]),name="erase")
62 | memory_t = pb.erase(memory_t, t_address_e, t_erase)
63 |
64 | read_final = K.eval(read)
65 | memory_after_erase = K.eval(memory_t)
66 |
67 | write_expected = [write[0] + memory_initial[0][address_w]]  # write is additive: expected row = original row + written word
68 |
69 | for i in range(batch_size):
70 | for j in range(memory_size):
71 | if j == address_w:
72 | assert_allclose(memory_after_erase[i][j], write_expected[0])
73 | elif j == address_e:
74 | assert_allclose(memory_after_erase[i][j], 0)
75 | else:
76 | assert_allclose(memory_after_erase[i][j], memory_initial[i][j])
77 |
78 | assert_allclose(read_final, write_expected)
79 |
80 |
81 | @keras_test
82 | def test_memory_loopback(self):
83 |
84 | memory_size = 10
85 | word_size = 4
86 | batch_size = 1
87 |
88 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":4, "patch_width":4})
89 | experiment = Experiment("test_memory_cell", header, Args(batch_size))
90 |
91 | # Initialise memory with random values
92 | memory_initial = np.random.random((batch_size, memory_size, word_size))
93 | memory_tm1 = K.constant(memory_initial, name="memory",dtype=float32)
94 | memory_t = memory_tm1
95 |
96 | # Write address is a random int, one-hot encoded
97 | address = random.randint(0,memory_size - 1)
98 | address_one_hot = np.zeros([batch_size, memory_size])
99 | address_one_hot[0][address] = 1.0
100 | t_address = K.constant(address_one_hot, name="address",dtype=float32)
101 |
102 | # Write random pattern
103 | write = np.random.random([batch_size, word_size])
104 | t_write = K.constant(write, name="write")
105 | t_erase = K.constant(np.ones([batch_size, word_size]),name="erase")
106 |
107 | pb = NTMBase(experiment)
108 | memory_t = pb.erase(memory_t, t_address, t_erase)
109 | memory_t = pb.write(memory_t, t_address, t_write)
110 | t_read = pb.read( memory_t, t_address)
111 |
112 | read_final = K.eval(t_read)
113 |
114 | assert_allclose(read_final, write)
115 |
116 |
117 | @keras_test
118 | def test_address_resolution(self):
119 |
120 | # Data setup
121 | memory_size = 20
122 | word_size = 4
123 | batch_size = 1
124 | patch_size = 10
125 | patch_width = memory_size + 5
126 |
127 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":patch_size, "patch_width":patch_width})
128 | experiment = Experiment("test_memory_cell", header, Args(batch_size))
129 |
130 | pointer = random.randint(0,patch_size - 1)
131 | pointer_one_hot = np.zeros([batch_size, patch_size])
132 | pointer_one_hot[0][pointer] = 1.0
133 |
134 | patch = np.random.random([batch_size, patch_size, patch_width])
135 |
136 | t_patch = K.constant(patch, dtype=float32, name="patch")
137 | t_pointer_one_hot = K.constant(pointer_one_hot, dtype=float32, name="pointer_one_hot")
138 | pb = NTMBase(experiment)
139 | resolved = K.eval(pb.resolve_address(t_pointer_one_hot, t_patch))
140 |
141 | for i in range(batch_size):
142 | assert_almost_equal(resolved[i], patch[i][pointer][-memory_size:])  # resolved address = last memory_size entries of the pointed-to patch row
143 |
144 |
145 |
146 | @keras_test
147 | def test_address_resolution_gradient(self):
148 |
149 | # Data setup
150 | memory_size = 20
151 | word_size = 4
152 | batch_size = 1
153 | patch_size = 10
154 | patch_width = memory_size + 5
155 |
156 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":patch_size, "patch_width":patch_width})
157 | experiment = Experiment("test_memory_cell", header, Args(batch_size))
158 |
159 | pb = NTMBase(experiment)
160 |
161 | ptr = Input((patch_size,), name="ptr")
162 | patch = Input((patch_size,patch_width), name="patch")
163 | memory = Input((memory_size, word_size), name="memory")
164 |
165 | resolved = pb.resolve_address(ptr, patch)
166 | read = pb.read(memory, resolved)
167 |
168 | out = Dense(3)(read)
169 |
170 | model = Model([ptr, patch, memory], out)
171 | model.compile(loss='mse', optimizer='sgd')
172 |
173 | model.fit({
174 | "ptr": np.random.random((batch_size, patch_size)),
175 | "patch": np.random.random((batch_size, patch_size, patch_width)),
176 | "memory": np.random.random((batch_size, memory_size, word_size)),
177 | }, np.random.random((batch_size, 3)))
178 |
179 |
180 | model.predict({
181 | "ptr": np.zeros((batch_size, patch_size)),
182 | "patch": np.zeros((batch_size, patch_size, patch_width)),
183 | "memory": np.zeros((batch_size, memory_size, word_size)),
184 | })
185 |
186 |
187 | @keras_test
188 | def test_memory_gradient(self):
189 |
190 | # Data setup
191 | memory_size = 20
192 | word_size = 4
193 | batch_size = 1
194 | patch_size = 10
195 | patch_width = memory_size + 5
196 |
197 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":patch_size, "patch_width":patch_width})
198 | experiment = Experiment("test_memory_cell", header, Args(batch_size))
199 |
200 | pb = NTMBase(experiment)
201 |
202 | patch = Input((patch_size, patch_width), name="patch")
203 | memory_tm1 = Input((memory_size, word_size), name="memory")
204 | memory_t = memory_tm1
205 |
206 | flat_patch = Reshape((patch_size*patch_width,))(patch)
207 |
208 | write_word = Dense(word_size)(flat_patch)
209 | erase_word = Dense(word_size)(flat_patch)
210 |
211 | ptr = Dense(patch_size)(flat_patch)
212 | address = pb.resolve_address(ptr, patch)
213 | memory_t = pb.erase(memory_t, address, erase_word)
214 |
215 | ptr = Dense(patch_size)(flat_patch)
216 | address = pb.resolve_address(ptr, patch)
217 | memory_t = pb.write(memory_t, address, write_word)
218 |
219 | ptr = Dense(patch_size)(flat_patch)
220 | address = pb.resolve_address(ptr, patch)
221 | read = pb.read(memory_t, address)
222 |
223 | out = Dense(3)(read)
224 |
225 | model = Model([patch, memory_tm1], out)
226 | model.compile(loss='mse', optimizer='sgd')
227 |
228 | model.fit({
229 | "patch": np.random.random((batch_size, patch_size, patch_width)),
230 | "memory": np.random.random((batch_size, memory_size, word_size)),
231 | }, np.random.random((batch_size, 3)))
232 |
233 |
234 | model.predict({
235 | "patch": np.zeros((batch_size, patch_size, patch_width)),
236 | "memory": np.zeros((batch_size, memory_size, word_size)),
237 | })
238 |
239 |
240 |
241 |
242 | @keras_test
243 | def test_memory_rnn_gradient(self):
244 |
245 | # Data setup
246 | memory_size = 20
247 | word_size = 4
248 | batch_size = 1
249 | patch_size = 10
250 | patch_width = memory_size + 5
251 | sequence_length = 10
252 |
253 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":patch_size, "patch_width":patch_width})
254 | experiment = Experiment("test_memory_cell", header, Args(batch_size))
255 |
256 | pb = NTMBase(experiment)
257 |
258 | patch = Input((patch_size, patch_width), name="patch")
259 | memory_tm1 = Input((memory_size, word_size), name="memory")
260 | memory_t = memory_tm1
261 |
262 | flat_patch = Reshape((patch_size*patch_width,))(patch)
263 |
264 | write_word = Dense(word_size)(flat_patch)
265 | erase_word = Dense(word_size)(flat_patch)
266 |
267 | ptr = Dense(patch_size)(flat_patch)
268 | address = pb.resolve_address(ptr, patch)
269 | memory_t = pb.erase(memory_t, address, erase_word)
270 |
271 | ptr = Dense(patch_size)(flat_patch)
272 | address = pb.resolve_address(ptr, patch)
273 | memory_t = pb.write(memory_t, address, write_word)
274 |
275 | ptr = Dense(patch_size)(flat_patch)
276 | address = pb.resolve_address(ptr, patch)
277 | read = pb.read(memory_t, address)
278 |
279 | out = Dense(3)(read)
280 |
281 | rnn = RecurrentModel(input=patch, output=out, initial_states=[memory_tm1], final_states=[memory_t])
282 | a = Input((sequence_length, patch_size, patch_width), name="patch_seq")
283 | b = rnn(a)
284 | model = Model(a, b)
285 | model.compile(loss='mse', optimizer='sgd')
286 |
287 | model.fit({
288 | "patch_seq": np.random.random((batch_size, sequence_length, patch_size, patch_width)),
289 | # "memory": np.random.random((batch_size, memory_size, word_size)),
290 | }, np.random.random((batch_size, 3)))
291 |
292 |
293 | model.predict({
294 | "patch_seq": np.zeros((batch_size, sequence_length, patch_size, patch_width)),
295 | # "memory": np.zeros((batch_size, memory_size, word_size)),
296 | })
297 |
298 |
--------------------------------------------------------------------------------
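Editor's note: the assertions above pin down the memory semantics that NTMBase (imported from graph_ml) is expected to provide. The following NumPy sketch is reconstructed purely from the test expectations, not from the actual implementation, which operates on Keras tensors and may use soft rather than strictly one-hot addresses.

import numpy as np

def read(memory, address):
    # test_memory_ops: reading returns the addressed row -> address @ memory
    # memory: (batch, memory_size, word_size), address: (batch, memory_size)
    return np.einsum('bm,bmw->bw', address, memory)

def write(memory, address, word):
    # additive write: new row = old row + written word
    return memory + np.einsum('bm,bw->bmw', address, word)

def erase(memory, address, erase_word):
    # multiplicative erase: an all-ones erase word zeroes the addressed row
    return memory * (1.0 - np.einsum('bm,bw->bmw', address, erase_word))

def resolve_address(ptr, patch, memory_size):
    # test_address_resolution: the resolved address is the last memory_size
    # entries of the patch row selected by ptr
    selected = np.einsum('bp,bpw->bw', ptr, patch)
    return selected[:, -memory_size:]

With one-hot addresses these definitions reproduce the expected values in test_memory_ops and test_memory_loopback: erase-then-write at an address leaves exactly the written word there, and a subsequent read returns it.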
/train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from experiment import Experiment
4 |
5 | if __name__ == '__main__':
6 | Experiment.run()
7 |
8 |
--------------------------------------------------------------------------------