├── .floydexpt ├── .floydignore ├── .gitignore ├── .idea ├── misc.xml ├── modules.xml └── vcs.xml ├── LICENSE ├── Pipfile ├── Pipfile.lock ├── README.md ├── bin ├── floyd-run.sh └── start_neo4j_locally.sh ├── config ├── .gitignore ├── __init__.py ├── environment.py └── overrides.py ├── data_sets └── synthetic_review_prediction │ ├── article_0 │ ├── __init__.py │ ├── configure.py │ └── generate.py │ └── utils │ └── dataset_writer.py ├── experiment ├── __init__.py ├── arguments.py ├── directory.py ├── experiment.py └── experiment_header.py ├── floyd_requirements.txt ├── graph_ml ├── __init__.py ├── adjacency_layer.py ├── dataset.py ├── dataset_helpers.py ├── model.py ├── ntm.py ├── path.py ├── train.py └── util.py ├── output └── .gitignore ├── test.sh ├── test ├── __init__.py └── test_memory_cell.py └── train.py /.floydexpt: -------------------------------------------------------------------------------- 1 | {"family_id": "XaCDPUiGtasLwxhbKi4y7S", "name": "graph-investigations"} -------------------------------------------------------------------------------- /.floydignore: -------------------------------------------------------------------------------- 1 | 2 | # Directories and files to ignore when uploading code to floyd 3 | 4 | .git 5 | .eggs 6 | eggs 7 | lib 8 | lib64 9 | parts 10 | sdist 11 | var 12 | *.pyc 13 | *.swp 14 | .DS_Store 15 | data 16 | output 17 | log -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *~ 6 | # C extensions 7 | *.so 8 | .DS_Store 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | data/ 29 | output/ 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Octavian-ai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | 3 | url = "https://pypi.python.org/simple" 4 | verify_ssl = true 5 | name = "pypi" 6 | 7 | 8 | [packages] 9 | 10 | "neo4j-driver" = "*" 11 | tensorflow = "*" 12 | keras = "*" 13 | numpy = "*" 14 | lazy = "*" 15 | "h5py" = "*" 16 | colorama = "*" 17 | coloredlogs = "*" 18 | more-itertools = "*" 19 | recurrentshop = {git = "https://github.com/datalogai/recurrentshop.git"} 20 | generate-data = {git = "https://github.com/Octavian-ai/generate-data.git"} 21 | colored-traceback = "*" 22 | sklearn = "*" 23 | tqdm = "*" 24 | floyd-cli = "*" 25 | 26 | 27 | [dev-packages] 28 | 29 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "abe2e1e33a7a78d6c130b15bb5444b6c11496bc59b24304e35c505c2081a253b" 5 | }, 6 | "host-environment-markers": { 7 | "implementation_name": "cpython", 8 | "implementation_version": "3.6.2", 9 | "os_name": "posix", 10 | "platform_machine": "x86_64", 11 | "platform_python_implementation": "CPython", 12 | "platform_release": "17.3.0", 13 | "platform_system": "Darwin", 14 | "platform_version": "Darwin Kernel Version 17.3.0: Thu Nov 9 18:09:22 PST 2017; root:xnu-4570.31.3~1/RELEASE_X86_64", 15 | "python_full_version": "3.6.2", 16 | "python_version": "3.6", 17 | "sys_platform": "darwin" 18 | }, 19 | "pipfile-spec": 6, 20 | "requires": {}, 21 | "sources": [ 22 | { 23 | "name": "pypi", 24 | "url": "https://pypi.python.org/simple", 25 | "verify_ssl": true 26 | } 27 | ] 28 | }, 29 | "default": { 30 | "args": { 31 | "hashes": [ 32 | "sha256:a785b8d837625e9b61c39108532d95b85274acd679693b71ebb5156848fcf814" 33 | ], 34 | "version": "==0.1.0" 35 | }, 36 | "backports.weakref": { 37 | "hashes": [ 38 | "sha256:81bc9b51c0abc58edc76aefbbc68c62a787918ffe943a37947e162c3f8e19e82", 39 | "sha256:bc4170a29915f8b22c9e7c4939701859650f2eb84184aee80da329ac0b9825c2" 40 | ], 41 | "version": "==1.0.post1" 42 | }, 43 | "bleach": { 44 | "hashes": [ 45 | "sha256:e67f46adcec78dbc3c04462f3aba3213a673d5652eba2609ed1ef15492a44b8d", 46 | "sha256:978e758599b54cd3caa2e160d74102879b230ea8dc93871d0783721eef58bc65" 47 | ], 48 | "version": "==1.5.0" 49 | }, 50 | "certifi": { 51 | "hashes": [ 52 | "sha256:14131608ad2fd56836d33a71ee60fa1c82bc9d2c8d98b7bdbc631fe1b3cd1296", 53 | "sha256:edbc3f203427eef571f79a7692bb160a2b0f7ccaa31953e99bd17e307cf63f7d" 54 | ], 55 | "version": "==2018.1.18" 56 | }, 57 | "chardet": { 58 | "hashes": [ 59 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691", 60 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae" 61 | ], 62 | "version": "==3.0.4" 63 | }, 64 | "click": { 65 | "hashes": [ 66 | "sha256:29f99fc6125fbc931b758dc053b3114e55c77a6e4c6c3a2674a2dc986016381d", 67 | "sha256:f15516df478d5a56180fbf80e68f206010e6d160fc39fa508b65e035fd75130b" 68 | ], 69 | "version": "==6.7" 70 | }, 71 | "clint": { 72 | "hashes": [ 73 | "sha256:05224c32b1075563d0b16d0015faaf9da43aa214e4a2140e51f08789e7a4c5aa" 74 | ], 75 | "version": "==0.5.1" 76 | }, 77 | "colorama": { 78 | "hashes": [ 79 | "sha256:463f8483208e921368c9f306094eb6f725c6ca42b0f97e313cb5d5512459feda", 80 | "sha256:48eb22f4f8461b1df5734a074b57042430fb06e1d61bd1e11b078c0fe6d7a1f1" 81 | ], 82 | "version": "==0.3.9" 83 | }, 84 | "colored-traceback": 
{ 85 | "hashes": [ 86 | "sha256:f76c21a4b4c72e9e09763d4d1b234afc469c88693152a763ad6786467ef9e79f", 87 | "sha256:6da7ce2b1da869f6bb54c927b415b95727c4bb6d9a84c4615ea77d9872911b05" 88 | ], 89 | "version": "==0.3.0" 90 | }, 91 | "coloredlogs": { 92 | "hashes": [ 93 | "sha256:6bd7ceac109c3f2e138db8578396664b1067f32aca55d3280a57dbf05f1ada6c", 94 | "sha256:e3b19320bd21bde506444601a71397cf5215f040df06503013697c6261b05de9" 95 | ], 96 | "version": "==9.0" 97 | }, 98 | "contextlib2": { 99 | "hashes": [ 100 | "sha256:f5260a6e679d2ff42ec91ec5252f4eeffdcf21053db9113bd0a8e4d953769c00", 101 | "sha256:509f9419ee91cdd00ba34443217d5ca51f5a364a404e1dce9e8979cea969ca48" 102 | ], 103 | "markers": "python_version < '3.2'", 104 | "version": "==0.5.5" 105 | }, 106 | "enum34": { 107 | "hashes": [ 108 | "sha256:6bd0f6ad48ec2aa117d3d141940d484deccda84d4fcd884f5c3d93c23ecd8c79", 109 | "sha256:644837f692e5f550741432dd3f223bbb9852018674981b1664e5dc339387588a", 110 | "sha256:8ad8c4783bf61ded74527bffb48ed9b54166685e4230386a9ed9b1279e2df5b1", 111 | "sha256:2d81cbbe0e73112bdfe6ef8576f2238f2ba27dd0d55752a776c41d38b7da2850" 112 | ], 113 | "version": "==1.1.6" 114 | }, 115 | "floyd-cli": { 116 | "hashes": [ 117 | "sha256:0ecd7d42b91ab88b4e3e852f37c22f8ede849de96e5c0f3b9c20e4bd6fad5bbc" 118 | ], 119 | "version": "==0.10.31" 120 | }, 121 | "funcsigs": { 122 | "hashes": [ 123 | "sha256:330cc27ccbf7f1e992e69fef78261dc7c6569012cf397db8d3de0234e6c937ca", 124 | "sha256:a7bb0f2cf3a3fd1ab2732cb49eba4252c2af4240442415b4abce3b87022a8f50" 125 | ], 126 | "markers": "python_version < '3.3'", 127 | "version": "==1.0.2" 128 | }, 129 | "futures": { 130 | "hashes": [ 131 | "sha256:c4884a65654a7c45435063e14ae85280eb1f111d94e542396717ba9828c4337f", 132 | "sha256:51ecb45f0add83c806c68e4b06106f90db260585b25ef2abfcda0bd95c0132fd" 133 | ], 134 | "markers": "python_version < '3.2'", 135 | "version": "==3.1.1" 136 | }, 137 | "generate-data": { 138 | "git": "https://github.com/Octavian-ai/generate-data.git" 139 | }, 140 | "h5py": { 141 | "hashes": [ 142 | "sha256:562045c57a2e47aca9c716ac8cd64448d4897c0f5fe456ab5a34b17c8b3907cb", 143 | "sha256:e1bfcfa2c425dc0f637d4edd858b94e400bbb5746dba324ace124d55fc21d3df", 144 | "sha256:9e0537058efea7547d976f9c00067f7193727bb41ce6b4733c52de35beaa46f5", 145 | "sha256:9d9fb861e10735c5c710fe18f34c69e470cf161a4ba38717b7dde21de2d33760", 146 | "sha256:2d137a1b2f529e58886b5865f6dec51cd96ea0671dd84cebc6dba5cd8c7d0a75", 147 | "sha256:2ccb4f405059314829ebad1859d2c68e133a9d13ca7c3cc7a298a76a438fd09c", 148 | "sha256:52204972a02032d6a427addd37a24a22a2b97d4bce0850c84a6995db9c91926c", 149 | "sha256:1be9cd57e74b24f836d0d2c34ae376ff2df704f40aa8815aa9113b5a860d467f", 150 | "sha256:2258fca3533a3276fd86e9196326786f408a95748ac707c010fff265edf60342", 151 | "sha256:66609c48f8841357ced4291b7c9009518bb6e6fec449d91eb46aa417b6f5f4cf", 152 | "sha256:4a6e6cd8668fa453864f4f9e243460dcc2d41e79d14516b84f4ba74ebcc5b222", 153 | "sha256:a314e5e98037ece52ad0b88b4e0d788ca554935268f3e9d293ca9bcd18611b42", 154 | "sha256:478efa37b84a56061af5fcd286678331e873e216f6c5987cd31f9666edc2f157", 155 | "sha256:2b91c9117f2e7a2ef924bec41ac77e57567bec6731773373bf78eb4387b39a2a", 156 | "sha256:07ddea6bb649a257fc57ccae359a36d691b2ef8b9617971ae7d6f74ef6f67cad", 157 | "sha256:bb990d8663dbeee22ce44135ffd65ab38bd23d6a689722a653cfbf2d18d46688", 158 | "sha256:e78f09a44fc9256b84c9df98edf7b6ead3b3da2e12bf2d1e00384960a6a78a1a", 159 | "sha256:40dd37cbf24ca3b935a8d6eb8960ec5d0381219f82317bdc40aa9e08b3fcc143", 160 | 
"sha256:1fad9aa32835230de77b31edd6980b7c202de7bb7d8384d1bcb47b5dd32c8c7c", 161 | "sha256:537a60879485e5ce484ab4350c7bd8b3da4b531f9f82ef0a18780beabde98c90", 162 | "sha256:c050791989cd9979fe57a770d4e323b2e67ef95800e89e7dc6ad3652b8ccd86f", 163 | "sha256:b7e1c42367513108c3615cf1a24a9d366fd93eb9d2d92085bafb3011b785e8a9", 164 | "sha256:180a688311e826ff6ae6d3bda9b5c292b90b28787525ddfcb10a29d5ddcae2cc" 165 | ], 166 | "version": "==2.7.1" 167 | }, 168 | "html5lib": { 169 | "hashes": [ 170 | "sha256:2612a191a8d5842bfa057e41ba50bbb9dcb722419d2408c78cff4758d0754868" 171 | ], 172 | "version": "==0.9999999" 173 | }, 174 | "humanfriendly": { 175 | "hashes": [ 176 | "sha256:587b16ce804bec8e3cbb8c420decea051b38e3d895272b2c1e38fc69b4286b1c", 177 | "sha256:d0e74171b87318a94b99520e4f0c5651e944b5f11d696c46be3330bb82b85300" 178 | ], 179 | "version": "==4.8" 180 | }, 181 | "idna": { 182 | "hashes": [ 183 | "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4", 184 | "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f" 185 | ], 186 | "version": "==2.6" 187 | }, 188 | "keras": { 189 | "hashes": [ 190 | "sha256:7b1116bad7fb497758cfaffcd180e9adc2904be7deec2d9164543955e9973d0b", 191 | "sha256:7ca3a381523bad40a6922e88951a316664cb088fd01cea07e5ec8ada3327e3c7" 192 | ], 193 | "version": "==2.1.3" 194 | }, 195 | "lazy": { 196 | "hashes": [ 197 | "sha256:c80a77bf7106ba7b27378759900cfefef38271088dc63b014bcfe610c8e68e3d" 198 | ], 199 | "version": "==1.3" 200 | }, 201 | "markdown": { 202 | "hashes": [ 203 | "sha256:9ba587db9daee7ec761cfc656272be6aabe2ed300fece21208e4aab2e457bc8f", 204 | "sha256:a856869c7ff079ad84a3e19cd87a64998350c2b94e9e08e44270faef33400f81" 205 | ], 206 | "version": "==2.6.11" 207 | }, 208 | "marshmallow": { 209 | "hashes": [ 210 | "sha256:8740ada95f47fa19f905772aa4932dc5512226a90c30da5672d6d6bf3dd791a7", 211 | "sha256:d3f31fe7be2106b1d783cbd0765ef4e1c6615505514695f33082805f929dd584" 212 | ], 213 | "version": "==2.15.0" 214 | }, 215 | "mock": { 216 | "hashes": [ 217 | "sha256:5ce3c71c5545b472da17b72268978914d0252980348636840bd34a00b5cc96c1", 218 | "sha256:b158b6df76edd239b8208d481dc46b6afd45a846b7812ff0ce58971cf5bc8bba" 219 | ], 220 | "version": "==2.0.0" 221 | }, 222 | "monotonic": { 223 | "hashes": [ 224 | "sha256:0bcd2b14e3b7ee7cfde796e408176ceffa01d89646f2e532964ef2aae0c9fa3e", 225 | "sha256:a02611d5b518cd4051bf22d21bd0ae55b3a03f2d2993a19b6c90d9d168691f84" 226 | ], 227 | "markers": "python_version == '2.6' or python_version == '2.7' or python_version == '3.0' or python_version == '3.1' or python_version == '3.2'", 228 | "version": "==1.4" 229 | }, 230 | "more-itertools": { 231 | "hashes": [ 232 | "sha256:11a625025954c20145b37ff6309cd54e39ca94f72f6bb9576d1195db6fa2442e", 233 | "sha256:0dd8f72eeab0d2c3bd489025bb2f6a1b8342f9b198f6fc37b52d15cfa4531fea", 234 | "sha256:c9ce7eccdcb901a2c75d326ea134e0886abfbea5f93e91cc95de9507c0816c44" 235 | ], 236 | "version": "==4.1.0" 237 | }, 238 | "neo4j-driver": { 239 | "hashes": [ 240 | "sha256:a25c9b67e63403b6ca8114d18bee581d2cff032cdc89c68970a4be8cd30585d0" 241 | ], 242 | "version": "==1.5.3" 243 | }, 244 | "numpy": { 245 | "hashes": [ 246 | "sha256:428cd3c0b197cf857671353d8c85833193921af9fafcc169a1f29c7185833d50", 247 | "sha256:a476e437d73e5754aa66e1e75840d0163119c3911b7361f4cd06985212a3c3fb", 248 | "sha256:289ff717138cd9aa133adcbd3c3e284458b9c8230db4d42b39083a3407370317", 249 | "sha256:c5eccb4bf96dbb2436c61bb3c2658139e779679b6ae0d04c5e268e6608b58053", 250 | 
"sha256:75471acf298d455b035226cc609a92aee42c4bb6aa71def85f77fa2c2b646b61", 251 | "sha256:5c54fb98ecf42da59ed93736d1c071842482b18657eb16ba6e466bd873e1b923", 252 | "sha256:9ddf384ac3aacb72e122a8207775cc29727cbd9c531ee1a4b95754f24f42f7f3", 253 | "sha256:781d3197da49c421a07f250750de70a52c42af08ca02a2f7bdb571c0625ae7eb", 254 | "sha256:93b26d6c06a22e64d56aaca32aaaffd27a4143db0ac2f21a048f0b571f2bfc55", 255 | "sha256:b2547f57d05ba59df4289493254f29f4c9082d255f1f97b7e286f40f453e33a1", 256 | "sha256:eef6af1c752eef538a96018ef9bdf8e37bbf28aab50a1436501a4aa47a6467df", 257 | "sha256:ff8a4b2c3ac831964f529a2da506c28d002562b230261ae5c16885f5f53d2e75", 258 | "sha256:194074058c22a4066e1b6a4ea432486ee468d24ab16f13630c1030409e6b8666", 259 | "sha256:4e13f1a848fde960dea33702770265837c72b796a6a3eaac7528cfe75ddefadd", 260 | "sha256:91101216d72749df63968d86611b549438fb18af2c63849c01f9a897516133c7", 261 | "sha256:97507349abb7d1f6b76b877258defe8720833881dc7e7fd052bac90c88587387", 262 | "sha256:1479b46b6040b5c689831496354c8859c456b152d37315673a0c18720b41223b", 263 | "sha256:98b1ac79c160e36093d7914244e40ee1e7164223e795aa2c71dcce367554e646", 264 | "sha256:24bbec9a199f938eab75de8390f410969bc33c218e5430fa1ae9401b00865255", 265 | "sha256:7880f412543e96548374a4bb1d75e4cdb8cad80f3a101ed0f8d0e0428f719c1c", 266 | "sha256:6112f152b76a28c450bbf665da11757078a724a90330112f5b7ea2d6b6cefd67", 267 | "sha256:7c5276763646480143d5f3a6c2acb2885460c765051a1baf4d5070f63d05010f", 268 | "sha256:3de643935b212307b420248018323a44ec51987a336d1d747c1322afc3c099fb" 269 | ], 270 | "version": "==1.14.0" 271 | }, 272 | "pathlib2": { 273 | "hashes": [ 274 | "sha256:db3e43032d23787d3e9aec8c7ef1e0d2c3c589d5f303477661ebda2ca6d4bfba", 275 | "sha256:d32550b75a818b289bd4c1f96b60c89957811da205afcceab75bc8b4857ea5b3" 276 | ], 277 | "version": "==2.3.0" 278 | }, 279 | "pbr": { 280 | "hashes": [ 281 | "sha256:60c25b7dfd054ef9bb0ae327af949dd4676aa09ac3a9471cdc871d8a9213f9ac", 282 | "sha256:05f61c71aaefc02d8e37c0a3eeb9815ff526ea28b3b76324769e6158d7f95be1" 283 | ], 284 | "version": "==3.1.1" 285 | }, 286 | "protobuf": { 287 | "hashes": [ 288 | "sha256:11788df3e176f44e0375fe6361342d7258a457b346504ea259a21b77ffc18a90", 289 | "sha256:50c24f0d00b7efb3a72ae638ddc118e713cfe8cef40527afe24f7ebcb878e46d", 290 | "sha256:41661f9a442eba2f1967f15333ebe9ecc7e7c51bcbaa2972303ad33a4ca0168e", 291 | "sha256:06ec363b74bceb7d018f2171e0892f03ab6816530e2b0f77d725a58264551e48", 292 | "sha256:b20f861b55efd8206428c13e017cc8e2c34b40b2a714446eb202bbf0ff7597a6", 293 | "sha256:c1f9c36004a7ae6f1ce4a23f06070f6b07f57495f251851aa15cc4da16d08378", 294 | "sha256:4d2e665410b0a278d2eb2c0a529ca2366bb325eb2ae34e189a826b71fb1b28cd", 295 | "sha256:95b78959572de7d7fafa3acb718ed71f482932ddddddbd29ba8319c10639d863" 296 | ], 297 | "version": "==3.5.1" 298 | }, 299 | "pygments": { 300 | "hashes": [ 301 | "sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d", 302 | "sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc" 303 | ], 304 | "version": "==2.2.0" 305 | }, 306 | "pytz": { 307 | "hashes": [ 308 | "sha256:80af0f3008046b9975242012a985f04c5df1f01eed4ec1633d56cc47a75a6a48", 309 | "sha256:feb2365914948b8620347784b6b6da356f31c9d03560259070b2f30cff3d469d", 310 | "sha256:59707844a9825589878236ff2f4e0dc9958511b7ffaae94dc615da07d4a68d33", 311 | "sha256:d0ef5ef55ed3d37854320d4926b04a4cb42a2e88f71da9ddfdacfde8e364f027", 312 | "sha256:c41c62827ce9cafacd6f2f7018e4f83a6f1986e87bfd000b8cfbd4ab5da95f1a", 313 | "sha256:8cc90340159b5d7ced6f2ba77694d946fc975b09f1a51d93f3ce3bb399396f94", 
314 | "sha256:dd2e4ca6ce3785c8dd342d1853dd9052b19290d5bf66060846e5dc6b8d6667f7", 315 | "sha256:699d18a2a56f19ee5698ab1123bbcc1d269d061996aeb1eda6d89248d3542b82", 316 | "sha256:fae4cffc040921b8a2d60c6cf0b5d662c1190fe54d718271db4eb17d44a185b7" 317 | ], 318 | "version": "==2017.3" 319 | }, 320 | "pyyaml": { 321 | "hashes": [ 322 | "sha256:3262c96a1ca437e7e4763e2843746588a965426550f3797a79fca9c6199c431f", 323 | "sha256:16b20e970597e051997d90dc2cddc713a2876c47e3d92d59ee198700c5427736", 324 | "sha256:e863072cdf4c72eebf179342c94e6989c67185842d9997960b3e69290b2fa269", 325 | "sha256:bc6bced57f826ca7cb5125a10b23fd0f2fff3b7c4701d64c439a300ce665fff8", 326 | "sha256:c01b880ec30b5a6e6aa67b09a2fe3fb30473008c85cd6a67359a1b15ed6d83a4", 327 | "sha256:827dc04b8fa7d07c44de11fabbc888e627fa8293b695e0f99cb544fdfa1bf0d1", 328 | "sha256:592766c6303207a20efc445587778322d7f73b161bd994f227adaa341ba212ab", 329 | "sha256:5f84523c076ad14ff5e6c037fe1c89a7f73a3e04cf0377cb4d017014976433f3", 330 | "sha256:0c507b7f74b3d2dd4d1322ec8a94794927305ab4cebbe89cc47fe5e81541e6e8", 331 | "sha256:b4c423ab23291d3945ac61346feeb9a0dc4184999ede5e7c43e1ffb975130ae6", 332 | "sha256:ca233c64c6e40eaa6c66ef97058cdc80e8d0157a443655baa1b2966e812807ca", 333 | "sha256:4474f8ea030b5127225b8894d626bb66c01cda098d47a2b0d3429b6700af9fd8", 334 | "sha256:326420cbb492172dec84b0f65c80942de6cedb5233c413dd824483989c000608", 335 | "sha256:5ac82e411044fb129bae5cfbeb3ba626acb2af31a8d17d175004b70862a741a7" 336 | ], 337 | "version": "==3.12" 338 | }, 339 | "raven": { 340 | "hashes": [ 341 | "sha256:0adae40e004dfe2181d1f2883aa3d4ca1cf16dbe449ae4b445b011c6eb220a90", 342 | "sha256:84da75114739191bdf2388f296ffd6177e83567a7fbaf2701e034ad6026e4f3b" 343 | ], 344 | "version": "==6.5.0" 345 | }, 346 | "recurrentshop": { 347 | "git": "https://github.com/datalogai/recurrentshop.git" 348 | }, 349 | "requests": { 350 | "hashes": [ 351 | "sha256:6a1b267aa90cac58ac3a765d067950e7dbbf75b1da07e895d1f594193a40a38b", 352 | "sha256:9c443e7324ba5b85070c4a818ade28bfabedf16ea10206da1132edaa6dda237e" 353 | ], 354 | "version": "==2.18.4" 355 | }, 356 | "requests-toolbelt": { 357 | "hashes": [ 358 | "sha256:42c9c170abc2cacb78b8ab23ac957945c7716249206f90874651971a4acff237", 359 | "sha256:f6a531936c6fa4c6cfce1b9c10d5c4f498d16528d2a54a22ca00011205a187b5" 360 | ], 361 | "version": "==0.8.0" 362 | }, 363 | "scandir": { 364 | "hashes": [ 365 | "sha256:913d0d04f3ea8f38a52a38e930a08deacd3643d71875a0751a5c01e006102998", 366 | "sha256:eb9d4a55bbeb0473a9c7d3ff81e12d44f0ad86daff48b02a95e2398c87ff1a00", 367 | "sha256:2b28d118b372de8950f85b65d8ddfd43643f139a5b721281dd6532bed6b8321c", 368 | "sha256:f14476800cfdd6809d5130840f78ca3c08aa25544113e2b33a0b2fe914583d69", 369 | "sha256:6db5aadb667bb709cc23921203e9c27f08225506a9b84b7ebe2b645dee47a4dd", 370 | "sha256:8129fe7b9211d080457e0ff87397d85bb9be6ebb482b6be6ad9700059ac2e516", 371 | "sha256:8fe782abf9314f2733c09d2191c1b3047475218ddbae90052b5c0f1a4215d5e2", 372 | "sha256:a93b6cc872eeccdc91b4c1c1e510820bee17f79c9455064fb8d3b73b51e52024", 373 | "sha256:9851e782da220073093da68b3451e3c33b10f84eca2aec17a24661c7c63357a2", 374 | "sha256:937d27e367af994afd3792904b794a82645ea9616dd336f5030e0b50e527eb57", 375 | "sha256:e0278a2d4bc6c0569aedbe66bf26c8ab5b2b08378b3289de49257f23ac624338" 376 | ], 377 | "markers": "python_version < '3.5'", 378 | "version": "==1.6" 379 | }, 380 | "scikit-learn": { 381 | "hashes": [ 382 | "sha256:3775cca4ce3f94508bb7c8a6b113044b78c16b0a30a5c169ddeb6b9fe57a8a72", 383 | "sha256:873245b03361710f47c5410a050dc56ee8ae97b9f8dcc6e3a81521ca2b64ad10", 
384 | "sha256:370919e3148253fd6552496c33a1e3d78290a336fc8d1b9349d9e9770fae6ec0", 385 | "sha256:ce78bf4d10bd7e28807c36c6d2ab25a9934aaf80906ad987622a5e45627d91a2", 386 | "sha256:ba3fd442ae1a46830789b3578867daaf2c8409dcca6bf192e30e85beeabbfc2f", 387 | "sha256:a21cf8217e31a9e8e32c559246e05e6909981816152406945ae2e3e244dfcc1f", 388 | "sha256:e54a3dd1fe1f8124de90b93c48d120e6da2ea8df29b6895325df01ddc1bd8e26", 389 | "sha256:f9abae483f4d52acd6f660addb1b67e35dc5748655250af479de2ea6aefc6df0", 390 | "sha256:5c9ff456d67ef9094e5ea272fff2be05d399a47fc30c6c8ed653b94bdf787bd1", 391 | "sha256:871669cdb5b3481650fe3adff46eb97c455e30ecdc307eaf382ef90d4e2570ab", 392 | "sha256:d4da369614e55540c7e830ccdd17ab4fe5412ff8e803a4906d3ece393e2e3a63", 393 | "sha256:42f3c5bd893ed73bf47ccccf04dfb98fae743f397d688bb58c2238c0e6ec15d2", 394 | "sha256:95b155ef6bf829ddfba6026f100ba8e4218b7171ecab97b2163bc9e8d206848f", 395 | "sha256:72c194c5092e921d6107a8de8a5adae58c35bbc54e030ba624b6f02fd823bb21", 396 | "sha256:f528c4b2bba652cf116f5cccf36f4db95a7f9cbfcd1ee549c4e8d0f8628783b5", 397 | "sha256:d384e6f9a055b7a43492f9d27779adb717eb5dcf78b0603b01d0f070a608d241", 398 | "sha256:ee8c3b1898c728b6e5b5659c233f547700a1fea13ce876b6fe7d3434c70cc0e0", 399 | "sha256:56cfa19c31edf62e6414da0a337efee37a4af488b135640e67238786b9be6ab3", 400 | "sha256:5db9e68a384ce80a17fc449d4d5d9b45025fe17cf468429599bf404eccb51049", 401 | "sha256:8b17fc29554c5c98d88142f895516a5bec2b6b61daa815e1193a64c868ad53d2", 402 | "sha256:13136c6e4f6b808569f7f59299d439b2cd718f85d72ea14b5b6077d44ebc7d17", 403 | "sha256:ddc1eb10138ae93c136cc4b5945d3977f302b5d693592a4731b2805a7d7f2a74", 404 | "sha256:5ca0ad32ee04abe0d4ba02c8d89d501b4e5e0304bdf4d45c2e9875a735b323a0", 405 | "sha256:6e0899953611d0c47c0d49c5950082ab016b38811fced91cd2dcc889dd94f50a", 406 | "sha256:b2a10e2f9b73de10d8486f7a23549093436062b69139158802910a0f154aa53b", 407 | "sha256:a58746d4f389ea7df1d908dba8b52f709835f91c342f459a3ade5424330c69d1", 408 | "sha256:fdc39e89bd3466befb76dfc0c258d4ccad159df974954a87de3be5759172a067" 409 | ], 410 | "version": "==0.19.1" 411 | }, 412 | "scipy": { 413 | "hashes": [ 414 | "sha256:70e6fc3f2f52c9152f05e27eb9bd8543cb862cacb71f8521a571e4ffb837f450", 415 | "sha256:08041e5336fcd57defcc78650b44b3df652eff3e3a801638d894e50494fb630d", 416 | "sha256:ff8b6637d8d2c074ed67f3d57513e62f94747c6f1210f43e60ad3d8e93a424e4", 417 | "sha256:5964dba6a3c0be226d44d2520de8fb4ba1501768bad57eec687d36d3f53b6254", 418 | "sha256:bf36f3485e7b7291c36330a93bbfd4f5e8db23bbe4ea46c37b2839fef463f4e2", 419 | "sha256:e3a5673c105eab802fdecb77f102d877352e201df9328698a265b7f57546b34b", 420 | "sha256:cd23894e1cc6eaa00e6807b6b12e4ca66d5ff092986c9c3eb01e97f24e2d6462", 421 | "sha256:23a7238279ae94e088396b8b05a9795ef598dc79c5cd1adb91ad1ff87c7514fd", 422 | "sha256:3b66d5e40152175bca75cbbfd1eb5c108c50de9ae5625923f1c4f8f51cbe2dea", 423 | "sha256:fa17be6c66985931d3a391f61a6ba97c902585cf26020aa3eb24604115732d22", 424 | "sha256:d84df0bc86bbdd49f0a6b6bad5cd62ccb02a3bfe546bf79263de44ae081bcd7b", 425 | "sha256:912499ddb521b7ac6287ac4ccf5f296a83d38996c2d04f43c9e62a91f7b420aa", 426 | "sha256:889602ead28054a15e8c26e1a6b8420d5a4fa777cfeb3ec98cfa52b9f317d153", 427 | "sha256:5774adb6047983489bc81edaa72cd132e665e5680f0b2cf8ea28cd3b99e65d39", 428 | "sha256:01c7040a83eb4e020ab729488637dcadef54cb728b035b76668ab92a72515d60", 429 | "sha256:046705c604c6f1d63cad3e89677c0618b7abb40ed09a4c241c671a2d8e5128a9", 430 | "sha256:1f58fbd59e8d9652759df0d137832ff2a325ed708c173cba20c86589d811c210", 431 | 
"sha256:424500b2fe573d30de6dea927076c01acaadb3efb3d1f40340e8cc37151ccf27", 432 | "sha256:97123a25216616723083942eb595f47fee18da6b637a88b803de5f078009003c", 433 | "sha256:a79b99b8b5af9a63312bd053bbb7bdb7710e6bbb9cc81617f9f6b9b1e49c72f8", 434 | "sha256:9bd193686fd837472bdb6425486cb234ed0a4db76b930c141cc8d095ab213c8d", 435 | "sha256:a9e479648aab5f36330da94f351ebbfe79acb4e6f5e6ac6aeddc9291eb096839", 436 | "sha256:87ea1f11a0e9ec08c264dc64551d501fa307289460705f6fccd84cbfc7926d10" 437 | ], 438 | "version": "==1.0.0" 439 | }, 440 | "six": { 441 | "hashes": [ 442 | "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb", 443 | "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9" 444 | ], 445 | "version": "==1.11.0" 446 | }, 447 | "sklearn": { 448 | "hashes": [ 449 | "sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31" 450 | ], 451 | "version": "==0.0" 452 | }, 453 | "tabulate": { 454 | "hashes": [ 455 | "sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2" 456 | ], 457 | "version": "==0.8.2" 458 | }, 459 | "tensorflow": { 460 | "hashes": [ 461 | "sha256:f9c03acc5d26ac903e177fb904ceb797632830c5a0fae5c8b49d688a748337db", 462 | "sha256:c6d798da0002778f38e3b097acd7a620c89ff060fa3823c054113885b2472173", 463 | "sha256:975cbdeb016c3f14ad44f4919260e279918fba08c4bb3d7172ae4bf1aa612292", 464 | "sha256:62e3884a1d7824f20a172ae2861aab50b1802989e85a971f9dfaf61444226856", 465 | "sha256:9e6681a4b1e46936dbcc56ac213f61633979f6f348319658431181ffc3c1936c", 466 | "sha256:e43641ac5bbfc8a0d37fb8b78657f664856fe83b1ab7acf298f57780e6fbf2de", 467 | "sha256:cceb8439975ea508ffd19a312d7ff83149ab81d7e8a88685852bbea4ded98736", 468 | "sha256:bf51429bc11ab4561b5d124c08a5ee6476519d33b5970338586767563a02adc4", 469 | "sha256:ee96a38a3ba3c53e1cdd8cc2af59d5f378b7992e63c54fba9605c963b209e814", 470 | "sha256:233d66bfad2287c61434384ec315bbf37b2f551beda2e0d37a8c24a0f2ed3896" 471 | ], 472 | "version": "==1.4.1" 473 | }, 474 | "tensorflow-tensorboard": { 475 | "hashes": [ 476 | "sha256:4ff1c16faa8189c921b57ccb5f05ea1e19c276d59de7dcae3d846a6267a132d0", 477 | "sha256:6684571c711e07b3aae25dd91cb4b106738d71acfce385b9d359ab14374ac518" 478 | ], 479 | "version": "==0.4.0" 480 | }, 481 | "tqdm": { 482 | "hashes": [ 483 | "sha256:4c041f8019f7be65b8028ddde9a836f7ccc51c4637f1ff2ba9b5813d38d19d5a", 484 | "sha256:df32e6f127dc0ccbc675eadb33f749abbcb8f174c5cb9ec49c0cdb73aa737377" 485 | ], 486 | "version": "==4.19.5" 487 | }, 488 | "urllib3": { 489 | "hashes": [ 490 | "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b", 491 | "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f" 492 | ], 493 | "version": "==1.22" 494 | }, 495 | "werkzeug": { 496 | "hashes": [ 497 | "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b", 498 | "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c" 499 | ], 500 | "version": "==0.14.1" 501 | }, 502 | "wheel": { 503 | "hashes": [ 504 | "sha256:e721e53864f084f956f40f96124a74da0631ac13fbbd1ba99e8e2b5e9cafdf64", 505 | "sha256:9515fe0a94e823fd90b08d22de45d7bde57c90edce705b22f5e1ecf7e1b653c8" 506 | ], 507 | "version": "==0.30.0" 508 | } 509 | }, 510 | "develop": {} 511 | } 512 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Join our Discord >> https://discord.gg/a2Z82Te 2 | 3 | # Review prediction 4 | 5 | ## Introduction 6 | 7 | The aim of this 
experiment is to investigate the performance of
8 | 1) different NN approaches
9 | 2) different graph representations of the same data
10 | 
11 | on a simple synthetic prediction task.
12 | 
13 | ## The Task
14 | 
15 | We model personalised recommendations as a system containing _people_, _products_ and _reviews_. In our system every product has a _style_ and each person has a _style preference_. _People_ can write _reviews_ of products. In our system the _review score_ will be a function _Y(...)_ of the person's _style preference_ and the product's _style_. We call this function the _opinion function_ i.e.:
16 | 
17 | _review_score_ = _Y(product_style, person_style_preference)_
18 | 
19 | We will generate data using this model. We will then use this synthetic data to investigate how effective various ML approaches are at learning the behaviour of this system from the data set.
20 | 
21 | 
22 | If necessary we can change the opinion function _Y(...)_ to increase or decrease the difficulty of the task.
23 | 
24 | ## The Synthetic Data
25 | 
26 | The synthetic data for this task can be varied in several ways:
27 | 
28 | 1) Change which information is hidden e.g. we could hide _product_style_, _style_preference_ or both.
29 | 1) Change the representation of the key properties e.g. reviews/styles and preferences could be boolean, categorical, continuous scalars or even multi-dimensional vectors.
30 | 1) Change how the data is represented as a graph e.g. reviews could be nodes in their own right, or they could be edges with properties; product_style could be a property on a product node, or a separate node connected to a product node by a _HAS_STYLE_ relationship (edge).
31 | 1) Add additional meaningless or semi-meaningless information to the training data.
32 | 
33 | We will generate different data sets to qualitatively investigate different ML approaches on the same basic system.
34 | 
35 | 
36 | ## Evaluation Tasks
37 | 
38 | We are interested in four different evaluation tasks depending on whether the person or product is included in the training set or not:
39 | 
40 | - **new product == unknown** at training time i.e. not in training set or validation set
41 | - **new person == unknown** at training time i.e. not in training set or validation set
42 | - **existing product == known** at training time i.e. present in training set
43 | - **existing person == known** at training time i.e. present in training set
44 | 
45 | The evaluation tasks we are interested in are: how well can you predict the person's review, given:
46 | 
47 | 1) new product and new person
48 | 1) existing product and new person
49 | 1) new product and existing person
50 | 1) existing product and existing person
51 | 
52 | 
53 | ## Approach
54 | 
55 | Although we have a synthetic system for which we can generate more data, we want to get into good habits for working with "real" data. So we will attempt to blind the ML system to the fact that we are working with synthetic data, and not rely on our ability to generate more information at will.
56 | 
57 | It will be the responsibility of the ML part of the system to split the data into Test / Train and Validation sets. However, for each data set that we generate we will keep back a small portion to make up a "golden" test set which is only to be used at the very end of our investigation. This is to perform a final test of the ML predictor, one for which we haven't had the opportunity to optimise the meta-parameters.
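As a rough illustration of that split discipline (a sketch only, not code from this repository — the function name, the fractions and the `review_rows` input are placeholders): the golden portion is held back first and never shown to the ML code, which then makes its own train/validation/test split.

```python
# Illustrative sketch: hold back a "golden" slice at generation time,
# then let the ML pipeline split the remainder into train/validation/test.
from sklearn.model_selection import train_test_split

def split_reviews(review_rows, golden_fraction=0.05, seed=42):
    # Held back when the data set is generated; only touched at the very end.
    working, golden = train_test_split(review_rows, test_size=golden_fraction, random_state=seed)

    # The ML side only ever sees `working` and is responsible for this split.
    train, rest = train_test_split(working, test_size=0.3, random_state=seed)
    validation, test = train_test_split(rest, test_size=0.5, random_state=seed)

    return train, validation, test, golden
```

In practice we hold back more than one golden set, as described next.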
58 | 
59 | Because of the four different evaluation tasks it will be necessary for us to keep back four different golden test sets, each of a large enough size to test the system regardless of the test/training split. We will keep the following volumes of golden test data:
60 | 
61 | 1) INDEPENDENT: a completely independent data set containing 1000 reviews
62 | 2) NEW_PEOPLE: new people + their reviews of existing products, containing approx 2000 reviews
63 | 3) NEW_PRODUCTS: new products + reviews of them by existing people, containing approx 2000 reviews
64 | 4) EXISTING: 2000 additional reviews between existing people and products.
65 | 
66 | 
67 | 
68 | # The Data Sets
69 | 
70 | ## Data Set 1: A simple binary preference system
71 | 
72 | Products have a binary style and people have a binary preference.
73 | 
74 | - All variables will be 'public' in the data set
75 | 
76 | 
77 | ### Product Style
78 | - _product_style_ will be categorical with two mutually exclusive elements (A and B).
79 | - The distribution of product styles will be uniform i.e. approx 50% of products will have style A and 50% will have style B.
80 | 
81 | 
82 | ### Style Preference
83 | - _person_style_preference_ will be categorical with two mutually exclusive elements (likes_A_dislikes_B | likes_B_dislikes_A).
84 | - The distribution of style preferences will be uniform i.e. approx 50% of people will like style A and 50% will like style B.
85 | 
86 | 
87 | ### Reviews and Opinion Function
88 | - _review_score_ will be boolean (1 for a positive review and 0 for a negative review)
89 | - Each person will have made either 1 or 2 reviews. The mean number of reviews-per-person will be approx 1.5 i.e. approx 50% will have made 2 reviews and 50% will have made 1 review.
90 | - _review_score_ is the dot product of the _product_style_ and _person_style_preference_, normalised to the range of 0 to 1
91 | 
92 | Note: having people with 0 reviews would be useless since you cannot train or validate/test using them.
93 | 
94 | Note: fixing the number of reviews-per-person would restrict the graph structure too much and open up the problem to approaches that we aren't interested in right now.
95 | 
96 | 
97 | ### Entity Ratios and Data Set Size
98 | 
99 | I basically made these up. Intuitively, the reviews-per-product and reviews-per-person parameters affect how much we can infer about the hidden variables of people/products. I like the idea of those figures being very different so we can see how systems cope with that distinction.
100 | 
101 | - _people_:_products_ = 50:1
102 | - _people_:_reviews_ = 1:1.5
103 | - _reviews_:_products_ = 75:1
104 | 
105 | Data set size: 12000 reviews / 160 products / 8000 people
106 | 
107 | n.b. because we assign the reviews randomly some products may end up with no reviews, but this is relatively unlikely.
108 | 
109 | ### Graph Schema
110 | 
111 | PERSON(id: , style_preference: A|B, is_golden: True|False) -- WROTE(is_golden: True|False) --> REVIEW(id: , score: 1|0, is_golden: True|False) -- OF(is_golden: True|False) --> PRODUCT(id: , style: A|B, is_golden: True|False)
112 | 
113 | ### Data generation algorithm
114 | 
115 | 1) Instantiate all products for the public data set and write them to Neo, keeping an array of their ids.
116 | 1) Iteratively instantiate people, decide how many reviews that person will have made (probabilistically) 117 | 1) For each review that the person has to make randomly choose a product to review (without replacement) 118 | 1) Calculate the review score and submit the Person + their reviews to Neo 119 | 1) Read the data back out of neo and validate the entity ratios 120 | 1) Create the golden test sets: 121 | - NEW_PEOPLE: create 2000/reviews_per_person new people + their reviews of randomly selected (with replacement) existing products. 122 | - NEW_PRODUCTS: create 2000/reviews_per_product new products, have randomly selected (with replacement) people review them. 123 | - EXISTING randomly pick 2000 people (with replacement) have each of them review a randomly selected (with replacement) product 124 | - INDEPENDENT is easy, but best to leave till last to avoid confusion - just repeat the basic data generation from scratch 125 | 126 | 127 | -------------------------------------------------------------------------------- /bin/floyd-run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | floyd run \ 4 | --data davidmack/datasets/graph_experiments/1:/data \ 5 | --env tensorflow-1.4 \ 6 | --gpu \ 7 | --tensorboard \ 8 | --message "adj dense with dropout" \ 9 | "ENVIRONMENT=floyd python train.py \ 10 | --output-dir /output \ 11 | --data-dir /data/ \ 12 | --epochs 100" -------------------------------------------------------------------------------- /bin/start_neo4j_locally.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONTAINER_ID=$(docker run -d -e NEO4J_dbms_memory_heap_max__size=2000m --publish=7474:7474 --publish=7687:7687 --volume=$(pwd)/data/neo4j:/data neo4j:3.2.7) 4 | sleep 10 5 | docker run -it --net host neo4j:3.2.7 bin/cypher-shell -u neo4j -p neo4j "CALL dbms.changePassword('local neo hates security!')" 6 | 7 | echo "Neo4j running locally. To stop it: docker kill ${CONTAINER_ID}" -------------------------------------------------------------------------------- /config/.gitignore: -------------------------------------------------------------------------------- 1 | local* -------------------------------------------------------------------------------- /config/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from .environment import Environment 3 | 4 | # There are too many layers and too many files to this config system 5 | default_values = { 6 | 'neo4j_url': 'bolt://localhost', 7 | 'neo4j_user': 'neo4j', 8 | 'neo4j_password': 'local neo hates security!' 
9 | } 10 | 11 | environment_box = Environment(None) 12 | 13 | 14 | def set_environment(environment_name): 15 | environment_box.name = environment_name 16 | 17 | 18 | def get(config_variable_name): 19 | # don't execute code in overrides till necessary 20 | from .overrides import overrides 21 | return overrides[environment_box.name].get(config_variable_name, default_values[config_variable_name]) 22 | 23 | 24 | class Config(object): 25 | @property 26 | def neo4j_url(self): 27 | return get('neo4j_url') 28 | 29 | @property 30 | def neo4j_user(self): 31 | return get('neo4j_user') 32 | 33 | @property 34 | def neo4j_password(self): 35 | return get('neo4j_password') 36 | 37 | 38 | config: Config = Config() 39 | 40 | import os 41 | 42 | if 'ENVIRONMENT' not in os.environ: 43 | raise Exception("You must set an ENVIRONMENT variable. Sorry, I am very opinionated that we should not have a default value because it will mask misconfiguration issues later.") 44 | set_environment(os.environ['ENVIRONMENT']) -------------------------------------------------------------------------------- /config/environment.py: -------------------------------------------------------------------------------- 1 | class Environment(object): 2 | def __init__(self, name): 3 | self.name = name -------------------------------------------------------------------------------- /config/overrides.py: -------------------------------------------------------------------------------- 1 | import json 2 | overrides = dict() 3 | 4 | # There are too many layers and too many files to this config system 5 | overrides.update(**{ 6 | 'remote': { 7 | 'neo4j_url': 'bolt://796bafef-staging.databases.neo4j.io', 8 | 'neo4j_user': 'readonly', 9 | 'neo4j_password': '0s3DGA6Zq' 10 | }, 11 | 'floyd': { # Todo: implement me? 
12 | 'neo4j_url': 'bolt://796bafef-staging.databases.neo4j.io', 13 | 'neo4j_user': 'readonly', 14 | 'neo4j_password': '0s3DGA6Zq' 15 | }, 16 | 'local': { # Just uses defaults 17 | 18 | } 19 | }) 20 | 21 | with open('./config/local_overrides.json') as f: 22 | overrides.update(json.load(f)) -------------------------------------------------------------------------------- /data_sets/synthetic_review_prediction/article_0/__init__.py: -------------------------------------------------------------------------------- 1 | from .configure import DATASET_NAME, create_data_set_properties 2 | from .generate import run as _run 3 | 4 | 5 | def run(client): 6 | print(DATASET_NAME) 7 | return _run(client, create_data_set_properties()) 8 | -------------------------------------------------------------------------------- /data_sets/synthetic_review_prediction/article_0/configure.py: -------------------------------------------------------------------------------- 1 | from ..meta_classes import DataSetProperties 2 | from ..meta_classes.data_set_properties import PersonStyleWeightDistribution, PersonStyleWeight, ProductStyleWeight 3 | from ..utils import WeightedOption, Distribution 4 | from ..classes import PersonStylePreferenceEnum, ProductStyleEnum, Style 5 | from ..experiment_1.opinion_function import opinion_function 6 | from ..experiment_1.style_functions import person_style_function, product_style_function 7 | from graph_io.classes.dataset_name import DatasetName 8 | 9 | DATASET_NAME = DatasetName('article_0') 10 | 11 | 12 | def create_data_set_properties() -> DataSetProperties: 13 | N_STYLES = 2 14 | styles = [Style(str(i)) for i in range(N_STYLES)] 15 | 16 | for style in styles: 17 | ProductStyleEnum.register('LIKES_STYLE_'+style.value, style) 18 | PersonStylePreferenceEnum.register('HAS_STYLE_'+style.value, style) 19 | 20 | data_set_properties = DataSetProperties( 21 | dataset_name=DATASET_NAME, 22 | n_reviews=20000, 23 | reviews_per_product=10, 24 | reviews_per_person_distribution=[ 25 | WeightedOption[int](1, 0.25), 26 | WeightedOption[int](2, 0.25), 27 | WeightedOption[int](3, 0.25), 28 | WeightedOption[int](4, 0.25) 29 | ], 30 | person_styles_distribution=PersonStyleWeightDistribution([ 31 | PersonStyleWeight(x, 1) for x in PersonStylePreferenceEnum.iterate() 32 | ]), 33 | product_styles_distribution=Distribution[ProductStyleWeight, ProductStyleEnum]([ 34 | ProductStyleWeight(x, 1) for x in ProductStyleEnum.iterate() 35 | ]), 36 | opinion_function=opinion_function, 37 | person_style_function=person_style_function, 38 | product_style_function=product_style_function, 39 | n_companies=0, 40 | person_company_number_of_relationships_distribution=[] 41 | ) 42 | 43 | return data_set_properties 44 | -------------------------------------------------------------------------------- /data_sets/synthetic_review_prediction/article_0/generate.py: -------------------------------------------------------------------------------- 1 | from ..classes import PersonWroteReview, ReviewOfProduct, IsGoldenFlag 2 | import random 3 | 4 | from ..meta_classes import DataSetProperties 5 | from ..experiment_1.simple_data_set import SimpleDataSet 6 | from ..utils import DatasetWriter 7 | from graph_io import QueryParams, CypherQuery 8 | 9 | 10 | def run(client, data_set_properties: DataSetProperties): 11 | 12 | with DatasetWriter(client, data_set_properties.dataset_name, {"is_golden",""}) as writer: 13 | 14 | writer.nuke_dataset() 15 | 16 | data_set: SimpleDataSet = SimpleDataSet(data_set_properties) 17 | 18 | def create_indexes(): 
19 | client.execute_cypher_write(CypherQuery("CREATE INDEX ON :NODE(id)"), QueryParams()) 20 | #client.execute_cypher_write(CypherQuery("CREATE INDEX ON :NODE(id, dataset_name)"), QueryParams()) 21 | pass 22 | 23 | create_indexes() 24 | 25 | for i, product in enumerate(data_set.generate_public_products()): 26 | writer.create_node_if_not_exists(product, {"style"}) 27 | 28 | for i, person in enumerate(data_set.generate_public_people()): 29 | writer.create_node_if_not_exists(person, {"style_preference"}) 30 | 31 | for review in data_set.generate_reviews(person): 32 | review.test = random.random() <= 0.1 33 | writer.create_node_if_not_exists(review, {"score", "test"}) 34 | writer.create_edge_if_not_exists(PersonWroteReview(review.by_person, review.id, IsGoldenFlag(False)), set()) 35 | writer.create_edge_if_not_exists(ReviewOfProduct(review.id, review.of_product, IsGoldenFlag(False)), set()) 36 | 37 | 38 | -------------------------------------------------------------------------------- /data_sets/synthetic_review_prediction/utils/dataset_writer.py: -------------------------------------------------------------------------------- 1 | from graph_io import SimpleNodeClient, CypherQuery, QueryParams 2 | from ..classes import GraphNode, GraphEdge, IsGoldenFlag 3 | from graph_io.classes.dataset_name import DatasetName 4 | from typing import Set, AnyStr 5 | from multiprocessing.pool import ThreadPool 6 | from multiprocessing.queues import Queue 7 | from uuid import UUID 8 | 9 | 10 | class DatasetWriter(object): 11 | ADDITIONAL_NODE_PROPERTIES: Set[AnyStr] = {'id'} 12 | 13 | def __init__(self, 14 | client: SimpleNodeClient, 15 | dataset_name: DatasetName, 16 | properties_to_ignore: Set[str] = set() 17 | ): 18 | self.properties_to_ignore = properties_to_ignore 19 | self.dataset_name = dataset_name 20 | self._client = client 21 | self.pool = ThreadPool(1) 22 | 23 | def __enter__(self): 24 | # TODO: do query batching with a buffer etc. to increase performance 25 | return self 26 | 27 | def __exit__(self, exc_type, exc_val, exc_tb): 28 | self._client.run_batch() 29 | # TODO: on non error exits wait until the buffer has all flushed 30 | pass 31 | 32 | def nuke_dataset(self): 33 | query = CypherQuery("MATCH (n:NODE {dataset_name: $dataset_name}) DETACH DELETE n") 34 | self._client.execute_cypher_write(query, QueryParams(dataset_name=self.dataset_name)) 35 | 36 | def create_node_if_not_exists(self, node: GraphNode, properties: Set[AnyStr]): # TODO: define properties on the node entity itself? 
37 | properties = properties.union(self.ADDITIONAL_NODE_PROPERTIES) 38 | 39 | query_params = self._get_properties_for_query(node, properties) 40 | 41 | create_query = CypherQuery(f"MERGE (n:{node.label_string} {query_params.query_string} )") 42 | 43 | result = self._client.add_to_batch(create_query, query_params) 44 | # TODO: check that result wasn't an error 45 | 46 | print("merged node", query_params._params, result) 47 | 48 | def create_edge_if_not_exists(self, edge: GraphEdge, properties: Set[AnyStr]): 49 | _from = edge._from 50 | _to = edge._to 51 | 52 | query_params = self._get_properties_for_query(edge, properties) 53 | 54 | match = f"MATCH (from:{_from.label_string} {{ id: $from_id }}), (to:{_to.label_string} {{ id: $to_id }})" 55 | merge = f"MERGE (from)-[r:{edge.relationship} {query_params.query_string} ]->(to)" 56 | 57 | create_query = CypherQuery(match + "\n" + merge) 58 | query_params = query_params.union(QueryParams(from_id=str(_from.id.value), to_id=str(_to.id.value))) 59 | 60 | result = self._client.add_to_batch(create_query, query_params) 61 | print("merged edge", query_params._params, result) 62 | 63 | def _get_properties_for_query(self, node, properties, prefix=None): 64 | properties.add('is_golden') 65 | 66 | properties_dict = { 67 | name if not prefix else f"{prefix}_{name}": getattr(node, name) for name in properties if name not in self.properties_to_ignore 68 | } 69 | 70 | query_params = QueryParams(dataset_name=self.dataset_name, **properties_dict) 71 | return query_params 72 | -------------------------------------------------------------------------------- /experiment/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .experiment import Experiment 3 | from .experiment_header import ExperimentHeader 4 | from .directory import directory -------------------------------------------------------------------------------- /experiment/arguments.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | 4 | from .directory import directory, default_experiment 5 | 6 | class Arguments(object): 7 | def parse(): 8 | 9 | parser = argparse.ArgumentParser() 10 | 11 | parser.add_argument('--experiment', type=str, default=default_experiment, choices=directory.keys()) 12 | parser.add_argument('--dataset-name', type=str, default=None) 13 | 14 | 15 | parser.add_argument('--batch_size', type=int, default=32) 16 | parser.add_argument('--epochs', type=int, default=None) 17 | parser.add_argument('--random-seed', type=int, default=None) 18 | parser.add_argument('--verbose', type=int, default=1) 19 | 20 | parser.add_argument('--golden', action='store_true') 21 | parser.add_argument('--not-lazy', dest='lazy', action='store_false') 22 | parser.add_argument('--no-say', dest='say_result', action='store_false') 23 | parser.add_argument('--load-weights', action='store_true') 24 | parser.add_argument('--print-weights', action='store_true') 25 | parser.add_argument('--custom-test', action='store_true') 26 | 27 | parser.add_argument('--output-dir', type=str, default="./output") 28 | parser.add_argument('--data-dir', type=str, default="./data") 29 | 30 | return parser.parse_args() 31 | -------------------------------------------------------------------------------- /experiment/directory.py: -------------------------------------------------------------------------------- 1 | from data_sets import * 2 | from basic_types import NanoType 3 | 4 | from .experiment_header import ExperimentHeader 5 | 6 
| shared_query = { 7 | "product_and_product_subgraph": """ 8 | MATCH p= 9 | (a:PERSON {is_golden:{golden}, dataset_name:{dataset_name}}) 10 | -[:WROTE {is_golden:{golden}, dataset_name:{dataset_name}}]-> 11 | (b:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}}) 12 | -[:OF {is_golden:{golden}, dataset_name:{dataset_name}}]-> 13 | (product:PRODUCT {is_golden:{golden}, dataset_name:{dataset_name}}) 14 | 15 | WITH 16 | product, 17 | COLLECT(p) as neighbors 18 | 19 | RETURN 20 | product, 21 | neighbors 22 | 23 | """ 24 | 25 | } 26 | 27 | directory = { 28 | "review_from_visible_style": ExperimentHeader( 29 | """ 30 | A simple baseline experiment. 31 | 32 | From a person's style preference and a product's style, predict review score. 33 | 34 | review_score = dot(style_preference, product_style) 35 | """, 36 | EXPERIMENT_2_DATASET, 37 | """MATCH p= 38 | (a:PERSON {is_golden:{golden}, dataset_name:{dataset_name}}) 39 | -[:WROTE {is_golden:{golden}, dataset_name:{dataset_name}}]-> 40 | (b:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}}) 41 | -[:OF {is_golden:{golden}, dataset_name:{dataset_name}}]-> 42 | (c:PRODUCT {is_golden:{golden}, dataset_name:{dataset_name}}) 43 | RETURN a.style_preference AS style_preference, c.style AS style, b.score AS score 44 | """, 45 | float 46 | ), 47 | 48 | 49 | "review_from_hidden_style_neighbor_conv": ExperimentHeader( 50 | """ 51 | A simple experiment requiring the ML system to aggregate information from a sub-graph 52 | 53 | Predict a person's score for a product, given a person's style preference and the product 54 | 55 | This needs to be able to take in the review graph for a product 56 | and infer the product's style based on the style_preference and scores other people gave the product. 57 | 58 | Plan for the network (assume 1 hot encoding for categorical variables): 59 | 60 | For a product (product): 61 | For a person (person): 62 | 63 | - get array of N other people's reviews: [other_person.style_preference, score] x N 64 | - Apply 1d_convolution output: [product_style] x N 65 | - Apply average across N, output: [product_style] 66 | - Apply softmax, output: [product_style] 67 | - Concat with person, output: [product_style, person.style_preference] 68 | - Apply dense layer, activation sigmoid, output: [score] 69 | 70 | - Train that! 
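
		An illustrative Keras sketch of the plan above (not the code this experiment
		actually runs; N and the layer widths are placeholders, assuming one-hot widths of 2):

			from keras import layers, models

			N, STYLE_WIDTH = 50, 2
			neighbors = layers.Input(shape=(N, STYLE_WIDTH + 1))  # [other style_preference, score] x N
			person = layers.Input(shape=(STYLE_WIDTH,))           # target person's style_preference

			x = layers.Conv1D(STYLE_WIDTH, 1)(neighbors)          # per-review estimate of product_style
			x = layers.GlobalAveragePooling1D()(x)                # average across N
			x = layers.Activation('softmax')(x)
			x = layers.Concatenate()([x, person])
			score = layers.Dense(1, activation='sigmoid')(x)

			model = models.Model(inputs=[neighbors, person], outputs=score)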
71 | 72 | """, 73 | EXPERIMENT_2_DATASET, 74 | """ 75 | MATCH (a:PERSON) 76 | -[e1:WROTE ]-> 77 | (b:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}}) 78 | -[e2:OF ]-> 79 | (c:PRODUCT), 80 | others= 81 | (other_person:PERSON {is_golden:{golden}, dataset_name:{dataset_name}}) 82 | -[:WROTE {is_golden:{golden}, dataset_name:{dataset_name}}]-> 83 | (other_review:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}}) 84 | -[:OF {is_golden:{golden}, dataset_name:{dataset_name}}]-> 85 | (c) 86 | WHERE other_person<>a AND other_review<>b 87 | WITH 88 | a,b,c, 89 | e1,e2, 90 | COLLECT(others) as neighbors 91 | WHERE a.dataset_name={dataset_name} AND a.is_golden={golden} 92 | AND b.dataset_name={dataset_name} AND b.is_golden={golden} 93 | AND c.dataset_name={dataset_name} AND c.is_golden={golden} 94 | AND e1.dataset_name={dataset_name} AND e1.is_golden={golden} 95 | AND e2.dataset_name={dataset_name} AND e2.is_golden={golden} 96 | RETURN 97 | a.style_preference AS style_preference, 98 | b.score AS score, 99 | neighbors 100 | 101 | """, 102 | float 103 | ), 104 | 105 | "review_from_all_hidden_simple_unroll": ExperimentHeader( 106 | """ 107 | # Objective 108 | 109 | Learn a function `score(input_person, input_product)` that gives a product review 110 | given a person and a product. 111 | 112 | ## Input format 113 | 114 | People, reviews and products are essentially anonymous and defined by their relationship 115 | to each-other. 116 | 117 | Our network needs to take in a portion of the graph then output the predicted score. 118 | 119 | The graph is transformed and formatted in a consistent fashion, allowing the network 120 | to understand which person and product is being input. 121 | 122 | # Solution 123 | 124 | Allow the network to find look-a-likes by generating array of person-product-person-product-person chains 125 | 126 | E.g. If me and my lookalike both liked product X, then we'll agree for product Y 127 | 128 | This has a limitation that it can only successfully predict a score of there happens to be someone 129 | with the same style_preference who has reviewed a product you have also reviewed. 130 | 131 | """, 132 | EXPERIMENT_4_DATASET, 133 | """ 134 | MATCH g=(input_person:PERSON) 135 | -[:WROTE]-> 136 | (target_review:REVIEW {dataset_name:{dataset_name}}) 137 | -[:OF]-> 138 | (input_product:PRODUCT) 139 | <-[:OF]- 140 | (review1:REVIEW) 141 | <-[:WROTE]- 142 | (person2:PERSON) 143 | -[:WROTE]-> 144 | (review2:REVIEW) 145 | -[:OF]-> 146 | (product2:PRODUCT) 147 | <-[:OF]- 148 | (review3:REVIEW) 149 | <-[:WROTE]- 150 | (input_person) 151 | 152 | WHERE 153 | input_person<>person2 154 | AND input_product<>product2 155 | 156 | RETURN 157 | target_review.score as score, 158 | COLLECT([1.0, review1.score, review2.score, review3.score])[0..50] as neighbors, 159 | 160 | // These two need to be here otherwise the query implicitly groups by score 161 | input_product.id, 162 | input_person.id 163 | 164 | """, 165 | float, 166 | { 167 | "neighbor_count":50 168 | } 169 | ), 170 | 171 | "review_from_all_hidden_random_walks": ExperimentHeader( 172 | """ 173 | Let's try to do a RNN that operates on pieces of the graph 174 | Generate random walks. 175 | 176 | This is a great problem because it requires the network to find a specific 177 | shape of subgraph in order to answer the question. 
178 | 179 | It needs to find a loop, with 1s on the review scores, like such: 180 | 181 | (REVIEW=1) --> (PRODUCT) <-- (REVIEW=1) <-- (PERSON_B) 182 | ↑ | 183 | | ↓ 184 | (PERSON_A) --> (THE_REVIEW) --> (PRODUCT) <-- (REVIEW=1) 185 | 186 | 187 | # Idea 188 | 189 | What if the parameters define a shape the network wants to look for? 190 | 191 | That's the solution to this problem and could be useful for other problems, 192 | particularly since the magic of neural networks lets you define a noise-resiliant 193 | function, and an ensemble of shapes. 194 | 195 | Let: 196 | 197 | const string_length = 9 198 | pattern:List[part] = |----|-----|-----|----| ==> Convolve 1D with path 199 | path:List[part] = (a)-->(b)-->(c)-->(d) 200 | part = (type, parameter_values, is_target) | Loop | None 201 | target_type = "REVIEW" 202 | 203 | ## Algorithm 204 | 205 | 1) For each node of type=target_type: 206 | 1.a) Generate all paths s.t. |path| <= string_length 207 | 1.b) If a path is cyclic it should have a 'Loop' element after the nodes 208 | 2) Feed to network ([path, ..., path], target_review_score) 209 | 3) Network performs 1D convolution of each path with pattern kernel (The overflow of the kernel should wrap around the input path) 210 | 4) Network performs a 1D convolution on those outputs 211 | 5) Network sums those values 212 | 6) Network applies a dense layer, thus outputting y_prediction 213 | 214 | 215 | """, 216 | EXPERIMENT_4_DATASET, 217 | """ 218 | MATCH p= 219 | (review:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}}) 220 | -[*8]- 221 | (otherB) 222 | WHERE review.id={id} 223 | WITH 224 | review, 225 | COLLECT(p)[0..600] as neighbors 226 | RETURN 227 | review, 228 | neighbors 229 | """, 230 | float, 231 | { 232 | "generate_address": False, 233 | "target_dropout": 0.0, 234 | "memory_size": 1000, 235 | "word_size": 4, 236 | "sequence_size": 600, 237 | "patch_width": 7, 238 | "patch_size": 20, 239 | "epochs": 20, 240 | "repeat_batch": 1, 241 | "working_width": 64, 242 | "id_limit": 32 * 10 243 | }, 244 | ["id_limit"] 245 | ), 246 | 247 | "review_from_all_hidden_adj": ExperimentHeader( 248 | """ 249 | Try the following: 250 | - variable pr represents PRODUCT style vectors 251 | - variable pe represents PERSON preference vectors 252 | - x = adj matrix of PRODUCT-REVIEW-PERSON 253 | - y = adj matrix of same with REVIEW.score as the weights 254 | - Use optimizer to optimize the style/pref vectors such that: Dot(MatMul(pr, T(pe)), x) = y 255 | 256 | 257 | """, 258 | EXPERIMENT_5_DATASET, 259 | """ 260 | MATCH p= 261 | (person:PERSON) --> 262 | (review:REVIEW {is_golden:{golden}, dataset_name:{dataset_name}}) --> 263 | (product:PRODUCT) 264 | RETURN 265 | person.id as person_id, review.score as score, product.id as product_id 266 | """, 267 | "adj_equals", 268 | { 269 | "product_count": 160, # total 160 270 | "person_count": 1200, # total 1200 271 | "style_width": 12, 272 | "epochs": 10000, 273 | "batch_per_epoch": 10 274 | } 275 | ), 276 | 277 | "style_from_neighbor_conv": ExperimentHeader( 278 | """ 279 | A precursor to review_from_hidden_style_neighbor_conv 280 | 281 | This experiment seeks to see if we can efficiently determine a product's style 282 | given it's set of reviews and the style_preference of each reviewer. 283 | 284 | This should be easy!! 
285 | 286 | """, 287 | EXPERIMENT_2_DATASET, 288 | shared_query["product_and_product_subgraph"], 289 | list, 290 | ), 291 | 292 | "style_from_neighbor_rnn": ExperimentHeader( 293 | """ The same as style_from_neighbor_conv but using an RNN instead of convolution """, 294 | EXPERIMENT_2_DATASET, 295 | shared_query["product_and_product_subgraph"], 296 | list 297 | ) 298 | 299 | } 300 | 301 | default_experiment = "review_from_all_hidden_adj" 302 | 303 | 304 | -------------------------------------------------------------------------------- /experiment/experiment.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime 3 | from colorama import init, Fore, Style 4 | import logging 5 | import coloredlogs 6 | import colored_traceback.auto 7 | import os 8 | 9 | from graph_ml import Train, Dataset 10 | from .arguments import Arguments 11 | from .directory import directory 12 | 13 | 14 | init() 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class Experiment(object): 19 | def __init__(self, name, header, params): 20 | self.name = name 21 | self.header = header 22 | self.params = params 23 | self.run_tag = str(datetime.now()) 24 | 25 | @classmethod 26 | def run(cls): 27 | 28 | params = Arguments.parse() 29 | 30 | if params.verbose > 0: 31 | coloredlogs.install(level='INFO', logger=logging.getLogger("experiment")) 32 | coloredlogs.install(level='INFO', logger=logging.getLogger("graph_ml")) 33 | coloredlogs.install(level='INFO', logger=logging.getLogger("graph_io")) 34 | 35 | experiment = Experiment(params.experiment, directory[params.experiment], params) 36 | 37 | print(Fore.GREEN) 38 | print("#######################################################################") 39 | print(f"📟 Running experiment {experiment.name} {experiment.run_tag}") 40 | print("#######################################################################") 41 | print(Style.RESET_ALL) 42 | 43 | dataset = Dataset.get(experiment) 44 | score = Train.run(experiment, dataset) 45 | 46 | print(Fore.YELLOW) 47 | print("#######################################################################") 48 | print("Experiment results") 49 | print(f"{experiment.name} test loss {round(score[0],6)}") 50 | print(f"{experiment.name} test accuracy {round(score[1])}%") 51 | print("#######################################################################") 52 | print(Style.RESET_ALL) 53 | 54 | # t = '-title {!r}'.format(title) 55 | # s = '-subtitle {!r}'.format(subtitle) 56 | # m = '-message {!r}'.format(message) 57 | os.system(f"terminal-notifier -message 'test accuracy {round(score[1]*100)}% loss {round(score[0],2)}' -title Octavian") 58 | 59 | if params.say_result: 60 | os.system(f"say test accuracy {round(score[1]*100)} percent") 61 | 62 | -------------------------------------------------------------------------------- /experiment/experiment_header.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import List 3 | from graph_io.classes import DatasetName 4 | 5 | class ExperimentHeader(object): 6 | def __init__(self, doc="", dataset_name: DatasetName=None, cypher_query=None, target=None, params={}, lazy_params:List[str]=[]): 7 | # Jesus I have to spell this out?! 8 | # WTF are the python language devs doing?! 
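# Fields: doc is the experiment's markdown description, dataset_name the DatasetName to query,
# cypher_query the Cypher used to pull training rows, target the output type (float, list or
# "adj_equals") used when compiling the model, params a dict of per-experiment settings, and
# lazy_params the subset of params baked into the dataset cache path (see graph_ml/dataset.py).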
9 | self.dataset_name = dataset_name 10 | self.doc = doc 11 | self.cypher_query = cypher_query 12 | self.target = target 13 | self.params = params 14 | self.lazy_params = lazy_params -------------------------------------------------------------------------------- /floyd_requirements.txt: -------------------------------------------------------------------------------- 1 | neo4j-driver 2 | lazy 3 | h5py 4 | colorama 5 | coloredlogs 6 | more-itertools 7 | git+git://github.com/datalogai/recurrentshop.git#egg=recurrentshop 8 | colored-traceback 9 | sklearn 10 | tqdm -------------------------------------------------------------------------------- /graph_ml/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .train import Train 3 | from .dataset import Dataset 4 | from .ntm import NTMBase 5 | -------------------------------------------------------------------------------- /graph_ml/adjacency_layer.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | import tensorflow as tf 3 | from keras.engine.topology import Layer, Input 4 | from keras import regularizers, initializers, layers, activations 5 | from functools import partial 6 | import numpy as np 7 | 8 | class PD(regularizers.Regularizer): 9 | def __init__(self, a=0.0001, b=0.0, axis=-1): 10 | self.a = K.cast_to_floatx(a) 11 | self.b = K.cast_to_floatx(b) 12 | 13 | self.axis = axis 14 | 15 | def __call__(self, x): 16 | sum_to_one = K.abs(1.0 - K.sum(K.abs(x), axis=self.axis)) 17 | different_by_one = K.abs(1.0 - K.abs(x[:,0] - x[:,1])) 18 | core = self.a * sum_to_one + self.b * different_by_one 19 | 20 | return K.sum(core) 21 | 22 | def get_config(self): 23 | return {'a': float(self.a), 'b': float(self.b)} 24 | 25 | 26 | class Clip(regularizers.Regularizer): 27 | def __init__(self, max=1): 28 | self.max = max 29 | 30 | def __call__(self, x): 31 | K.clip(x, min_value=-1, max_value=1) 32 | 33 | def get_config(self): 34 | return {'max': float(self.max)} 35 | 36 | 37 | class Adjacency(Layer): 38 | 39 | def __init__(self, person_count, product_count, style_width, **kwargs): 40 | self.person_count = person_count 41 | self.product_count = product_count 42 | self.style_width = style_width 43 | self.dense1 = layers.Dense(units=(style_width), activation=activations.softplus, use_bias=False, kernel_regularizer=Clip) 44 | #self.dense2 = layers.(units=(1), activation=activations.linear) 45 | self.dense3 = layers.Dense(units=1, activation=partial(activations.relu, alpha=0.1), use_bias=False, kernel_regularizer=Clip) 46 | super(Adjacency, self).__init__(**kwargs) 47 | 48 | def __call__(self, inputs, **kwargs): 49 | self.batch_size = inputs.shape[0] 50 | product_ct = inputs.shape[1] 51 | person_ct = inputs.shape[2] 52 | my_batch = product_ct * person_ct 53 | 54 | self.inner_input = Input(batch_shape=(product_ct, person_ct, 2, self.style_width), dtype='float32', name="inner_d0") 55 | self.reshaped_to_look_like_a_batch = K.reshape(self.inner_input, (product_ct * person_ct, 2 * self.style_width)) 56 | self.dense1_called = self.dense1(self.reshaped_to_look_like_a_batch) 57 | #self.dense2_called = self.dense2(self.dense1_called) 58 | self.dense3_called = self.dense3(self.dense1_called) 59 | self.reshaped_to_look_like_adj_mat = K.reshape(self.dense3_called, (product_ct, person_ct, 1)) 60 | return super(Adjacency, self).__call__(inputs, **kwargs) 61 | 62 | def cartesian_product_matrix(self, a, b): 63 | tile_a = tf.tile(tf.expand_dims(a, 1), [1, 
tf.shape(b)[0], 1]) 64 | tile_a = tf.expand_dims(tile_a, 2) 65 | 66 | tile_b = tf.tile(tf.expand_dims(b, 0), [tf.shape(a)[0], 1, 1]) 67 | tile_b = tf.expand_dims(tile_b, 2) 68 | 69 | cartesian_product = tf.concat([tile_a, tile_b], axis=-1) 70 | 71 | return cartesian_product 72 | 73 | 74 | 75 | def build(self, input_shape): 76 | # Create a trainable weight variable for this layer. 77 | self.person = self.add_weight(name='people', 78 | shape=(self.person_count, self.style_width), 79 | initializer='uniform', 80 | # initializer='ones', 81 | # regularizer=PD(), 82 | trainable=True) 83 | 84 | self.product = self.add_weight(name='product', 85 | shape=(self.product_count, self.style_width), 86 | initializer='uniform', 87 | # initializer='ones', 88 | # regularizer=PD(), 89 | trainable=True) 90 | 91 | 92 | # self.wc1 = self.add_weight(name='w1', 93 | # shape=(2, 1), 94 | # initializer='glorot_uniform', 95 | # trainable=True) 96 | 97 | # self.b1 = self.add_weight(name='b1', 98 | # shape=(1, ), 99 | # initializer='zero', 100 | # trainable=True) 101 | 102 | self.w1 = self.add_weight(name='w1', 103 | shape=(2 * self.style_width, 104 | self.style_width), 105 | initializer='glorot_uniform', 106 | trainable=True) 107 | 108 | # self.b1 = self.add_weight(name='b1', 109 | # shape=(self.style_width, ), 110 | # initializer='zero', 111 | # trainable=True) 112 | 113 | self.w2 = self.add_weight(name='w2', 114 | shape=(self.style_width, 1), 115 | initializer='glorot_uniform', 116 | trainable=True) 117 | 118 | # self.b2 = self.add_weight(name='b2', 119 | # shape=(1, ), 120 | # initializer='zero', 121 | # trainable=True) 122 | 123 | 124 | # self.b3 = self.add_weight(name='b2', 125 | # shape=(1,), 126 | # initializer='zero', 127 | # trainable=True) 128 | 129 | # self.w3 = self.add_weight(name='m2', 130 | # shape=(1,), 131 | # initializer='one', 132 | # trainable=True) 133 | 134 | 135 | super(Adjacency, self).build(input_shape) # Be sure to call this somewhere! 
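# What build() registers: self.person and self.product are trainable style/preference embeddings
# (one row per entity), and w1/w2 are the weights of the small MLP that call_dense() applies to
# every (product, person) pair. Because the MLP output is masked by the input adjacency matrix x
# before being compared to the score-weighted adjacency y, training effectively factorises the
# review-score matrix into per-entity style vectors (the Dot(MatMul(pr, T(pe)), x) = y idea from
# the review_from_all_hidden_adj header).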
136 | 137 | def jitter(self, idx=[0,1], var=0.2): 138 | wts = self.get_weights() 139 | 140 | for i in idx: 141 | wts[i] += np.random.normal(0, var, wts[i].shape) 142 | 143 | self.set_weights(wts) 144 | 145 | def call(self, x): 146 | return self.call_dense(x) 147 | 148 | # 100pc test accuracy 149 | def call_dot_softmax(self, x): 150 | pr = self.product 151 | pe = self.person 152 | 153 | pr = K.softmax(self.product) 154 | pe = K.softmax(self.person) 155 | 156 | m = K.dot(pr, K.transpose(pe)) 157 | m = (self.w3 * m) + self.b3 158 | m = K.relu(m, alpha=0.1) 159 | 160 | m = m * x 161 | 162 | return m 163 | 164 | # 100pc test accuracy 165 | def call_dot(self, x): 166 | pr = self.product 167 | pe = self.person 168 | 169 | m = K.dot(pr, K.transpose(pe)) 170 | m = m * x 171 | 172 | return m 173 | 174 | # Seen at 68% 1-accuracy test 175 | def call_dense(self, x): 176 | self.jitter(idx=[0,1], var=0.1) 177 | 178 | pr = self.product 179 | pe = self.person 180 | 181 | pr = K.softmax(pr) 182 | pe = K.softmax(pe) 183 | 184 | all_pairs = self.cartesian_product_matrix(pr, pe) 185 | flat = K.reshape(all_pairs, (self.product_count * self.person_count, self.style_width * 2)) 186 | 187 | m = K.dot(flat, self.w1) 188 | # m = K.bias_add(m, self.b1) 189 | m = K.relu(m, alpha=0.1) 190 | 191 | m = K.dropout(m, level=0.1) 192 | 193 | m = K.dot(m, self.w2) 194 | m = K.relu(m, alpha=0.1) 195 | 196 | m = K.reshape(m, (1, self.product_count, self.person_count)) 197 | masked = m * x 198 | return masked 199 | 200 | 201 | 202 | # 100pc test accuracy 203 | def call_dense_conv(self, x): 204 | self.jitter(idx=[0,1]) 205 | 206 | pr = self.product 207 | pe = self.person 208 | 209 | pr = K.softmax(pr) 210 | pe = K.softmax(pe) 211 | 212 | all_pairs = self.cartesian_product_matrix(pr, pe) 213 | 214 | flat = K.reshape(all_pairs, (self.product_count * self.person_count * self.style_width, 2)) 215 | m = K.dot(flat, self.wc1) 216 | m = K.tanh(m) 217 | 218 | m = K.reshape(m, (self.product_count * self.person_count, self.style_width)) 219 | m = K.dot(m, self.w2) 220 | m = K.relu(m, alpha=0.1) 221 | 222 | m = K.reshape(m, (1, self.product_count, self.person_count)) 223 | masked = m * x 224 | return masked 225 | 226 | 227 | 228 | def compute_output_shape(self, input_shape): 229 | return input_shape 230 | 231 | 232 | -------------------------------------------------------------------------------- /graph_ml/dataset.py: -------------------------------------------------------------------------------- 1 | 2 | from collections import Counter, namedtuple 3 | import random 4 | import pickle 5 | import os.path 6 | import hashlib 7 | import neo4j 8 | import math 9 | from typing import Callable, Generator, Tuple 10 | import logging 11 | import itertools 12 | from itertools import cycle 13 | import more_itertools 14 | from more_itertools import peekable 15 | 16 | import keras 17 | import numpy as np 18 | from keras.preprocessing import text 19 | from keras.utils import np_utils 20 | 21 | from .path import generate_output_path, generate_data_path 22 | from graph_io import * 23 | # from experiment import Experiment 24 | from .util import * 25 | from .dataset_helpers import * 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | class Dataset(object): 31 | 32 | 33 | # Applies a per-experiment recipe to Neo4j to get a dataset to train on 34 | # This performs all transformations in-memory - it is not very efficient 35 | @classmethod 36 | def get(cls, experiment): 37 | 38 | # TODO: delete this 39 | legacy_recipes = { 40 | 
'review_from_visible_style': Recipe( 41 | split=lambda row: Point(np.concatenate((row['style_preference'], row['style'])), row['score']) 42 | ), 43 | 'review_from_hidden_style_neighbor_conv': Recipe( 44 | split=DatasetHelpers.review_from_hidden_style_neighbor_conv(100), 45 | finalize_x=lambda x: {'person':np.array([i['person'] for i in x]), 'neighbors': np.array([i['neighbors'] for i in x])} 46 | ), 47 | 'style_from_neighbor_conv': Recipe( 48 | split=DatasetHelpers.style_from_neighbor(100) 49 | ), 50 | 'style_from_neighbor_rnn': Recipe( 51 | split=DatasetHelpers.style_from_neighbor(100) 52 | ) 53 | } 54 | 55 | try: 56 | recipe = legacy_recipes[experiment.name] 57 | except: 58 | # TODO: move all to this pattern 59 | recipe = getattr(DatasetHelpers, experiment.name)(experiment) 60 | 61 | 62 | return Dataset(experiment, recipe) 63 | 64 | 65 | 66 | # Split data into test/train set, organise it into a class 67 | def __init__(self, experiment, recipe): 68 | 69 | self.experiment = experiment 70 | self.recipe = recipe 71 | 72 | if experiment.params.random_seed is not None: 73 | random.seed(experiment.params.random_seed) 74 | 75 | if experiment.params.dataset_name is not None: 76 | dataset_name = experiment.params.dataset_name 77 | else: 78 | dataset_name = experiment.header.dataset_name 79 | 80 | query_params = QueryParams( 81 | golden=experiment.params.golden, 82 | dataset_name=dataset_name, 83 | experiment=experiment.name) 84 | 85 | query_params.update(QueryParams(**experiment.header.params)) 86 | 87 | # Calculate params for lazy data loading 88 | data_path_params = {i:query_params[i] for i in experiment.header.lazy_params} 89 | data_path_params["dataset_name"] = dataset_name 90 | 91 | dataset_file = generate_data_path(experiment, '.pkl', data_path_params) 92 | logger.info(f"Dataset file {dataset_file}") 93 | 94 | if os.path.isfile(dataset_file) and experiment.params.lazy: 95 | logger.info(f"Opening dataset pickle {dataset_file}") 96 | data = pickle.load(open(dataset_file, "rb")) 97 | 98 | else: 99 | logger.info("Querying data from database") 100 | with SimpleNodeClient() as client: 101 | cq = CypherQuery(experiment.header.cypher_query) 102 | data = recipe.query(client, cq, query_params) 103 | 104 | # Later shift to query-on-demand 105 | data = list(data) 106 | pickle.dump(data, open(dataset_file, "wb")) 107 | 108 | # We need to know total length of data, so for ease I've listed it here. 
109 | # I've used generators everywhere, so if it wasn't for Keras, this would 110 | # be memory efficient 111 | 112 | logger.info(f"Rows returned by Neo4j {len(data)}") 113 | list_data = list(recipe.transform(data)) 114 | total_data = len(list_data) 115 | logger.info(f"Number of rows of data: {total_data}") 116 | 117 | 118 | def repeat_infinitely(gen_fn): 119 | while True: 120 | for x in gen_fn(): 121 | yield x 122 | stream = repeat_infinitely(lambda: recipe.partition(recipe.transform(data))) 123 | 124 | def just(tag): 125 | return ( (i[1].x, i[1].y) for i in stream if i[0] == tag) 126 | 127 | def chunk(it, length): 128 | chunky = more_itertools.chunked(it, length) 129 | for i in chunky: 130 | xs = np.array([j[0] for j in i]) 131 | ys = np.array([j[1] for j in i]) 132 | yield (xs, ys) 133 | 134 | 135 | bs = experiment.params.batch_size 136 | 137 | self.train_generator = peekable(chunk(just("train"), bs)) 138 | self.validation_generator = peekable(chunk(just("validate"), bs)) 139 | self.test_generator = peekable(chunk(just("test"), bs)) 140 | 141 | self.generator = { 142 | "test": self.test_generator, 143 | "train": self.train_generator, 144 | "validate": self.validation_generator 145 | } 146 | 147 | f = self.train_generator.peek() 148 | # logger.info(f"First training item: x:{f[0].shape}, y:{f[1].shape}") 149 | 150 | # These are not exact counts since the data is randomly split at generation time 151 | self.validation_steps = math.ceil(total_data * 0.1 / experiment.params.batch_size) 152 | self.test_steps = math.ceil(total_data * 0.1 / experiment.params.batch_size) 153 | self.steps_per_epoch = math.ceil(total_data * 0.8 / experiment.params.batch_size) * int(experiment.header.params.get('repeat_batch', 1)) 154 | 155 | self.input_shape = self.train_generator.peek()[0][0].shape 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /graph_ml/dataset_helpers.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from collections import Counter, namedtuple 4 | import random 5 | import pickle 6 | import os.path 7 | import hashlib 8 | import neo4j 9 | import math 10 | from typing import Callable, Generator, Tuple 11 | import logging 12 | import itertools 13 | from itertools import cycle 14 | import more_itertools 15 | from more_itertools import peekable 16 | 17 | import keras 18 | import numpy as np 19 | from keras.preprocessing import text 20 | from keras.utils import np_utils 21 | 22 | from .path import generate_output_path, generate_data_path 23 | from graph_io import * 24 | # from experiment import Experiment 25 | from .util import * 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | 31 | class Point(object): 32 | def __init__(self, x, y): 33 | self.x = x 34 | self.y = y 35 | 36 | # This is weird, I know, re-write later when I'm making this more efficient 37 | def append(self, point): 38 | self.x.append(point.x) 39 | self.y.append(point.y) 40 | 41 | def __str__(self): 42 | return "{x:\n" + str(self.x) + ",\ny:\n" + str(self.y) + "}" 43 | 44 | def __repr__(self): 45 | return self.__str__() 46 | 47 | 48 | def noop(): 49 | pass 50 | 51 | RecordGenerator = Generator[neo4j.v1.Record, None, None] 52 | PointGenerator = Generator[Point, None, None] 53 | 54 | class Recipe: 55 | def __init__(self, 56 | transform:Callable[[RecordGenerator], PointGenerator] = None, 57 | query:Callable[[], RecordGenerator] = None, 58 | partition:Callable[[PointGenerator], Generator[Tuple[str, Point], None, 
None]] = None, 59 | split:Callable[[neo4j.v1.Record], Point] = None, 60 | finalize_x = None): 61 | 62 | self.transform = transform 63 | self.query = query 64 | self.partition = partition 65 | 66 | # TODO: migrate older experiments 67 | if transform is None: 68 | def legacy_transform(rows): 69 | for i in rows: 70 | p = split(i) 71 | p.x = finalize_x(p.x) if finalize_x else p.x 72 | yield p 73 | self.transform = legacy_transform 74 | 75 | if query is None: 76 | def default_query(client, cypher_query, query_params): 77 | return client.execute_cypher(cypher_query, query_params) 78 | 79 | self.query = default_query 80 | 81 | if partition is None: 82 | def default_partition(data): 83 | random.shuffle(data) 84 | c = 0 85 | for i in data: 86 | 87 | if c == 9: 88 | l = "test" 89 | elif c == 8: 90 | l = "validate" 91 | else: 92 | l = "train" 93 | 94 | c = (c + 1) % 10 95 | 96 | yield (l, i) 97 | self.partition = default_partition 98 | 99 | 100 | class DatasetHelpers(object): 101 | 102 | @staticmethod 103 | def ensure_length(arr, length): 104 | delta = length - arr.shape[0] 105 | if delta > 0: 106 | pad_shape = ((0,delta),) 107 | for i in range(len(arr.shape)-1): 108 | pad_shape += ((0, 0),) 109 | arr = np.pad(arr, pad_shape, 'constant', constant_values=0.0) 110 | elif delta < 0: 111 | arr = arr[:length] 112 | 113 | assert len(arr) == length, f"ensure_length failed to resize, {len(arr)} != {length}" 114 | 115 | return arr 116 | 117 | @staticmethod 118 | def path_map_style_preference_score(cls, path): 119 | other_person = path.nodes[0] 120 | other_review = path.nodes[1] 121 | return np.concatenate(( 122 | np.array(other_person.properties['style_preference']), 123 | [other_review.properties['score']] 124 | )) 125 | 126 | # Turn neighbors sub-graph into a sampled array of neighbours 127 | # @argument length What size of array should be returned. Use None for variable. If you request a fixed length, the first column of the feature is a 0.0/1.0 flag of where there is data or zeros in that feature row 128 | @classmethod 129 | def collect_neighbors(cls, row, key, path_map, length:int): 130 | subrows = [] 131 | for path in row[key]: 132 | subrows.append(path_map(path)) 133 | 134 | # Lets always shuffle to keep the network on its toes 135 | # If you use --random-seed you'll fix this to be the same each run 136 | np.random.shuffle(subrows) 137 | 138 | if length is not None: 139 | if len(subrows) > length: 140 | subrows = subrows[:length] 141 | 142 | subrows = np.pad(subrows, ((0,0), (1,0)), 'constant', constant_values=1.0) # add 'none' flag 143 | 144 | # pad out if too small 145 | # note if there are zero subrows, this won't know the width to make the zeros, so it'll be 1 wide and broadcast later 146 | if len(subrows) < length: 147 | delta = length - subrows.shape[0] 148 | subrows = np.pad(subrows, ((0,delta), (0, 0)), 'constant', constant_values=0.0) 149 | 150 | return subrows 151 | 152 | 153 | @classmethod 154 | def review_from_hidden_style_neighbor_conv(cls, length): 155 | def transform_row(row): 156 | neighbors = cls.collect_neighbors(row, 'neighbors', cls.path_map_style_preference_score) 157 | return Point({'person': np.array(row["style_preference"]), 'neighbors':neighbors}, row["score"]) 158 | return transform_row 159 | 160 | 161 | @classmethod 162 | def style_from_neighbor(cls, length): 163 | # Python you suck at developer productivity. 164 | # Seriously, coffeescript has all these things sorted out 165 | # Like no anonymous functions? Fuck you. 
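# transform_row closes over `length`: it flattens the product's review neighborhood into a
# fixed-size feature matrix via collect_neighbors (each row = reviewer style_preference plus
# review score, plus a presence flag) and pairs it with the product's hidden `style` property
# as the training target.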
166 | def transform_row(row): 167 | neighbors = cls.collect_neighbors(row, 'neighbors', cls.path_map_style_preference_score, length) 168 | return Point(neighbors, row["product"].properties["style"]) 169 | return transform_row 170 | 171 | 172 | @classmethod 173 | def review_from_all_hidden_simple_unroll(cls, experiment): 174 | def t(row): 175 | length = experiment.header.params["neighbor_count"] 176 | neighbors = np.array(row["neighbors"]) 177 | delta = length - neighbors.shape[0] 178 | 179 | if delta > 0: 180 | neighbors = np.pad(neighbors, ((0,delta), (0, 0)), 'constant', constant_values=0.0) 181 | 182 | return Point(neighbors, row["score"]) 183 | 184 | return Recipe(t) 185 | 186 | @staticmethod 187 | def review_from_all_hidden_random_walks(experiment): 188 | 189 | encode_label = { 190 | "NODE": [1,0,0,0,0], 191 | "PERSON": [0,1,0,0,0], 192 | "REVIEW": [0,0,1,0,0], 193 | "PRODUCT": [0,0,0,1,0], 194 | "LOOP": [0,0,0,0,1] 195 | } 196 | 197 | FakeNode = namedtuple('FakeNode', ['id', 'properties', 'labels']) 198 | loop_node = FakeNode(None, {}, set(['NODE', 'LOOP'])) 199 | 200 | def extract_label(l): 201 | return encode_label.get(list(set(l) - set('NODE'))[0], [1,0,0,0]) 202 | 203 | node_id_dict = {} 204 | 205 | def node_id_to_memory_addr(nid): 206 | 207 | if nid not in node_id_dict: 208 | node_id_dict[nid] = len(node_id_dict) % experiment.header.params['memory_size'] 209 | 210 | return node_id_dict[nid] 211 | 212 | def package_node(n, is_target=False): 213 | ms = experiment.header.params['memory_size'] 214 | 215 | if experiment.header.params["generate_address"]: 216 | address_trunc = node_id_to_memory_addr(n.id) 217 | address_one_hot = np.zeros(ms) 218 | address_one_hot[address_trunc] = 1.0 219 | else: 220 | address_one_hot = np.array([]) 221 | 222 | label = extract_label(n.labels) 223 | score = n.properties.get("score", -1.0) 224 | 225 | if random.random() < experiment.header.params["target_dropout"] or is_target: 226 | score = -1.0 227 | 228 | x = np.concatenate(([score, float(is_target)], label, address_one_hot)) 229 | 230 | return x 231 | 232 | 233 | def path_to_patch(node, path): 234 | ps = np.array([package_node(i, i.id == node.id) for i in path.nodes]) 235 | 236 | if path.nodes[0].id == path.nodes[-1].id: 237 | print("outputting loop_node for ", path.nodes[0].id, [i.id for i in path.nodes]) 238 | l = np.array([package_node(loop_node, False)]) 239 | np.append(ps, l, axis=0) 240 | 241 | ps = np.repeat(ps, 2, axis=0) 242 | 243 | patch_size = experiment.header.params["patch_size"] 244 | ps = DatasetHelpers.ensure_length(ps, patch_size) 245 | return ps 246 | 247 | 248 | def row_to_point(row): 249 | patch_size = experiment.header.params["patch_size"] 250 | seq_size = experiment.header.params["sequence_size"] 251 | 252 | neighbors = row["neighbors"] 253 | review = row["review"] 254 | 255 | x = np.array([path_to_patch(review, path) for path in neighbors]) 256 | x = DatasetHelpers.ensure_length(x, seq_size) 257 | # x = np.repeat(x, 3, axis=0) 258 | 259 | y = row["review"].properties.get("score", -1.0) 260 | # y = np.repeat([y], seq_size) 261 | # y = np.expand_dims(y, axis=-1) 262 | 263 | target_shape = (seq_size, patch_size, experiment.header.params["patch_width"]) 264 | assert x.shape == target_shape, f"{x.shape} != {target_shape}" 265 | 266 | return Point(x, y) 267 | 268 | def query(client, cypher_query, query_params): 269 | return client.execute_cypher_once_per_id( 270 | cypher_query, 271 | query_params, 272 | dataset_name=experiment.header.dataset_name, 273 | 
id_limit=experiment.header.params["id_limit"], 274 | id_type="REVIEW" 275 | ) 276 | 277 | def balance_classes(stream): 278 | # ugh arch pain 279 | # instead pass in an arg that is a callable stream generator 280 | 281 | classes = [0.0, 1.0] 282 | last = [None, None] 283 | 284 | # Over-sample 285 | # This is imperfectly balanced as it cold-starts without last values 286 | for i in stream: 287 | for index, c in enumerate(classes): 288 | if np.array([i.y]).flatten()[0] == c: 289 | last[index] = i 290 | yield i 291 | elif last[index] is not None: 292 | yield last[index] 293 | 294 | 295 | def transform(stream): 296 | # y_count = Counter() 297 | # y_count[str(y)] += 1 298 | # print(f"Counter of y values: {[(i, y_count[i] / len(list(y_count.elements())) * 100.0) for i in y_count]}") 299 | stream = (row_to_point(row) for row in stream) 300 | stream = balance_classes(stream) 301 | return stream 302 | 303 | return Recipe(transform=transform,query=query) 304 | 305 | @staticmethod 306 | def review_from_all_hidden_adj(experiment) -> Recipe: 307 | bs = experiment.params.batch_size 308 | person_product = {} 309 | 310 | reviews_per_person = Counter() 311 | reviews_per_product = Counter() 312 | 313 | pr_c = experiment.header.params["product_count"] 314 | pe_c = experiment.header.params["person_count"] 315 | 316 | shape = (pr_c, pe_c) 317 | unmasked_products=np.zeros(shape=(pr_c,)) 318 | unmasked_products[0] = 1 319 | unmasked_people=np.zeros(shape=(pe_c,)) 320 | cache = [] 321 | training_mask = np.zeros(shape) 322 | pause=[0] 323 | def gen_output(datas): 324 | for i in range(bs * experiment.header.params["batch_per_epoch"]): 325 | for partition, pt in datas.items(): 326 | if partition=="train": 327 | pe_flag = False 328 | pr_flag = False 329 | if pause[0] > 48: 330 | 331 | def do_product(): 332 | if not pr_flag: 333 | for x in range(pe_c): 334 | if unmasked_people[x] == 0 and any(pt.x[y][x] == 1 for y in range(pr_c) if unmasked_products[y] == 1): 335 | unmasked_people[x] = 1 336 | pe_flag = True 337 | break 338 | 339 | def do_person(): 340 | if not pe_flag: 341 | for y in range(pr_c): 342 | if unmasked_products[y] == 0 and any(pt.x[y][x] == 1 for x in range(pe_c) if unmasked_people[x] == 1): 343 | unmasked_products[y] = 1 344 | pr_flag = True 345 | break 346 | 347 | if random.random() > 0.5: 348 | do_product() 349 | else: 350 | do_person() 351 | 352 | if not pr_flag and not pe_flag: 353 | for x in range(pe_c): 354 | if unmasked_people[x] == 0: 355 | unmasked_people[x] = 1 356 | pe_flag = True 357 | break 358 | if not pe_flag: 359 | for y in range(pr_c): 360 | if unmasked_products[y] == 0: 361 | unmasked_products[y] = 1 362 | pr_flag = True 363 | break 364 | for x in range(pe_c): 365 | #TODO this is like a np.cross or something 366 | for y in range(pr_c): 367 | if unmasked_people[x] * unmasked_products[y] == 1: 368 | training_mask[y][x] = 1 369 | if not pe_flag and not pr_flag: 370 | assert np.sum(training_mask) == pr_c * pe_c 371 | print('all data') 372 | pause[0] = 0 373 | pause[0]+=1 374 | 375 | pt = Point(np.where(training_mask, pt.x, 0), np.where(training_mask, pt.y, 0)) 376 | #print(np.sum(pt.x)) 377 | #print(np.sum(pt.y)) 378 | yield (partition, pt) 379 | # yield Point(adj_con, adj_score) 380 | 381 | def transform(stream): 382 | if len(cache) == 1: 383 | return gen_output(cache[0]) 384 | 385 | data = list(stream) 386 | 387 | products = set() 388 | people = set() 389 | # Construct adjacency dict 390 | for i in data: 391 | if i["person_id"] not in person_product: 392 | person_product[i["person_id"]] = 
{} 393 | 394 | if len(people) < pe_c or i["person_id"] in people: 395 | if len(products) < pr_c or i["product_id"] in products: 396 | 397 | person_product[i["person_id"]][i["product_id"]] = i["score"] 398 | 399 | reviews_per_person[i["person_id"]] += 1 400 | reviews_per_product[i["product_id"]] += 1 401 | 402 | products.add(i["product_id"]) 403 | people.add(i["person_id"]) 404 | 405 | def exists(person, product): 406 | return 1.0 if person in person_product and product in person_product[person] else 0.0 407 | 408 | def score(person, product): 409 | return person_product.get(person, 0.0).get(product, 0.0) 410 | 411 | ppe = list(dict(reviews_per_person).values()) 412 | ppr = list(dict(reviews_per_product).values()) 413 | 414 | #print("Reviews per product: ", np.histogram(ppe) ) 415 | #print("Reviews per person: ", np.histogram(ppr) ) 416 | 417 | #logger.info(f"People returned {len(people)} of capacity {pe_c}") 418 | #logger.info(f"Products returned {len(products)} of capacity {pr_c}") 419 | 420 | people = sorted(list(people))[:pe_c] 421 | products = sorted(list(products))[:pr_c] 422 | 423 | def build(fn): 424 | return DatasetHelpers.ensure_length(np.array([ 425 | DatasetHelpers.ensure_length( 426 | np.array([fn(person, product) for person in people]) 427 | , pe_c) for product in products 428 | ]), pr_c) 429 | 430 | adj_score = build(score) 431 | adj_con = build(exists) 432 | 433 | # print("Connections:",adj_con) 434 | # print("Scores:",adj_score) 435 | 436 | assert_mtx_shape(adj_score, shape, "adj_score") 437 | assert_mtx_shape(adj_con, shape) 438 | 439 | mask_seed = np.random.randint(10, size=shape) 440 | masks = { 441 | "test": np.equal(mask_seed, 0), 442 | "train": np.greater(mask_seed, 1), 443 | "validate": np.equal(mask_seed, 1), 444 | "all": Point(adj_con, adj_score) 445 | } 446 | 447 | def gen_d(mask): 448 | return Point(np.where(mask, adj_con, 0), np.where(mask, adj_score, 0)) 449 | 450 | datas = { 451 | k: gen_d(v) 452 | for (k, v) in masks.items() 453 | } 454 | 455 | warm_up = False 456 | 457 | 458 | if warm_up: 459 | cache.append(datas) 460 | return gen_output(datas) 461 | 462 | else: 463 | for i in range(experiment.params.batch_size * experiment.header.params["batch_per_epoch"]): 464 | for partition, pt in datas.items(): 465 | yield (partition, pt) 466 | 467 | 468 | return Recipe(transform=transform, partition=lambda x:x) 469 | 470 | 471 | -------------------------------------------------------------------------------- /graph_ml/model.py: -------------------------------------------------------------------------------- 1 | 2 | import keras 3 | from keras.models import Sequential, Model 4 | from keras.layers import * 5 | import keras.backend as K 6 | 7 | import tensorflow as tf 8 | 9 | from .ntm import * 10 | from .adjacency_layer import Adjacency 11 | 12 | 13 | # Rainbow sprinkles for your activation function 14 | # Try to use all activation functions 15 | # @argument m: (?,N) tensor 16 | # @returns (?,N*5) tensor 17 | def PolyActivation(m): 18 | # wildcard of the day - let's do inception style activation because I've no idea which is best 19 | # and frequently I get great boosts from switching activation functions 20 | activations = ['tanh', 'sigmoid', 'softmax', 'softplus', 'relu'] 21 | 22 | # TODO: Add dense layer to resize back to original size 23 | # I cannot work out how to do that in Keras yet :/ 24 | return Concatenate()([ 25 | Activation(i)(m) for i in activations 26 | ]) 27 | 28 | 29 | # Choose activation function for me 30 | # More efficient than PolyActivation 31 | # 
@returns Same sized tensor as input 32 | def PolySwitchActivation(m): 33 | # will fail for shared nodes 34 | print(m.shape) 35 | 36 | if len(m.shape) != 3: 37 | # TODO: make this work in a sane way 38 | m = Reshape([i for i in m.shape.dims if i is not None] + [1])(m) # warning: assumes tensorflow 39 | 40 | activations = ['tanh', 'sigmoid', 'softmax', 'softplus', 'relu'] 41 | return add([ 42 | Conv1D(1,1)(Activation(i)(m)) for i in activations 43 | ]) 44 | 45 | class Model(object): 46 | 47 | @classmethod 48 | def generate(cls, experiment, dataset): 49 | params = experiment.params 50 | 51 | # TODO: Move this into Experiment header 52 | n_styles = 6 53 | n_sequence = 100 54 | 55 | bs = experiment.params.batch_size 56 | 57 | if experiment.name == "review_from_visible_style": 58 | model = Sequential([ 59 | Dense(8, 60 | input_shape=dataset.input_shape, 61 | activation='softmax'), 62 | Dense(1, activation='sigmoid'), 63 | ]) 64 | 65 | 66 | elif experiment.name == "review_from_hidden_style_neighbor_conv": 67 | neighbors = Input(shape=(n_sequence,n_styles*2,), dtype='float32', name='neighbors') 68 | person = Input(shape=(n_styles,), dtype='float32', name='person') 69 | 70 | m = cls.style_from_neighbors(neighbors, n_styles, n_sequence) 71 | m = Concatenate()([m, person]) 72 | m = Dense(n_styles*4)(m) 73 | m = PolyActivation(m) 74 | m = Dense(1, activation='sigmoid')(m) 75 | 76 | model = keras.models.Model(inputs=[person, neighbors], outputs=[m]) 77 | 78 | 79 | elif experiment.name == "style_from_neighbor_conv": 80 | neighbors = Input(shape=(n_sequence,n_styles+2,), dtype='float32', name='neighbors') 81 | m = cls.style_from_neighbors(neighbors, n_styles, n_sequence) 82 | 83 | model = keras.models.Model(inputs=[neighbors], outputs=[m]) 84 | 85 | 86 | elif experiment.name == "style_from_neighbor_rnn": 87 | neighbors = Input(shape=(n_sequence,n_styles+2,), dtype='float32', name='neighbors') 88 | m = LSTM(n_styles*4)(neighbors) 89 | m = Dense(n_styles)(m) 90 | m = Activation('sigmoid', name='final_activation')(m) 91 | 92 | model = keras.models.Model(inputs=[neighbors], outputs=[m]) 93 | 94 | 95 | elif experiment.name == "review_from_all_hidden_simple_unroll": 96 | thinking_width = 10 97 | 98 | neighbors = Input(shape=(experiment.header.params["neighbor_count"],4,), dtype='float32', name='neighbors') 99 | m = Conv1D(thinking_width, 1, activation='tanh')(neighbors) 100 | m = MaxPooling1D(experiment.header.params["neighbor_count"])(m) 101 | m = Reshape([thinking_width])(m) 102 | m = Dense(1)(m) 103 | m = Activation("sigmoid", name='final_activation')(m) 104 | 105 | model = keras.models.Model(inputs=[neighbors], outputs=[m]) 106 | 107 | 108 | elif experiment.name == 'review_from_all_hidden_random_walks': 109 | 110 | ss = experiment.header.params["sequence_size"] 111 | ps = experiment.header.params["patch_size"] 112 | pw = experiment.header.params["patch_width"] 113 | 114 | patch = Input(batch_shape=(bs,ss,ps,pw), dtype='float32', name="patch") 115 | # flat_patch = Reshape([ss*ps*pw])(patch) 116 | # score = Dense(experiment.header.params["working_width"]*2, activation="tanh")(flat_patch) 117 | # score = Dense(experiment.header.params["working_width"], activation="tanh")(flat_patch) 118 | 119 | # rnn = PatchNTM(experiment).build() 120 | # score = rnn(patch) 121 | 122 | # Data format 123 | # x = [x_path, x_path, x_path] 124 | # x_path = [x_node, x_node, x_node] 125 | # x_node = [label, score, is_head] 126 | 127 | # x = [ 128 | # [ 129 | # [label, score, is_head]:Node, 130 | # [label, score, is_head]:Node 131 | 
# ]:Path, 132 | # [ 133 | # [label, score, is_head]:Node, 134 | # [label, score, is_head]:Node 135 | # ]:Path 136 | # ]:Sequence 137 | 138 | # Convolve path-pattern 139 | channels = 8 140 | pattern_length = 8 141 | 142 | m = patch 143 | 144 | # Add channels for convolution 145 | m = Lambda(lambda x: K.expand_dims(x, axis=-1))(m) 146 | 147 | # Compute!! 148 | m = Conv3D(channels, (1, pattern_length, pw), activation='relu')(m) 149 | pattern_conv_out_size = ps - pattern_length + 1 150 | 151 | m = Reshape([ss * channels * pattern_conv_out_size])(m) 152 | m = Dense(4, activation="relu", name="score_dense")(m) 153 | score = Dense(1, activation="sigmoid", name="score_out")(m) 154 | 155 | model = keras.models.Model(inputs=[patch], outputs=[score]) 156 | 157 | 158 | elif experiment.name == 'review_from_all_hidden_adj': 159 | 160 | pr_c = experiment.header.params["product_count"] 161 | pe_c = experiment.header.params["person_count"] 162 | style_width = experiment.header.params["style_width"] 163 | 164 | adj_con = Input(batch_shape=(bs, pr_c, pe_c), dtype='float32', name="adj_con") 165 | features = Adjacency(pe_c, pr_c, style_width, name="hidden_to_adj")(adj_con) 166 | 167 | model = keras.models.Model(inputs=[adj_con], outputs=[features]) 168 | 169 | model.compile(loss=keras.losses.mean_squared_error, 170 | optimizer=keras.optimizers.Adam(lr=0.2, decay=0.01), 171 | metrics=['accuracy']) 172 | 173 | return model 174 | 175 | 176 | 177 | # Compile time! 178 | if experiment.header.target == float: 179 | model.compile(loss=keras.losses.mean_squared_error, 180 | optimizer=keras.optimizers.SGD(lr=0.3), 181 | metrics=['accuracy']) 182 | 183 | elif experiment.header.target == list: 184 | model.compile(loss='categorical_crossentropy', 185 | optimizer=keras.optimizers.SGD(lr=0.3), 186 | metrics=['accuracy']) 187 | 188 | 189 | 190 | return model 191 | 192 | @classmethod 193 | def style_from_neighbors(cls, neighbors, n_styles, n_sequence): 194 | m = Conv1D(n_styles, 1, activation='tanh')(neighbors) 195 | m = MaxPooling1D(n_sequence)(m) 196 | m = Reshape([n_styles])(m) 197 | m = Dense(n_styles)(m) 198 | m = Activation('softmax')(m) 199 | 200 | return m 201 | 202 | 203 | -------------------------------------------------------------------------------- /graph_ml/ntm.py: -------------------------------------------------------------------------------- 1 | 2 | import keras 3 | import keras.backend as K 4 | 5 | import tensorflow as tf 6 | 7 | from keras.models import Model 8 | from keras.layers import * 9 | from recurrentshop import RecurrentModel 10 | 11 | from .util import * 12 | 13 | class NTMBase(object): 14 | 15 | def __init__(self, experiment): 16 | self.experiment = experiment 17 | 18 | self.patch_size = experiment.header.params["patch_size"] 19 | self.patch_width = experiment.header.params["patch_width"] 20 | self.working_width = experiment.header.params["working_width"] 21 | self.word_size = self.experiment.header.params["word_size"] 22 | self.batch_size = self.experiment.params.batch_size 23 | self.memory_size = self.experiment.header.params["memory_size"] 24 | self.patch_data_width = self.patch_width - self.memory_size 25 | 26 | self.word_shape = [self.word_size] 27 | self.word_shape_batch = [self.batch_size, self.word_size] 28 | self.memory_shape = [self.memory_size, self.word_size] 29 | self.memory_shape_batch = [self.batch_size] + self.memory_shape 30 | 31 | 32 | def combine_nodes(self, patch, width): 33 | patch_data = Lambda(lambda x: x[:,:,0:self.patch_data_width:])(patch) 34 | 35 | n1 = Conv1D( 36 | 
filters=width, 37 | kernel_size=1, 38 | activation='tanh', 39 | kernel_initializer='random_uniform', 40 | bias_initializer='zeros', 41 | name="ConvPatch1")(patch_data) 42 | 43 | n2 = Conv1D( 44 | filters=width, 45 | kernel_size=1, 46 | activation='tanh', 47 | kernel_initializer='random_uniform', 48 | bias_initializer='zeros', 49 | name="ConvPatch2")(patch_data) 50 | 51 | n = multiply([n1, n2]) 52 | 53 | n = Conv1D( 54 | filters=width, 55 | kernel_size=1, 56 | activation='tanh', 57 | kernel_initializer='random_uniform', 58 | bias_initializer='zeros', 59 | name="ConvPatch3")(n) 60 | 61 | n = MaxPooling1D(self.patch_size)(n) 62 | n = Reshape([width])(n) 63 | return n 64 | 65 | def patch_extract(self, address, patch, slice_begin): 66 | extract_width = self.patch_width - (slice_begin % self.patch_width) 67 | 68 | address_repeated = Lambda(lambda x:K.repeat_elements(K.expand_dims(x, -1), extract_width, -1))(address) 69 | patch_slices = Lambda(lambda x: x[:,:,slice_begin::])(patch) 70 | assert_shape(patch_slices, [self.patch_size, extract_width]) 71 | 72 | rows = multiply([patch_slices, address_repeated]) 73 | row = Lambda(lambda x: K.sum(x,-2))(rows) 74 | assert_shape(row, [extract_width]) 75 | 76 | return row 77 | 78 | def resolve_address(self, address, patch): 79 | assert_shape(address, [self.patch_size]) 80 | assert_shape(patch, [self.patch_size, self.patch_width]) 81 | return self.patch_extract(address, patch, -self.memory_size) 82 | 83 | def read(self, memory, address): 84 | address_repeated = Lambda(lambda x:K.repeat_elements(K.expand_dims(x, -1), self.word_size, -1))(address) 85 | read_rows = multiply([memory, address_repeated]) 86 | read = Lambda(lambda x: K.sum(x,-2))(read_rows) 87 | 88 | assert_shape(read, [self.word_size]) 89 | 90 | return read 91 | 92 | def write(self, memory, address, write): 93 | assert_shape(memory, self.memory_shape) 94 | assert_shape(write, [self.word_size]) 95 | assert_shape(address, [self.memory_size]) 96 | 97 | address_expanded = expand_dims(address, -1) 98 | write = expand_dims(write, 1) 99 | write_e = dot([address_expanded, write], axes=[2,1], name="WriteExpanded") 100 | memory = add([memory, write_e], name="MemoryWrite") 101 | return memory 102 | 103 | def erase(self, memory, address, erase): 104 | assert_shape(memory, self.memory_shape) 105 | assert_shape(erase, [self.word_size]) 106 | assert_shape(address, [self.memory_size]) 107 | 108 | erase = expand_dims(erase, 1) 109 | address_expanded = expand_dims(address, -1) 110 | erase_e = dot([address_expanded, erase], axes=[2,1], name="EraseExpanded") 111 | assert_shape(erase_e, self.memory_shape) 112 | erase_mask = Lambda(lambda x: 1.0 - x)(erase_e) 113 | memory = multiply([memory, erase_mask]) 114 | return memory 115 | 116 | def generate_address(self, input_data, patch, name): 117 | address_ptr = Dense(self.patch_size, activation="softplus",name=name)(input_data) 118 | address = self.resolve_address(address_ptr, patch) 119 | return address 120 | 121 | 122 | class PatchNTM(NTMBase): 123 | 124 | def __init__(self, experiment): 125 | NTMBase.__init__(self, experiment) 126 | 127 | def build(self): 128 | 129 | patch = Input((self.patch_size, self.patch_width), name="InputPatch") 130 | memory_tm1 = Input(batch_shape=self.memory_shape_batch, name="Memory") 131 | memory_t = memory_tm1 132 | 133 | # conv = self.combine_nodes(patch, working_width) 134 | # first_node = Lambda(lambda x: x[:,:self.patch_data_width])(flat_patch) 135 | patch_without_memory_addr = Lambda(lambda x: x[:,:,:self.patch_data_width:])(patch) 136 
| flat_patch = Reshape([self.patch_size*self.patch_data_width])(patch_without_memory_addr) 137 | 138 | working_memory = Dense(self.working_width, activation='relu')(flat_patch) 139 | # conv = self.combine_nodes(patch, self.working_width) 140 | # working_memory = concatenate([working_memory, conv]) 141 | # working_memory = Dense(self.working_width, activation='relu')(working_memory) 142 | 143 | pre_memory = working_memory 144 | 145 | use_memory = False 146 | 147 | if use_memory: 148 | # ------- Memory operations --------- # 149 | 150 | primary_address = Lambda(lambda x: x[:,3,self.patch_data_width:])(patch) 151 | print(primary_address) 152 | 153 | address = self.generate_address(primary_address, patch, name="address_read1") 154 | read1 = self.read(memory_t, address) 155 | 156 | # Turn batch dimension from None to batch_size 157 | batched_working_memory = Lambda(lambda x: K.reshape(x, [self.batch_size, self.working_width]))(working_memory) 158 | batched_working_memory = concatenate([batched_working_memory, read1], batch_size=self.batch_size) 159 | 160 | batched_working_memory = Dense(self.working_width, activation='relu')(batched_working_memory) 161 | 162 | erase_word = Dense(self.word_size, name="DenseEraseWord", activation='relu')(batched_working_memory) 163 | # address = self.generate_address(batched_working_memory, patch, name="address_erase") 164 | erase_word = Lambda(lambda x: K.ones_like(x))(erase_word) 165 | memory_t = self.erase(memory_t, primary_address, erase_word) 166 | 167 | write_word = Dense(self.word_size, name="DenseWriteWord", activation='relu')(batched_working_memory) 168 | # address = self.generate_address(batched_working_memory, patch, name="address_write") 169 | memory_t = self.write(memory_t, primary_address, write_word) 170 | 171 | # address = self.generate_address(batched_working_memory, patch, name="address_read2") 172 | # read2 = self.read(memory_t, address) 173 | 174 | # working_memory = concatenate([batched_working_memory, read1]) 175 | working_memory = Dense(self.working_width, activation="relu")(batched_working_memory) 176 | 177 | 178 | return RecurrentModel( 179 | input=patch, 180 | output=working_memory, 181 | return_sequences=True, 182 | stateful=True, 183 | 184 | initial_states=[memory_tm1], 185 | final_states=[memory_t], 186 | state_initializer=[initializers.random_normal(stddev=1.0)] 187 | ) 188 | 189 | 190 | -------------------------------------------------------------------------------- /graph_ml/path.py: -------------------------------------------------------------------------------- 1 | 2 | import hashlib 3 | import os.path 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | def generate_path(experiment, prefix, suffix, extra=""): 9 | query = experiment.header.cypher_query 10 | m = hashlib.md5() 11 | 12 | m.update(query.encode('utf-8')) 13 | m.update(extra.encode('utf-8')) 14 | # logger.info(f"generate_path {prefix} {suffix} {query} {extra}") 15 | return os.path.join(prefix + '/' + experiment.name + '_' + m.hexdigest() + suffix) 16 | 17 | def generate_output_path(experiment, suffix): 18 | return generate_path(experiment, experiment.params.output_dir, suffix) 19 | 20 | def generate_data_path(experiment, suffix, query_params=None): 21 | return generate_path(experiment, experiment.params.data_dir, suffix, str(query_params)) 22 | -------------------------------------------------------------------------------- /graph_ml/train.py: -------------------------------------------------------------------------------- 1 | 2 | import os.path 3 | 
from datetime import datetime 4 | import logging 5 | from sklearn.metrics import classification_report 6 | import itertools 7 | 8 | import keras 9 | import numpy as np 10 | import keras.callbacks 11 | 12 | from .model import Model 13 | from .dataset import Dataset 14 | from .path import generate_output_path 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | class StopEarlyIfAbove(keras.callbacks.Callback): 19 | def __init__(self, monitor='val_acc', value=0.99, verbose=0, patience=3): 20 | super(keras.callbacks.Callback, self).__init__() 21 | self.monitor = monitor 22 | self.value = value 23 | self.verbose = verbose 24 | self.stopped_epoch = 0 25 | self.patience = patience 26 | 27 | def on_epoch_end(self, epoch, logs={}): 28 | current = logs.get(self.monitor) 29 | if current is None: 30 | logger.error("Early stopping requires %s available!" % self.monitor) 31 | exit() 32 | 33 | if current > self.value: 34 | self.patience -= 1 35 | if self.patience <= 0: 36 | self.stopped_epoch = epoch 37 | self.model.stop_training = True 38 | 39 | def on_train_end(self, logs=None): 40 | if self.stopped_epoch > 0 and self.verbose > 0: 41 | logger.info("Epoch {}: early stopping {} > {}".format(self.stopped_epoch+1, self.monitor, self.value)) 42 | 43 | 44 | class SpecialValidator(keras.callbacks.Callback): 45 | def __init__(self, experiment, dataset, model, verbose): 46 | self.experiment = experiment 47 | self.model = model 48 | self.dataset = dataset 49 | self.verbose = verbose 50 | super(keras.callbacks.Callback, self).__init__() 51 | 52 | 53 | def on_train_end(self, logs): 54 | self.test(self.verbose) 55 | 56 | def on_epoch_end(self, epoch, logs): 57 | self.test() 58 | 59 | def test(self, verbose=False): 60 | print() # Clear from epoch status bar 61 | for (label, genie) in self.dataset.generator.items(): 62 | # print(f"Prediction for {label}") 63 | 64 | row = genie.peek() 65 | y_true = row[1][0] 66 | x_test = row[0][0] 67 | 68 | y_pred = self.model.predict_generator( 69 | generator=genie, 70 | steps=1, 71 | workers=0, 72 | use_multiprocessing=False, 73 | ) 74 | y_pred = np.array(y_pred[0]) 75 | 76 | y_correct = np.isclose(y_pred, y_true, atol=0.1) 77 | y_zero = np.isclose(y_pred, 0, atol=0.1) 78 | 79 | # The bits that should be one 80 | y_true_set_and_in_mask = np.where(np.greater(y_true, 0.1), np.greater(x_test, 0.1), False) 81 | 82 | # The bits that should be one and were one 83 | y_masked = np.where(y_true_set_and_in_mask, y_correct, False) 84 | 85 | # The correct predictions for the input adj 86 | y_masked_david = np.where(np.greater(x_test, 0.1), y_correct, False) 87 | 88 | if verbose: 89 | print("y_pred: ", np.around(y_pred, 1)) 90 | print("y_correct: ", y_correct) 91 | # print(f"y_masked {np.count_nonzero(y_masked)} / {np.count_nonzero(y_correct)} / {np.count_nonzero(x_test)}") 92 | 93 | net_accuracy = round(np.count_nonzero(y_masked) / (np.count_nonzero(y_true_set_and_in_mask)+0.001) * 100, 3) 94 | net_accuracy_david = round(np.count_nonzero(y_masked_david) / (np.count_nonzero(x_test)+0.001) * 100, 3) 95 | gross_accuracy = round(np.count_nonzero(y_correct) / np.size(y_correct) * 100, 3) 96 | 97 | print(f"{label} 1-accuracy: {net_accuracy}% accuracy: {net_accuracy_david}%") 98 | # print() 99 | 100 | if label == "validate" and net_accuracy == 100: 101 | self.model.stop_training = True 102 | 103 | 104 | 105 | 106 | 107 | class Train(object): 108 | 109 | @staticmethod 110 | def run(experiment, dataset): 111 | 112 | params = experiment.params 113 | 114 | if params.random_seed is not None: 115 | 
np.random.seed(params.random_seed) 116 | 117 | logger.info("Generate model") 118 | 119 | model = Model.generate(experiment, dataset) 120 | params_file = generate_output_path(experiment, ".hdf5") 121 | 122 | if os.path.isfile(params_file) and params.load_weights: 123 | model.load_weights(params_file) 124 | 125 | callbacks = [ 126 | #StopEarlyIfAbove(verbose=params.verbose), 127 | SpecialValidator(experiment, dataset, model, params.print_weights), 128 | # keras.callbacks.ModelCheckpoint(params_file, verbose=params.verbose, save_best_only=True, monitor='val_loss', mode='auto', period=3), 129 | # keras.callbacks.TensorBoard(log_dir=generate_output_path(experiment, f"_log/{experiment.run_tag}/")), 130 | #keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.000000001, patience=8, verbose=0, mode='auto') 131 | ] 132 | 133 | # TODO: move to more general overriding mechanism 134 | # Perhaps unify os.environ, arguments, experiment parameters 135 | if params.epochs is not None: 136 | epochs = params.epochs 137 | else: 138 | epochs = experiment.header.params.get('epochs', 20) 139 | 140 | logger.info("Fit model") 141 | 142 | # Once I've worked out Python multithreading conflicts we can introduce workers > 0 143 | model.fit_generator( 144 | generator=dataset.train_generator, 145 | steps_per_epoch=dataset.steps_per_epoch, 146 | validation_data=dataset.validation_generator, 147 | validation_steps=dataset.validation_steps, 148 | 149 | epochs=epochs, 150 | verbose=params.verbose, 151 | 152 | workers=0, 153 | use_multiprocessing=False, 154 | shuffle=True, 155 | callbacks=callbacks 156 | ) 157 | 158 | logger.info("Evaluate model") 159 | 160 | score = model.evaluate_generator( 161 | generator=dataset.test_generator, 162 | steps=dataset.test_steps, 163 | workers=0, 164 | use_multiprocessing=False, 165 | ) 166 | 167 | 168 | if params.print_weights: 169 | for layer in model.layers: 170 | for var, weight in zip(layer.weights, layer.get_weights()): 171 | print(f"{var.name} {np.around(weight, decimals=1)}") 172 | 173 | 174 | return score 175 | 176 | 177 | -------------------------------------------------------------------------------- /graph_ml/util.py: -------------------------------------------------------------------------------- 1 | 2 | from keras.layers import Lambda 3 | import keras.backend as K 4 | 5 | # Take that keras 6 | from tensorflow import float32 7 | 8 | def assert_shape(tensor, shape, strict=False): 9 | if strict: 10 | assert hasattr(tensor, '_keras_shape'), f"{tensor.name} is missing _keras_shape" 11 | assert tensor.shape[1:] == shape, f"{tensor.name} is wrong shape, expected {shape} found {tensor.shape[1:]}" 12 | 13 | def assert_mtx_shape(mtx, shape, name="matrix"): 14 | assert mtx.shape == shape, f"{name} is wrong shape, expected {shape} found {mtx.shape}" 15 | 16 | def expand_dims(v, axis): 17 | return Lambda(lambda x: K.expand_dims(x,axis))(v) -------------------------------------------------------------------------------- /output/.gitignore: -------------------------------------------------------------------------------- 1 | * -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | python -m unittest discover test -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .test_memory_cell import test_memory_cell 
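For reference before the tests below: the NTMBase addressing ops in graph_ml/ntm.py reduce to a weighted sum and two outer products - read() dots the (one-hot or soft) address against the memory rows, write() adds the outer product of address and word, and erase() multiplies memory by one minus the outer product of address and erase word. A minimal NumPy sketch of the same arithmetic (illustrative only, not part of the repo; the Keras versions operate on batched tensors):

import numpy as np

def read(memory, address):
    # memory: (memory_size, word_size), address: (memory_size,)
    return address @ memory                                 # weighted sum of rows

def write(memory, address, word):
    return memory + np.outer(address, word)                 # add word into addressed slot(s)

def erase(memory, address, erase_word):
    return memory * (1.0 - np.outer(address, erase_word))   # blank out addressed slot(s)

memory = np.random.random((10, 4))
addr = np.eye(10)[3]                                        # one-hot address for slot 3
word = np.random.random(4)

# erase-then-write-then-read loops the word back, as test_memory_loopback checks
memory = write(erase(memory, addr, np.ones(4)), addr, word)
assert np.allclose(read(memory, addr), word)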
-------------------------------------------------------------------------------- /test/test_memory_cell.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import keras.backend as K 4 | from keras.utils.test_utils import keras_test 5 | from keras.models import Model 6 | from keras.layers import * 7 | 8 | from recurrentshop import RecurrentModel 9 | 10 | import numpy as np 11 | from numpy.testing import * 12 | 13 | import random 14 | from collections import namedtuple 15 | from tensorflow import float32 16 | from unittest import TestCase 17 | 18 | from graph_ml import Train, Dataset 19 | from graph_ml import NTMBase 20 | from experiment import Experiment, ExperimentHeader 21 | 22 | Args = namedtuple('DummyArgs', 'batch_size') 23 | 24 | 25 | class Tests(TestCase): 26 | 27 | @keras_test 28 | def test_memory_ops(self): 29 | 30 | memory_size = 10 31 | word_size = 4 32 | batch_size = 1 33 | 34 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":4, "patch_width":4}) 35 | experiment = Experiment("test_memory_cell", header, Args(batch_size)) 36 | 37 | # Initialise memory with zeros 38 | memory_initial = np.random.random((batch_size, memory_size, word_size)) 39 | memory_tm1 = K.constant(memory_initial, name="memory",dtype=float32) 40 | memory_t = memory_tm1 41 | 42 | # Write address is random int 43 | address_w = random.randint(0,memory_size - 1) 44 | address_one_hot_w = np.zeros([batch_size, memory_size]) 45 | address_one_hot_w[0][address_w] = 1.0 46 | t_address_w = K.constant(address_one_hot_w, name="address",dtype=float32) 47 | 48 | # Write random pattern 49 | write = np.random.random([batch_size, word_size]) 50 | t_write = K.constant(write, name="write") 51 | 52 | pb = NTMBase(experiment) 53 | memory_t = pb.write(memory_t, t_address_w, t_write) 54 | read = pb.read(memory_t, t_address_w) 55 | 56 | address_e = (address_w+1) % memory_size 57 | address_one_hot_e = np.zeros([batch_size, memory_size]) 58 | address_one_hot_e[0][address_e] = 1.0 59 | t_address_e = K.constant(address_one_hot_e, name="address",dtype=float32) 60 | 61 | t_erase = K.constant(np.ones([batch_size, word_size]),name="erase") 62 | memory_t = pb.erase(memory_t, t_address_e, t_erase) 63 | 64 | read_final = K.eval(read) 65 | memory_after_erase = K.eval(memory_t) 66 | 67 | write_expected = [write[0] + memory_initial[0][address_w]] 68 | 69 | for i in range(batch_size): 70 | for j in range(memory_size): 71 | if j == address_w: 72 | assert_allclose(memory_after_erase[i][j], write_expected[0]) 73 | elif j == address_e: 74 | assert_allclose(memory_after_erase[i][j], 0) 75 | else: 76 | assert_allclose(memory_after_erase[i][j], memory_initial[i][j]) 77 | 78 | assert_allclose(read_final, write_expected) 79 | 80 | 81 | @keras_test 82 | def test_memory_loopback(self): 83 | 84 | memory_size = 10 85 | word_size = 4 86 | batch_size = 1 87 | 88 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":4, "patch_width":4}) 89 | experiment = Experiment("test_memory_cell", header, Args(batch_size)) 90 | 91 | # Initialise memory with zeros 92 | memory_initial = np.random.random((batch_size, memory_size, word_size)) 93 | memory_tm1 = K.constant(memory_initial, name="memory",dtype=float32) 94 | memory_t = memory_tm1 95 | 96 | # Write address is random int 97 | address = random.randint(0,memory_size - 1) 98 | address_one_hot = np.zeros([batch_size, memory_size]) 99 | address_one_hot[0][address] = 1.0 100 | t_address = 
K.constant(address_one_hot, name="address",dtype=float32) 101 | 102 | # Write random pattern 103 | write = np.random.random([batch_size, word_size]) 104 | t_write = K.constant(write, name="write") 105 | t_erase = K.constant(np.ones([batch_size, word_size]),name="erase") 106 | 107 | pb = NTMBase(experiment) 108 | memory_t = pb.erase(memory_t, t_address, t_erase) 109 | memory_t = pb.write(memory_t, t_address, t_write) 110 | t_read = pb.read( memory_t, t_address) 111 | 112 | read_final = K.eval(t_read) 113 | 114 | assert_allclose(read_final, write) 115 | 116 | 117 | @keras_test 118 | def test_address_resolution(self): 119 | 120 | # Data setup 121 | memory_size = 20 122 | word_size = 4 123 | batch_size = 1 124 | patch_size = 10 125 | patch_width = memory_size + 5 126 | 127 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":patch_size, "patch_width":patch_width}) 128 | experiment = Experiment("test_memory_cell", header, Args(batch_size)) 129 | 130 | pointer = random.randint(0,patch_size - 1) 131 | pointer_one_hot = np.zeros([batch_size, patch_size]) 132 | pointer_one_hot[0][pointer] = 1.0 133 | 134 | patch = np.random.random([batch_size, patch_size, patch_width]) 135 | 136 | t_patch = K.constant(patch, dtype=float32, name="patch") 137 | t_pointer_one_hot = K.constant(pointer_one_hot, dtype=float32, name="pointer_one_hot") 138 | pb = NTMBase(experiment) 139 | resolved = K.eval(pb.resolve_address(t_pointer_one_hot, t_patch)) 140 | 141 | for i in range(batch_size): 142 | assert_almost_equal(resolved[i], patch[i][pointer][-memory_size::]) 143 | 144 | 145 | 146 | @keras_test 147 | def test_address_resolution_gradient(self): 148 | 149 | # Data setup 150 | memory_size = 20 151 | word_size = 4 152 | batch_size = 1 153 | patch_size = 10 154 | patch_width = memory_size + 5 155 | 156 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":patch_size, "patch_width":patch_width}) 157 | experiment = Experiment("test_memory_cell", header, Args(batch_size)) 158 | 159 | pb = NTMBase(experiment) 160 | 161 | ptr = Input((patch_size,), name="ptr") 162 | patch = Input((patch_size,patch_width), name="patch") 163 | memory = Input((memory_size, word_size), name="memory") 164 | 165 | resolved = pb.resolve_address(ptr, patch) 166 | read = pb.read(memory, resolved) 167 | 168 | out = Dense(3)(read) 169 | 170 | model = Model([ptr, patch, memory], out) 171 | model.compile(loss='mse', optimizer='sgd') 172 | 173 | model.fit({ 174 | "ptr": np.random.random((batch_size, patch_size)), 175 | "patch": np.random.random((batch_size, patch_size, patch_width)), 176 | "memory": np.random.random((batch_size, memory_size, word_size)), 177 | }, np.random.random((batch_size, 3))) 178 | 179 | 180 | model.predict({ 181 | "ptr": np.zeros((batch_size, patch_size)), 182 | "patch": np.zeros((batch_size, patch_size, patch_width)), 183 | "memory": np.zeros((batch_size, memory_size, word_size)), 184 | }) 185 | 186 | 187 | @keras_test 188 | def test_memory_gradient(self): 189 | 190 | # Data setup 191 | memory_size = 20 192 | word_size = 4 193 | batch_size = 1 194 | patch_size = 10 195 | patch_width = memory_size + 5 196 | 197 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":patch_size, "patch_width":patch_width}) 198 | experiment = Experiment("test_memory_cell", header, Args(batch_size)) 199 | 200 | pb = NTMBase(experiment) 201 | 202 | patch = Input((patch_size, patch_width), name="patch") 203 | memory_tm1 = 
Input((memory_size, word_size), name="memory") 204 | memory_t = memory_tm1 205 | 206 | flat_patch = Reshape((patch_size*patch_width,))(patch) 207 | 208 | write_word = Dense(word_size)(flat_patch) 209 | erase_word = Dense(word_size)(flat_patch) 210 | 211 | ptr = Dense(patch_size)(flat_patch) 212 | address = pb.resolve_address(ptr, patch) 213 | memory_t = pb.erase(memory_t, address, erase_word) 214 | 215 | ptr = Dense(patch_size)(flat_patch) 216 | address = pb.resolve_address(ptr, patch) 217 | memory_t = pb.write(memory_t, address, write_word) 218 | 219 | ptr = Dense(patch_size)(flat_patch) 220 | address = pb.resolve_address(ptr, patch) 221 | read = pb.read(memory_t, address) 222 | 223 | out = Dense(3)(read) 224 | 225 | model = Model([patch, memory_tm1], out) 226 | model.compile(loss='mse', optimizer='sgd') 227 | 228 | model.fit({ 229 | "patch": np.random.random((batch_size, patch_size, patch_width)), 230 | "memory": np.random.random((batch_size, memory_size, word_size)), 231 | }, np.random.random((batch_size, 3))) 232 | 233 | 234 | model.predict({ 235 | "patch": np.zeros((batch_size, patch_size, patch_width)), 236 | "memory": np.zeros((batch_size, memory_size, word_size)), 237 | }) 238 | 239 | 240 | 241 | 242 | @keras_test 243 | def test_memory_rnn_gradient(self): 244 | 245 | # Data setup 246 | memory_size = 20 247 | word_size = 4 248 | batch_size = 1 249 | patch_size = 10 250 | patch_width = memory_size + 5 251 | sequence_length = 10 252 | 253 | header = ExperimentHeader(params={"word_size":word_size, "memory_size":memory_size, "patch_size":patch_size, "patch_width":patch_width}) 254 | experiment = Experiment("test_memory_cell", header, Args(batch_size)) 255 | 256 | pb = NTMBase(experiment) 257 | 258 | patch = Input((patch_size, patch_width), name="patch") 259 | memory_tm1 = Input((memory_size, word_size), name="memory") 260 | memory_t = memory_tm1 261 | 262 | flat_patch = Reshape((patch_size*patch_width,))(patch) 263 | 264 | write_word = Dense(word_size)(flat_patch) 265 | erase_word = Dense(word_size)(flat_patch) 266 | 267 | ptr = Dense(patch_size)(flat_patch) 268 | address = pb.resolve_address(ptr, patch) 269 | memory_t = pb.erase(memory_t, address, erase_word) 270 | 271 | ptr = Dense(patch_size)(flat_patch) 272 | address = pb.resolve_address(ptr, patch) 273 | memory_t = pb.write(memory_t, address, write_word) 274 | 275 | ptr = Dense(patch_size)(flat_patch) 276 | address = pb.resolve_address(ptr, patch) 277 | read = pb.read(memory_t, address) 278 | 279 | out = Dense(3)(read) 280 | 281 | rnn = RecurrentModel(input=patch, output=out, initial_states=[memory_tm1], final_states=[memory_t]) 282 | a = Input((sequence_length, patch_size, patch_width), name="patch_seq") 283 | b = rnn(a) 284 | model = Model(a, b) 285 | model.compile(loss='mse', optimizer='sgd') 286 | 287 | model.fit({ 288 | "patch_seq": np.random.random((batch_size, sequence_length, patch_size, patch_width)), 289 | # "memory": np.random.random((batch_size, memory_size, word_size)), 290 | }, np.random.random((batch_size, 3))) 291 | 292 | 293 | model.predict({ 294 | "patch_seq": np.zeros((batch_size, sequence_length, patch_size, patch_width)), 295 | # "memory": np.zeros((batch_size, memory_size, word_size)), 296 | }) 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from experiment import 
Experiment 4 | 5 | if __name__ == '__main__': 6 | Experiment.run() 7 | 8 | --------------------------------------------------------------------------------
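As a reading aid for the tests above, here is a minimal NumPy sketch of the memory-op semantics that test/test_memory_cell.py asserts: additive write, multiplicative erase, weighted read, and pointer-based address resolution. It is reconstructed from the assertions alone, not taken from the repo's graph_ml/ntm.py (which operates on Keras tensors); the names mirror NTMBase, but the bodies are assumptions.

import numpy as np

# Assumed shapes, following the tests:
#   memory: (batch, memory_size, word_size)
#   address / pointer: (batch, memory_size) / (batch, patch_size), one-hot or soft weights
#   word / erase_word: (batch, word_size)
#   patch: (batch, patch_size, patch_width)

def write(memory, address, word):
    # Additive write: each row gains its address weight times the word, so a
    # one-hot address adds the word to exactly one row (matches write_expected
    # in test_memory_ops).
    return memory + address[:, :, None] * word[:, None, :]

def erase(memory, address, erase_word):
    # Multiplicative erase: the addressed row is scaled by (1 - erase_word);
    # an all-ones erase word zeroes that row, as test_memory_ops expects.
    return memory * (1.0 - address[:, :, None] * erase_word[:, None, :])

def read(memory, address):
    # Weighted sum over rows; with a one-hot address this returns one row.
    return np.einsum('bm,bmw->bw', address, memory)

def resolve_address(pointer, patch, memory_size):
    # Select one patch row with the pointer, then keep its trailing
    # memory_size entries, matching test_address_resolution.
    row = np.einsum('bp,bpw->bw', pointer, patch)
    return row[:, -memory_size:]

Under these definitions the loopback test holds: erasing a row with an all-ones erase word zeroes it, a subsequent write stores the word there, and reading the same address returns that word.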